In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, GRU, Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.image import ImageDataGenerator

# Read the MalBehavD-V1 CSV directly from its raw GitHub URL into a DataFrame.
# Requires network access; every later cell works from `data`.
url = "https://raw.githubusercontent.com/mpasco/MalbehavD-V1/main/MalBehavD-V1-dataset.csv"
data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Assuming 'labels' is the column containing binary labels
y = data['labels']

# Build one space-separated string of API calls per sample from every column
# except the file hash and the label.  Empty cells (NaN) are dropped before
# joining: casting them with astype(str) would insert the literal word 'nan'
# into the text, and the tokenizer would then learn it as a real vocabulary
# entry and pad every short trace with it.
X_sequences = (
    data.drop(['sha256', 'labels'], axis=1)
    .apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
)

# Fit the vocabulary on the joined API-call strings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_sequences)

# Convert each string to a list of token ids, then pad at the end
# (padding='post') so all samples share the same length.
X_padded = pad_sequences(tokenizer.texts_to_sequences(X_sequences), padding='post')


In [None]:
# Turn the raw label column into integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Data augmentation settings, collected in one mapping and then handed to the
# generator as keyword arguments.
# NOTE(review): ImageDataGenerator is an image augmenter, while the model input
# in this notebook is padded token sequences, and `datagen` is not referenced
# by any other cell visible here -- confirm whether this cell is still needed.
augmentation_options = {
    "rotation_range": 10,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "shear_range": 0.2,
    "zoom_range": 0.2,
    "horizontal_flip": True,
    "vertical_flip": True,
}
datagen = ImageDataGenerator(**augmentation_options)

In [None]:
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, GRU, Dense, Dropout
from keras.models import Sequential
import tensorflow as tf

# Build the CNN-BiGRU hybrid model
# Hyperparameters for this first configuration.
embedding_dim = 100
filters = 256
kernel_size = 3
gru_units = 256

model = Sequential()

# Word Embedding Layer: one row per tokenizer id, plus one for the padding id 0
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=X_padded.shape[1]))

# Convolutional Layer followed by pooling that halves the sequence length
model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Bi-directional GRU Layer
model.add(Bidirectional(GRU(gru_units, dropout=0.5, recurrent_dropout=0.5)))

# Additional Dense Layers
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))

# Output Layer
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model with a smaller learning rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping callback: stop once val_loss has not improved for 5 epochs
# and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model.  The early-stopping callback is passed to fit() here;
# previously it was created but never handed over, so it had no effect.
# NOTE(review): the test split doubles as the validation set, so the accuracy
# reported below is also the quantity used to pick the stopping point --
# consider a separate validation split.
history = model.fit(
    X_train,
    y_train,
    epochs=11,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

# Save the trained model with a specific file path and name
# NOTE(review): absolute, machine-specific path -- confirm it exists wherever
# this notebook is run.
model_save_path = "C:/Metamal-d-alert/MLmodel/model/malware_detection_model.h5"
model.save(model_save_path)
print(f"Model saved to {model_save_path}")

# Reload the saved file to check that the persisted model is usable for prediction
loaded_model = load_model(model_save_path)

# Evaluate the loaded model
loss, accuracy = loaded_model.evaluate(X_test, y_test)
print(f"Test Accuracy (Loaded Model): {accuracy * 100:.2f}%")


In [1]:
from keras.models import load_model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, GRU, Dense, Dropout
import tensorflow as tf

# CNN-BiGRU hybrid, second configuration: fewer filters and GRU units, wider
# convolution kernel.
# This cell relies on `tokenizer`, `X_padded` and the train/test arrays created
# by the earlier preprocessing cells, so those must have been run first.
embedding_dim = 100
filters = 128  # Reducing the number of filters
kernel_size = 5  # Increasing kernel size for capturing broader context
gru_units = 128

# The whole stack is declared in one list, in the order the layers are applied.
model = Sequential([
    # Token ids -> dense vectors (vocabulary size + 1 for the padding id)
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=X_padded.shape[1]),
    # Local pattern extraction, then halve the sequence length
    Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Sequence summary read in both directions
    Bidirectional(GRU(gru_units, dropout=0.5, recurrent_dropout=0.5)),
    # Fully connected head
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    # Single sigmoid unit for the binary decision
    Dense(1, activation='sigmoid'),
])

# Adam with a small learning rate, binary cross-entropy loss
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop when val_loss stalls for 5 epochs, restoring the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Keep on disk only the epoch with the best val_accuracy
model_checkpoint = ModelCheckpoint(filepath="best_model.h5", monitor='val_accuracy', save_best_only=True)

# Fit with both callbacks active; the test split is used for validation
history = model.fit(
    X_train,
    y_train,
    epochs=8,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping, model_checkpoint],
)

# Reload the checkpointed model from disk
loaded_model = load_model("best_model.h5")

# Score the reloaded model on the held-out split and report accuracy
loss, accuracy = loaded_model.evaluate(X_test, y_test)
print(f"Test Accuracy (Loaded Model): {accuracy * 100:.2f}%")






NameError: name 'tokenizer' is not defined