In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, GRU, Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.image import ImageDataGenerator


# Read the MalBehavD-V1 CSV directly from its raw GitHub URL into a DataFrame.
# `url` is kept as a separate name so the source location is easy to swap.
url = "https://raw.githubusercontent.com/mpasco/MalbehavD-V1/main/MalBehavD-V1-dataset.csv"
data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Assuming 'labels' is the column containing binary labels
y = data['labels']

# Build one space-separated string per row from every column except
# 'sha256' and 'labels'.
# Missing cells are dropped before joining: a plain astype(str) turns each NaN
# into the literal text "nan", which the tokenizer would then learn as a real
# vocabulary entry and emit where zero padding is expected.
# NOTE(review): assumes shorter sequences are stored with trailing empty
# cells -- confirm against the CSV.
X_sequences = (
    data.drop(['sha256', 'labels'], axis=1)
    .apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
)

# Tokenize the API call sequences (vocabulary is learned from all rows)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_sequences)

# Convert each text to a list of integer ids, then zero-pad at the end
# ('post') so every row has the length of the longest sequence.
X_padded = pad_sequences(tokenizer.texts_to_sequences(X_sequences), padding='post')


In [None]:
# Map the raw label values onto integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 20% of the samples as a test set; the fixed random_state makes
# the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, GRU, Dense, Dropout
import tensorflow as tf
from keras.models import load_model

# Build, train and evaluate the CNN-BiGRU hybrid model

# Seed TensorFlow's global RNG so weight initialisation, dropout and batch
# shuffling are more repeatable between runs (same seed as the data split).
tf.random.set_seed(42)

# Hyperparameters
embedding_dim = 100
cnn_filters = 256
kernel_size = 5
bigru_units = 256
dense_units = 512
model = Sequential()

# Word Embedding Layer: one vector per token id (+1 for the padding id 0)
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=X_padded.shape[1]))

# Convolutional Layers: each Conv1D is followed by a pooling step that halves
# the sequence length; the second conv uses half as many filters.
model.add(Conv1D(filters=cnn_filters, kernel_size=kernel_size, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=cnn_filters // 2, kernel_size=kernel_size, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))

# Bi-directional GRU Layers: the first returns the full sequence so the
# second can consume it; the second returns only its final state.
model.add(Bidirectional(GRU(bigru_units, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(Bidirectional(GRU(bigru_units // 2, dropout=0.5, recurrent_dropout=0.3)))

# Additional Dense Layers
model.add(Dense(dense_units, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(dense_units // 2, activation='relu'))
model.add(Dense(dense_units // 4, activation='relu'))
model.add(Dropout(0.3))

# Output Layer
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model (learning_rate=0.001 is Adam's default, stated explicitly)
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping callback.
# NOTE(review): with patience=5 and epochs=6 this can only fire on the last
# epoch; its practical effect is restore_best_weights on the in-memory model.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Model checkpoint callback to save the best model (by validation accuracy) during training
model_checkpoint = ModelCheckpoint(filepath="best_model.h5", monitor='val_accuracy', save_best_only=True)

# Train the model.
# Validation data is carved out of the training set (last 20% of X_train,
# which train_test_split has already shuffled) rather than reusing the test
# set: selecting the checkpoint / stopping point on X_test and then reporting
# accuracy on X_test would leak test information into model selection and
# inflate the reported score.
history = model.fit(
    X_train,
    y_train,
    epochs=6,
    batch_size=256,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
)

# Reload the checkpoint with the best validation accuracy for prediction
loaded_model = load_model("best_model.h5")

# Evaluate the loaded model on the held-out test set, which was not seen
# during training or model selection
loss, accuracy = loaded_model.evaluate(X_test, y_test)
print(f"Test Accuracy (Loaded Model): {accuracy * 100:.2f}%")
