In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed

from keras.models import Model
from keras.optimizers import Adam
from keras.models import load_model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Input, LSTM, BatchNormalization, Dropout, Dense, Add, Flatten

from tensorflow.keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

2024-07-30 21:11:41.367538: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-30 21:11:41.418960: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 21:11:41.418998: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 21:11:41.419031: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-30 21:11:41.428254: I tensorflow/core/platform/cpu_feature_g

# Load csv from Desktop

In [2]:
NUM_CLASSES = 2
CLASSES = np.array(['Legitimate', 'Suspicious'])
DATASET_DIR = "./"
VECTOR_LENGTH = 1 * 1360

def csvToVector(file_path):
    data = pd.read_csv(file_path, header=None)
    vector = data.values.flatten()
    return vector

def process_file(class_idx, file_path):
    vector = csvToVector(file_path)
    return (vector, class_idx)

def load_data(dataset_dir):
    X = []
    y = []
    subdirs = ['benign_cms2', 'malware_cms2']
    futures = []

    with ThreadPoolExecutor() as executor:
        for class_idx, class_name in enumerate(subdirs):
            class_dir = os.path.join(dataset_dir, class_name)
            for file_name in os.listdir(class_dir):
                if file_name.endswith('.csv'):
                    file_path = os.path.join(class_dir, file_name)
                    futures.append(executor.submit(process_file, class_idx, file_path))

        for future in as_completed(futures):
            vector, class_idx = future.result()
            X.append(vector)
            y.append(class_idx)

    X = np.array(X)
    y = np.array(y)
    return X, y

In [3]:
X, y = load_data(DATASET_DIR)

In [4]:
print(X.shape)
print(y.shape)
print(X)
print(y)

(4020, 1360)
(4020,)
[[  284     0     0 ...     1   372   228]
 [  121     0     0 ...     1   210   256]
 [  172     0     0 ...     1   369   225]
 ...
 [  153     0     0 ...     0  4061   118]
 [  127     0     0 ...     3 56578   240]
 [  148     0     0 ...     0 57580   217]]
[0 0 0 ... 1 1 1]


# Train, Validation, Test Split and Nomalize

In [5]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=25)

#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_train = X_train / 299.0
#X_val = X_val / 299.0
X_test = X_test / 299.0

y_train = to_categorical(y_train, 2)
#y_val = to_categorical(y_val, 2)
y_test = to_categorical(y_test, 2)

In [6]:
print(X_train.shape)
print(X_test.shape)
#print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
#print(y_val.shape)

(2814, 1360)
(1206, 1360)
(2814, 2)
(1206, 2)


# 1D CNN Architecture

In [7]:
input_layer = Input(shape=(VECTOR_LENGTH, 1))

x = LSTM(32, return_sequences=True)(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = LSTM(64, return_sequences=True)(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = LSTM(128, return_sequences=False)(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

x = Flatten()(x)

x = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

output_layer = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)

opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

2024-07-30 21:12:27.753695: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31350 MB memory:  -> device: 0, name: CUDA GPU, pci bus id: 0000:06:00.0, compute capability: 7.0
2024-07-30 21:12:27.754254: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 31350 MB memory:  -> device: 1, name: CUDA GPU, pci bus id: 0000:2f:00.0, compute capability: 7.0
2024-07-30 21:12:27.754767: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 31350 MB memory:  -> device: 2, name: CUDA GPU, pci bus id: 0000:86:00.0, compute capability: 7.0


In [8]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1360, 1)]         0         
                                                                 
 lstm (LSTM)                 (None, 1360, 32)          4352      
                                                                 
 batch_normalization (Batch  (None, 1360, 32)          128       
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 1360, 32)          0         
                                                                 
 lstm_1 (LSTM)               (None, 1360, 64)          24832     
                                                                 
 batch_normalization_1 (Bat  (None, 1360, 64)          256       
 chNormalization)                                            

# CheckPoint

In [22]:
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, min_lr=0.00001)
checkpoint = ModelCheckpoint(
    filepath='/tmp/CMS2_LSTM_CheckPoint.h5',
    save_best_only=True,
    monitor='accuracy',
    mode='max',
    verbose=1
)

# Model Training

In [23]:
model.fit(X_train, y_train, epochs=100, batch_size=32, callbacks=[checkpoint])

Epoch 1/100
Epoch 1: accuracy improved from -inf to 0.88415, saving model to /tmp/CMS2_LSTM_CheckPoint.h5
Epoch 2/100
 1/88 [..............................] - ETA: 10s - loss: 0.3062 - accuracy: 0.9062

  saving_api.save_model(


Epoch 2: accuracy improved from 0.88415 to 0.88699, saving model to /tmp/CMS2_LSTM_CheckPoint.h5
Epoch 3/100
Epoch 3: accuracy did not improve from 0.88699
Epoch 4/100
Epoch 4: accuracy did not improve from 0.88699
Epoch 5/100
Epoch 5: accuracy did not improve from 0.88699
Epoch 6/100
Epoch 6: accuracy did not improve from 0.88699
Epoch 7/100
Epoch 7: accuracy did not improve from 0.88699
Epoch 8/100
Epoch 8: accuracy did not improve from 0.88699
Epoch 9/100
Epoch 9: accuracy did not improve from 0.88699
Epoch 10/100
Epoch 10: accuracy did not improve from 0.88699
Epoch 11/100
Epoch 11: accuracy did not improve from 0.88699
Epoch 12/100
Epoch 12: accuracy did not improve from 0.88699
Epoch 13/100
Epoch 13: accuracy did not improve from 0.88699
Epoch 14/100
Epoch 14: accuracy did not improve from 0.88699
Epoch 15/100
Epoch 15: accuracy improved from 0.88699 to 0.88735, saving model to /tmp/CMS2_LSTM_CheckPoint.h5
Epoch 16/100
Epoch 16: accuracy did not improve from 0.88735
Epoch 17/100


<keras.src.callbacks.History at 0x7fa1d05ed210>

# Load Best CheckPoint

In [18]:
cp_model = load_model('/tmp/CMS2_LSTM_CheckPoint.h5')
cp_model.evaluate(X_test, y_test, batch_size=1000)



[0.3114548921585083, 0.8897180557250977]

# Evaluate

In [19]:
y_pred = cp_model.predict(X_test)



In [20]:
preds_single = CLASSES[np.argmax(y_pred, axis = -1)]
actual_single = CLASSES[np.argmax(y_test, axis = -1)]

# Classification Results

In [21]:
y_pred_classes = tf.argmax(y_pred, axis=1)
y_test_classes = tf.argmax(y_test, axis=1)

print(classification_report(y_test_classes, y_pred_classes))

              precision    recall  f1-score   support

           0       0.82      1.00      0.90       597
           1       1.00      0.78      0.88       609

    accuracy                           0.89      1206
   macro avg       0.91      0.89      0.89      1206
weighted avg       0.91      0.89      0.89      1206



# Confusion MatriX

In [16]:
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)

class_labels = ['Legitimate', 'Suspicious']

conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)

print("Confusion Matrix:")
print(conf_matrix_df)

Confusion Matrix:
            Legitimate  Suspicious
Legitimate         597           0
Suspicious         124         485
