In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed

from keras.models import Model
from keras.optimizers import Adam
from keras.models import load_model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Input, LSTM, BatchNormalization, Dropout, Dense, Add, Flatten

from tensorflow.keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

2024-07-30 22:11:26.431798: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-30 22:11:26.483882: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 22:11:26.483923: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 22:11:26.483955: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-30 22:11:26.493153: I tensorflow/core/platform/cpu_feature_g

# Load csv from Desktop

In [11]:
NUM_CLASSES = 2
CLASSES = np.array(['Legitimate', 'Suspicious'])
DATASET_DIR = "./"
VECTOR_LENGTH = 1 * 165

def csvToVector(file_path):
    data = pd.read_csv(file_path, header=None)
    vector = data.values.flatten()
    return vector

def process_file(class_idx, file_path):
    vector = csvToVector(file_path)
    return (vector, class_idx)

def load_data(dataset_dir):
    X = []
    y = []
    subdirs = ['benign_cms4', 'malware_cms4']
    futures = []

    with ThreadPoolExecutor() as executor:
        for class_idx, class_name in enumerate(subdirs):
            class_dir = os.path.join(dataset_dir, class_name)
            for file_name in os.listdir(class_dir):
                if file_name.endswith('.csv'):
                    file_path = os.path.join(class_dir, file_name)
                    futures.append(executor.submit(process_file, class_idx, file_path))

        for future in as_completed(futures):
            vector, class_idx = future.result()
            X.append(vector)
            y.append(class_idx)

    X = np.array(X)
    y = np.array(y)
    return X, y

In [12]:
X, y = load_data(DATASET_DIR)

In [13]:
print(X.shape)
print(y.shape)
print(X)
print(y)

(4020, 165)
(4020,)
[[   69 11048   106 ...     0   211     0]
 [   93 10187   137 ...     0   219     0]
 [   71 10281   160 ...     0   219     0]
 ...
 [  123 10134   186 ...     0   360     1]
 [   93  9696   127 ...     0   336     0]
 [  147 11389   145 ...     0   419     0]]
[0 0 0 ... 1 1 1]


# Train, Validation, Test Split and Nomalize

In [14]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=41)

#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_train = X_train / 299.0
#X_val = X_val / 299.0
X_test = X_test / 299.0

y_train = to_categorical(y_train, 2)
#y_val = to_categorical(y_val, 2)
y_test = to_categorical(y_test, 2)

In [15]:
print(X_train.shape)
print(X_test.shape)
#print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
#print(y_val.shape)

(2814, 165)
(1206, 165)
(2814, 2)
(1206, 2)


# 1D CNN Architecture

In [16]:
input_layer = Input(shape=(VECTOR_LENGTH, 1))

x = LSTM(32, return_sequences=True)(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = LSTM(64, return_sequences=True)(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = LSTM(128, return_sequences=False)(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

x = Flatten()(x)

x = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

output_layer = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)

opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [17]:
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 165, 1)]          0         
                                                                 
 lstm_3 (LSTM)               (None, 165, 32)           4352      
                                                                 
 batch_normalization_4 (Bat  (None, 165, 32)           128       
 chNormalization)                                                
                                                                 
 dropout_4 (Dropout)         (None, 165, 32)           0         
                                                                 
 lstm_4 (LSTM)               (None, 165, 64)           24832     
                                                                 
 batch_normalization_5 (Bat  (None, 165, 64)           256       
 chNormalization)                                          

# CheckPoint

In [32]:
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, min_lr=0.00001)
checkpoint = ModelCheckpoint(
    filepath='/tmp/CMS4_LSTM_CheckPoint.h5',
    save_best_only=True,
    monitor='accuracy',
    mode='max',
    verbose=1
)

# Model Training

In [33]:
model.fit(X_train, y_train, epochs=30, batch_size=32, callbacks=[checkpoint])

Epoch 1/30
Epoch 1: accuracy improved from -inf to 0.95558, saving model to /tmp/CMS4_LSTM_CheckPoint.h5
Epoch 2/30
 4/88 [>.............................] - ETA: 1s - loss: 0.1222 - accuracy: 0.9453

  saving_api.save_model(


Epoch 2: accuracy did not improve from 0.95558
Epoch 3/30
Epoch 3: accuracy did not improve from 0.95558
Epoch 4/30
Epoch 4: accuracy did not improve from 0.95558
Epoch 5/30
Epoch 5: accuracy did not improve from 0.95558
Epoch 6/30
Epoch 6: accuracy did not improve from 0.95558
Epoch 7/30
Epoch 7: accuracy did not improve from 0.95558
Epoch 8/30
Epoch 8: accuracy did not improve from 0.95558
Epoch 9/30
Epoch 9: accuracy improved from 0.95558 to 0.95665, saving model to /tmp/CMS4_LSTM_CheckPoint.h5
Epoch 10/30
Epoch 10: accuracy did not improve from 0.95665
Epoch 11/30
Epoch 11: accuracy did not improve from 0.95665
Epoch 12/30
Epoch 12: accuracy did not improve from 0.95665
Epoch 13/30
Epoch 13: accuracy did not improve from 0.95665
Epoch 14/30
Epoch 14: accuracy did not improve from 0.95665
Epoch 15/30
Epoch 15: accuracy did not improve from 0.95665
Epoch 16/30
Epoch 16: accuracy did not improve from 0.95665
Epoch 17/30
Epoch 17: accuracy did not improve from 0.95665
Epoch 18/30
Epoch

<keras.src.callbacks.History at 0x7f89ec43a950>

# Load Best CheckPoint

In [34]:
cp_model = load_model('/tmp/CMS4_LSTM_CheckPoint.h5')
cp_model.evaluate(X_test, y_test, batch_size=1000)



[0.16192427277565002, 0.9519071578979492]

# Evaluate

In [35]:
y_pred = cp_model.predict(X_test)



In [36]:
preds_single = CLASSES[np.argmax(y_pred, axis = -1)]
actual_single = CLASSES[np.argmax(y_test, axis = -1)]

# Classification Results

In [37]:
y_pred_classes = tf.argmax(y_pred, axis=1)
y_test_classes = tf.argmax(y_test, axis=1)

print(classification_report(y_test_classes, y_pred_classes))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96       633
           1       0.98      0.92      0.95       573

    accuracy                           0.95      1206
   macro avg       0.95      0.95      0.95      1206
weighted avg       0.95      0.95      0.95      1206



# Confusion MatriX

In [38]:
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)

class_labels = ['Legitimate', 'Suspicious']

conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)

print("Confusion Matrix:")
print(conf_matrix_df)

Confusion Matrix:
            Legitimate  Suspicious
Legitimate         621          12
Suspicious          46         527
