In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed

from keras.models import Model
from keras.optimizers import Adam
from keras.models import load_model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Input, LSTM, BatchNormalization, Dropout, Dense, Add, Flatten

from tensorflow.keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

2024-07-30 12:02:24.058992: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-30 12:02:24.110740: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 12:02:24.110775: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 12:02:24.110808: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-30 12:02:24.119853: I tensorflow/core/platform/cpu_feature_g

# Load csv from Desktop

In [2]:
NUM_CLASSES = 2
CLASSES = np.array(['Legitimate', 'Suspicious'])
DATASET_DIR = "./"
VECTOR_LENGTH = 1 * 816

def csvToVector(file_path):
    data = pd.read_csv(file_path, header=None)
    vector = data.values.flatten()
    return vector

def process_file(class_idx, file_path):
    vector = csvToVector(file_path)
    return (vector, class_idx)

def load_data(dataset_dir):
    X = []
    y = []
    subdirs = ['benign_cms1', 'malware_cms1']
    futures = []

    with ThreadPoolExecutor() as executor:
        for class_idx, class_name in enumerate(subdirs):
            class_dir = os.path.join(dataset_dir, class_name)
            for file_name in os.listdir(class_dir):
                if file_name.endswith('.csv'):
                    file_path = os.path.join(class_dir, file_name)
                    futures.append(executor.submit(process_file, class_idx, file_path))

        for future in as_completed(futures):
            vector, class_idx = future.result()
            X.append(vector)
            y.append(class_idx)

    X = np.array(X)
    y = np.array(y)
    return X, y

In [3]:
X, y = load_data(DATASET_DIR)

In [4]:
print(X.shape)
print(y.shape)
print(X)
print(y)

(4020, 816)
(4020,)
[[  192     0     0 ...  9783     0    27]
 [  201     0     0 ...  8877     0   239]
 [  226     0     0 ... 29687     0    37]
 ...
 [  148     0     0 ...  3457     0    56]
 [  127     0     0 ...  2675     0    59]
 [  186     0     0 ...  2669     0    57]]
[0 0 0 ... 1 1 1]


# Train, Validation, Test Split and Nomalize

In [5]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)

#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_train = X_train / 299.0
#X_val = X_val / 299.0
X_test = X_test / 299.0

y_train = to_categorical(y_train, 2)
#y_val = to_categorical(y_val, 2)
y_test = to_categorical(y_test, 2)

In [6]:
print(X_train.shape)
print(X_test.shape)
#print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
#print(y_val.shape)

(2814, 816)
(1206, 816)
(2814, 2)
(1206, 2)


# 1D CNN Architecture

In [7]:
input_layer = Input(shape=(VECTOR_LENGTH, 1))

x = LSTM(32, return_sequences=True)(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = LSTM(64, return_sequences=True)(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = LSTM(128, return_sequences=False)(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

x = Flatten()(x)

x = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

output_layer = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)

opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

2024-07-30 12:03:11.623199: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31350 MB memory:  -> device: 0, name: CUDA GPU, pci bus id: 0000:06:00.0, compute capability: 7.0
2024-07-30 12:03:11.623744: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 31350 MB memory:  -> device: 1, name: CUDA GPU, pci bus id: 0000:2f:00.0, compute capability: 7.0
2024-07-30 12:03:11.624224: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 31350 MB memory:  -> device: 2, name: CUDA GPU, pci bus id: 0000:86:00.0, compute capability: 7.0


In [8]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 816, 1)]          0         
                                                                 
 lstm (LSTM)                 (None, 816, 32)           4352      
                                                                 
 batch_normalization (Batch  (None, 816, 32)           128       
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 816, 32)           0         
                                                                 
 lstm_1 (LSTM)               (None, 816, 64)           24832     
                                                                 
 batch_normalization_1 (Bat  (None, 816, 64)           256       
 chNormalization)                                            

# CheckPoint

In [9]:
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, min_lr=0.00001)
checkpoint = ModelCheckpoint(
    filepath='/tmp/CMS1_LSTM_CheckPoint.h5',
    save_best_only=True,
    monitor='accuracy',
    mode='max',
    verbose=1
)

# Model Training

In [10]:
model.fit(X_train, y_train, epochs=30, batch_size=32, callbacks=[reduce_lr, checkpoint])

Epoch 1/30


2024-07-30 12:03:18.266952: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907
2024-07-30 12:03:19.620797: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fcde86e4330 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-30 12:03:19.620833: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): CUDA GPU, Compute Capability 7.0
2024-07-30 12:03:19.620839: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (1): CUDA GPU, Compute Capability 7.0
2024-07-30 12:03:19.620845: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (2): CUDA GPU, Compute Capability 7.0
2024-07-30 12:03:19.626223: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-30 12:03:19.716255: I ./tensorflow/compiler/jit/device_compiler.h:186] Compi

Epoch 1: accuracy improved from -inf to 0.72210, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 2/30
 1/88 [..............................] - ETA: 6s - loss: 1.6431 - accuracy: 0.8750

  saving_api.save_model(


Epoch 2: accuracy improved from 0.72210 to 0.79709, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 3/30
Epoch 3: accuracy improved from 0.79709 to 0.81308, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 4/30
Epoch 4: accuracy improved from 0.81308 to 0.84009, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 5/30
Epoch 5: accuracy did not improve from 0.84009
Epoch 6/30
Epoch 6: accuracy improved from 0.84009 to 0.85217, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 7/30
Epoch 7: accuracy improved from 0.85217 to 0.85714, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 8/30
Epoch 8: accuracy did not improve from 0.85714
Epoch 9/30
Epoch 9: accuracy improved from 0.85714 to 0.86176, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 10/30
Epoch 10: accuracy improved from 0.86176 to 0.86247, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 11/30
Epoch 11: accuracy improved from 0.86247 to 0.86354, saving model to /tmp/CMS1_LSTM_CheckPoint.h5
Epoch 12/30
Epoch 12: accur

<keras.src.callbacks.History at 0x7fcfc864f610>

# Load Best CheckPoint

In [11]:
cp_model = load_model('/tmp/CMS1_LSTM_CheckPoint.h5')
cp_model.evaluate(X_test, y_test, batch_size=1000)



[0.17363925278186798, 0.9353233575820923]

# Evaluate

In [12]:
y_pred = cp_model.predict(X_test)



In [13]:
preds_single = CLASSES[np.argmax(y_pred, axis = -1)]
actual_single = CLASSES[np.argmax(y_test, axis = -1)]

# Classification Results

In [14]:
y_pred_classes = tf.argmax(y_pred, axis=1)
y_test_classes = tf.argmax(y_test, axis=1)

print(classification_report(y_test_classes, y_pred_classes))

              precision    recall  f1-score   support

           0       0.94      0.93      0.94       622
           1       0.93      0.94      0.93       584

    accuracy                           0.94      1206
   macro avg       0.94      0.94      0.94      1206
weighted avg       0.94      0.94      0.94      1206



# Confusion MatriX

In [15]:
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)

class_labels = ['Legitimate', 'Suspicious']

conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)

print("Confusion Matrix:")
print(conf_matrix_df)

Confusion Matrix:
            Legitimate  Suspicious
Legitimate         581          41
Suspicious          37         547
