In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed

from keras.models import Model
from keras.optimizers import Adam
from keras.models import load_model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Input, LSTM, BatchNormalization, Dropout, Dense, Add, Flatten

from tensorflow.keras.regularizers import l2

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

2024-07-30 12:58:39.607220: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-30 12:58:39.658090: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-30 12:58:39.658125: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-30 12:58:39.658158: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-30 12:58:39.667122: I tensorflow/core/platform/cpu_feature_g

# Load csv from Desktop

In [2]:
NUM_CLASSES = 2
CLASSES = np.array(['Legitimate', 'Suspicious'])
DATASET_DIR = "./"
VECTOR_LENGTH = 1 * 275

def csvToVector(file_path):
    data = pd.read_csv(file_path, header=None)
    vector = data.values.flatten()
    return vector

def process_file(class_idx, file_path):
    vector = csvToVector(file_path)
    return (vector, class_idx)

def load_data(dataset_dir):
    X = []
    y = []
    subdirs = ['benign_cms3', 'malware_cms3']
    futures = []

    with ThreadPoolExecutor() as executor:
        for class_idx, class_name in enumerate(subdirs):
            class_dir = os.path.join(dataset_dir, class_name)
            for file_name in os.listdir(class_dir):
                if file_name.endswith('.csv'):
                    file_path = os.path.join(class_dir, file_name)
                    futures.append(executor.submit(process_file, class_idx, file_path))

        for future in as_completed(futures):
            vector, class_idx = future.result()
            X.append(vector)
            y.append(class_idx)

    X = np.array(X)
    y = np.array(y)
    return X, y

In [3]:
X, y = load_data(DATASET_DIR)

In [4]:
print(X.shape)
print(y.shape)
print(X)
print(y)

(4020, 275)
(4020,)
[[  146  9849   196 ... 17728  8577 16917]
 [  104  9651   191 ... 10045   418  9846]
 [   66  9577   144 ... 10014   593  9764]
 ...
 [  103  9382   148 ... 15659  6834 14347]
 [  118 10815   189 ... 10788   842 14653]
 [  147 11389   145 ...  4170   851  1572]]
[1 1 1 ... 1 1 1]


# Train, Validation, Test Split and Nomalize

In [5]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=41)

#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_train = X_train / 299.0
#X_val = X_val / 299.0
X_test = X_test / 299.0

y_train = to_categorical(y_train, 2)
#y_val = to_categorical(y_val, 2)
y_test = to_categorical(y_test, 2)

In [6]:
print(X_train.shape)
print(X_test.shape)
#print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
#print(y_val.shape)

(2814, 275)
(1206, 275)
(2814, 2)
(1206, 2)


# 1D CNN Architecture

In [7]:
input_layer = Input(shape=(VECTOR_LENGTH, 1))

x = LSTM(32, return_sequences=True)(input_layer)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = LSTM(64, return_sequences=True)(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

x = LSTM(128, return_sequences=False)(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

x = Flatten()(x)

x = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

output_layer = Dense(NUM_CLASSES, activation='softmax')(x)

model = Model(inputs=input_layer, outputs=output_layer)

opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

2024-07-30 12:58:55.794301: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 31350 MB memory:  -> device: 0, name: CUDA GPU, pci bus id: 0000:06:00.0, compute capability: 7.0
2024-07-30 12:58:55.794851: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 31350 MB memory:  -> device: 1, name: CUDA GPU, pci bus id: 0000:2f:00.0, compute capability: 7.0
2024-07-30 12:58:55.795357: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1883] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 31350 MB memory:  -> device: 2, name: CUDA GPU, pci bus id: 0000:86:00.0, compute capability: 7.0


In [8]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 275, 1)]          0         
                                                                 
 lstm (LSTM)                 (None, 275, 32)           4352      
                                                                 
 batch_normalization (Batch  (None, 275, 32)           128       
 Normalization)                                                  
                                                                 
 dropout (Dropout)           (None, 275, 32)           0         
                                                                 
 lstm_1 (LSTM)               (None, 275, 64)           24832     
                                                                 
 batch_normalization_1 (Bat  (None, 275, 64)           256       
 chNormalization)                                            

# CheckPoint

In [9]:
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, min_lr=0.00001)
checkpoint = ModelCheckpoint(
    filepath='/tmp/CMS3_LSTM_CheckPoint.h5',
    save_best_only=True,
    monitor='accuracy',
    mode='max',
    verbose=1
)

# Model Training

In [10]:
model.fit(X_train, y_train, epochs=30, batch_size=32, callbacks=[reduce_lr, checkpoint])

Epoch 1/30


2024-07-30 12:59:02.313779: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907
2024-07-30 12:59:03.665727: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7ff1d4c05d40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-30 12:59:03.665759: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): CUDA GPU, Compute Capability 7.0
2024-07-30 12:59:03.665765: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (1): CUDA GPU, Compute Capability 7.0
2024-07-30 12:59:03.665770: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (2): CUDA GPU, Compute Capability 7.0
2024-07-30 12:59:03.671532: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-30 12:59:03.762405: I ./tensorflow/compiler/jit/device_compiler.h:186] Compi

Epoch 1: accuracy improved from -inf to 0.77790, saving model to /tmp/CMS3_LSTM_CheckPoint.h5
Epoch 2/30
 3/88 [>.............................] - ETA: 2s - loss: 1.4356 - accuracy: 0.9062

  saving_api.save_model(


Epoch 2: accuracy improved from 0.77790 to 0.88095, saving model to /tmp/CMS3_LSTM_CheckPoint.h5
Epoch 3/30
Epoch 3: accuracy improved from 0.88095 to 0.91720, saving model to /tmp/CMS3_LSTM_CheckPoint.h5
Epoch 4/30
Epoch 4: accuracy did not improve from 0.91720
Epoch 5/30
Epoch 5: accuracy improved from 0.91720 to 0.93035, saving model to /tmp/CMS3_LSTM_CheckPoint.h5
Epoch 6/30
Epoch 6: accuracy improved from 0.93035 to 0.93710, saving model to /tmp/CMS3_LSTM_CheckPoint.h5
Epoch 7/30
Epoch 7: accuracy improved from 0.93710 to 0.93852, saving model to /tmp/CMS3_LSTM_CheckPoint.h5
Epoch 8/30
Epoch 8: accuracy improved from 0.93852 to 0.94136, saving model to /tmp/CMS3_LSTM_CheckPoint.h5
Epoch 9/30
Epoch 9: accuracy did not improve from 0.94136
Epoch 10/30
Epoch 10: accuracy improved from 0.94136 to 0.94527, saving model to /tmp/CMS3_LSTM_CheckPoint.h5
Epoch 11/30
Epoch 11: accuracy did not improve from 0.94527
Epoch 12/30
Epoch 12: accuracy did not improve from 0.94527
Epoch 13/30
Epoch

<keras.src.callbacks.History at 0x7ff3dce87580>

# Load Best CheckPoint

In [11]:
cp_model = load_model('/tmp/CMS3_LSTM_CheckPoint.h5')
cp_model.evaluate(X_test, y_test, batch_size=1000)



[0.15131427347660065, 0.9676616787910461]

# Evaluate

In [12]:
y_pred = cp_model.predict(X_test)



In [13]:
preds_single = CLASSES[np.argmax(y_pred, axis = -1)]
actual_single = CLASSES[np.argmax(y_test, axis = -1)]

# Classification Results

In [14]:
y_pred_classes = tf.argmax(y_pred, axis=1)
y_test_classes = tf.argmax(y_test, axis=1)

print(classification_report(y_test_classes, y_pred_classes))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       631
           1       0.97      0.96      0.97       575

    accuracy                           0.97      1206
   macro avg       0.97      0.97      0.97      1206
weighted avg       0.97      0.97      0.97      1206



# Confusion MatriX

In [15]:
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)

class_labels = ['Legitimate', 'Suspicious']

conf_matrix_df = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)

print("Confusion Matrix:")
print(conf_matrix_df)

Confusion Matrix:
            Legitimate  Suspicious
Legitimate         616          15
Suspicious          24         551
