### Libraries

In [53]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os
import pandas as pd
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)
from psutil import virtual_memory    
import datetime
import numpy as np                 

'machine learning imports'
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [35]:
print('TensorFlow version:', tf.__version__)

TensorFlow version: 2.12.0


### GPU

In [36]:
# if gpu available print name, else use cpu
if tf.test.is_gpu_available():
    print('GPU:', tf.test.gpu_device_name())
else:
    print('CPU:', tf.config.list_physical_devices('CPU'))

CPU: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


2023-10-27 23:55:18.143798: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-27 23:55:18.144048: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


### Load Data

In [37]:
# Define the augmentation method and the data directory
AUGMENTATION = 'SMOTE'
data_dir = os.path.abspath('../data')

# Load the train and test datasets
df_train, df_test = utils.load_dataset(data_directory=data_dir, 
                                       augmentation=AUGMENTATION, 
                                       ignore_columns=['mqtt.topic_0.0.1', 
                                                       'Unnamed: 0', 
                                                       'mqtt.topic_Temperature_and_Humidity', 
                                                       'mbtcp.unit_id', 
                                                       'mbtcp.trans_id'])   

Loading complete.
Training data: 1500000 rows, 98 columns. 
Test data: 381934 rows, 98 columns.


In [38]:
# def load_and_validate_data(data_dir, augmentation='None'):
#     """
#     Load and validate training and test data based on the augmentation option.

#     Parameters:
#     - data_dir: str, path to the directory containing data files.
#     - augmentation: str, augmentation option ('None', 'SMOTE', 'SMOTE-NC', 'RealTabFormer', 'GReaT').

#     Returns:
#     - df_train: pd.DataFrame, training dataset.
#     - df_test: pd.DataFrame, test dataset.
#     """

#     # Define file paths based on augmentation option
#     file_paths = {
#         'None': {'train': 'EdgeIIot_train_100k.csv', 'test' : 'EdgeIIot_test.csv'},
#         'SMOTE': {'train': 'EdgeIIot_train_100k_SMOTE.csv', 'test' : 'EdgeIIot_test.csv'},
#         'SMOTE-NC': {'train': 'EdgeIIot_train_100k_SMOTE_NC.csv', 'test' : 'EdgeIIot_test.csv'},
#         'RealTabFormer': {'train': 'EdgeIIot_train_100k_RealTabFormer.csv', 'test' : 'EdgeIIot_test.csv'},
#         'GReaT': {'train': 'EdgeIIot_train_100k_GReaT.csv', 'test' : 'EdgeIIot_test.csv'},
#     }

#     # Validate augmentation option
#     if augmentation not in file_paths:
#         raise ValueError("AUGMENTATION option not recognized. Please choose between 'None', 'SMOTE', 'SMOTE-NC', 'RealTabFormer', or 'GReaT'.")

#     # Load training data
#     df_train_path = os.path.join(data_dir, file_paths[augmentation]['train'])
#     df_train = pd.read_csv(df_train_path, low_memory=False)

#     # Load test data
#     df_test_path = os.path.join(data_dir, file_paths[augmentation]['test'])
#     df_test = pd.read_csv(df_test_path, low_memory=False)

#     # Validate if test data has the same columns as training data
#     if not set(df_train.columns) == set(df_test.columns):
#         raise ValueError("Columns in the test data do not match columns in the training data.")

#     # Print information about datasets
#     print(f"Training data shape: {df_train.shape}, Test data shape: {df_test.shape}")

#     return df_train, df_test

# # Example usage:
# data_dir = os.path.abspath('../data')
# AUGMENTATION = 'None'
# df_train, df_test = load_and_validate_data(data_dir, augmentation=AUGMENTATION)


### Data Preparation

In [39]:
# Creates X_train, y_train
X_train = df_train.drop(['Attack_label', 'Attack_type'], axis=1)
y_train = df_train['Attack_type']

# Creates X_test, y_test
X_test = df_test.drop(['Attack_label', 'Attack_type'], axis=1)
y_test = df_test['Attack_type']

#### Convert categorical features to one-hot encoded features

In [40]:
# One-hot encode the training and test labels if needed
X_train_enc, X_test_enc = utils.one_hot_encode_categorical(X_train, X_test, random_state=42)

No categorical features found. Returning original datasets.


In [41]:
# # Concatenate X_train and X_test
# X_comb = pd.concat([X_train[categorical_features], X_test[categorical_features]], axis=0)

# # Apply one-hot encoding (get_dummies)
# X_comb_enc = pd.get_dummies(X_comb, columns=categorical_features, drop_first=True,dtype='int8')

# # Split back into X_train and X_test
# X_train_enc, X_test_enc = train_test_split(
#     X_comb_enc, test_size=len(X_test), random_state=42)

# # Print the shape of X_train_enc and X_test_enc
# print(f'X_train_enc shape: {X_train_enc.shape}, X_test_enc shape: {X_test_enc.shape}')

In [42]:
# # Drop columns categorical_features from X_train and X_test 
# X_train = X_train.drop(categorical_features, axis=1)
# X_test = X_test.drop(categorical_features, axis=1)

# # Concatenate X_train and X_test with X_train_enc and X_test_enc and drop index column
# X_train = pd.concat([X_train.reset_index(drop=True), X_train_enc.reset_index(drop=True)], axis=1)
# X_test = pd.concat([X_test.reset_index(drop=True), X_test_enc.reset_index(drop=True)], axis=1)

# # Print the shape of X_train and X_test
# print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

#### Label Encoding

In [43]:
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

y_train_bin = tf.keras.utils.to_categorical(y_train_enc)
y_test_bin = tf.keras.utils.to_categorical(y_test_enc)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


#### One-hot encoding of labels

#### Standardization of Data

In [55]:
X_train_scaled, X_test_scaled = utils.scale_data(X_train, X_test, scaler_type='standard')

{'Test Data': {'Mean': 22.02, 'Standard Deviation': 123.96},
 'Train Data': {'Mean': 0.0, 'Standard Deviation': 0.95}}


In [52]:
# from sklearn.preprocessing import StandardScaler, RobustScaler

# # Instantiate the MinMaxScaler
# scaler = StandardScaler



# # Fit the scaler to the training data and transform
# X_train = scaler.fit_transform(X_train_enc)

# # Transform the test data
# X_test = scaler.transform(X_test_enc)

# # Mean and standard deviation of X_train and X_test
# print(f'X_train mean: {X_train.mean():.2f}, X_train std: {X_train.std():.2f}')
# print(f'X_test  mean: {X_test.mean():.2f},  X_test  std: {X_test.std():.2f}')


X_train mean: 0.09, X_train std: 0.28
X_test  mean: 0.10,  X_test  std: 0.29


In [51]:
X_train_enc

Unnamed: 0,arp.opcode,arp.hw.size,icmp.checksum,icmp.seq_le,icmp.unused,http.content_length,http.response,http.tls_port,tcp.ack,tcp.ack_raw,...,mqtt.conack.flags_1461589,mqtt.conack.flags_1461591,mqtt.conack.flags_1574358,mqtt.conack.flags_1574359,mqtt.protoname_0.0,mqtt.protoname_0,mqtt.protoname_0.0.1,mqtt.protoname_MQTT,mqtt.topic_0.0,mqtt.topic_0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,303.000000,3.453509e+09,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000e+00,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,1.080021e+09,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1499995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,103.317539,2.856644e+09,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1499996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,292.818834,1.545400e+09,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1499997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,441.000000,8.813645e+08,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1499998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,3.087147e+09,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Model Training

In [56]:
# Define the model
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(len(le.classes_), activation='softmax')) 

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 

# ReduceLROnPlateau callback
monitor = tf.keras.callbacks.ReduceLROnPlateau(monitor="loss",
                                               factor=0.3,
                                               mode="min",
                                               patience=10,
                                               verbose=1,
                                               min_lr=1e-8)

# Checkpoint callback                                                
checkpoint = ModelCheckpoint('../checkpoints/neural_net/best_model_multiclass.h5', 
                              monitor='loss', 
                              save_best_only=True)

In [57]:
X_train.shape, y_train_bin.shape

((1500000, 96), (1500000, 15))

In [58]:
# Shuffle training data
X_train, y_train_bin = shuffle(X_train, y_train_bin, random_state=42)

# Train the model
history = model.fit(X_train, 
                    y_train_bin, 
                    epochs=100, 
                    batch_size=512, 
                    callbacks=[monitor, checkpoint])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 58: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/

### Model Evaluation

In [None]:
# predict probabilities for test set and get the index of the highest probability
predictions = tf.argmax(model.predict(X_test), axis=1)
predictions[:3]



In [None]:
# Calculate metrics 
accuracy = metrics.accuracy_score(tf.argmax(y_test, axis=1), predictions)
precision_m = metrics.precision_score(tf.argmax(y_test, axis=1), predictions, average='macro', zero_division=1)
recall_m = metrics.recall_score(tf.argmax(y_test, axis=1), predictions, average='macro')
f1_score_m = metrics.f1_score(tf.argmax(y_test, axis=1), predictions, average='macro')
precision_w = metrics.precision_score(tf.argmax(y_test, axis=1), predictions, average='weighted', zero_division=1)
recall_w = metrics.recall_score(tf.argmax(y_test, axis=1), predictions, average='weighted')
f1_score_w = metrics.f1_score(tf.argmax(y_test, axis=1), predictions, average='weighted')

print("Model Evaluation Metrics")
print("~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Accuracy: {:.2f}".format(accuracy))
print("Precision (macro): {:.2f}".format(precision_m))
print("Recall (macro): {:.2f}".format(recall_m))
print("F1 (macro): {:.2f}".format(f1_score_m))
print("Precision (weighted): {:.2f}".format(precision_w))
print("Recall (weighted): {:.2f}".format(recall_w))
print("F1 (weighted): {:.2f}".format(f1_score_w))
print("~~~~~~~~~~~~~~~~~~~~~~~~~")

Model Evaluation Metrics
~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: 0.95
Precision (macro): 0.86
Recall (macro): 0.77
F1 (macro): 0.78
Precision (weighted): 0.96
Recall (weighted): 0.95
F1 (weighted): 0.94
~~~~~~~~~~~~~~~~~~~~~~~~~


#### Save Metrics Results 

In [None]:
# create dictionary for results
results = {
    "model": "Neural Net",
    "augmentations": AUGMENTATION,
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": accuracy,
    "precision_macro": precision_m,
    "recall_macro": recall_m,
    "f1_macro": f1_score_m,
    "precision_weighted": precision_w,
    "recall_weighted": recall_w,
    "f1_weighted": f1_score_w
    }

# save results to csv   
utils.save_results_to_csv([results], '../results/metrics/neural_net.csv')

#### Confusion Matrix

In [None]:
conf_mat = metrics.confusion_matrix(tf.argmax(y_test, axis=1), predictions)

attack_labels = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP', 
'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware', 
'SQL_injection', 'Uploading', 'Vulnerability_scanner', 'XSS']

# Create a dataframe from the confusion matrix
conf_mat_df = pd.DataFrame(conf_mat, 
                            index = attack_labels, 
                            columns = attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'


# Save the confusion matrix
conf_mat_df.to_csv(f"../results/conf_matrix/{results['model']}_{results['augmentations']}.csv")
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,4700,0,0,81,1,0,0,0,0,0,0,0,0,0,0
DDoS_HTTP,0,9171,0,0,0,0,0,0,0,0,0,0,0,5,452
DDoS_ICMP,0,0,13463,0,5,33,0,0,0,0,0,0,0,0,0
DDoS_TCP,0,0,0,10009,0,0,0,0,0,0,0,0,0,0,0
DDoS_UDP,0,0,0,0,24601,0,0,0,0,0,0,0,0,0,0
Fingerprinting,24,0,25,23,5,68,0,0,0,0,0,0,0,0,1
MITM,0,0,0,0,0,0,76,0,0,0,0,0,0,0,0
Normal,0,2,0,3,0,0,0,272766,0,0,0,0,0,4,1
Password,0,0,0,0,0,0,0,0,1768,0,0,7446,894,0,0
Port_Scanning,5,0,0,2027,0,0,0,0,0,2030,0,0,0,0,0
