### Libraries

In [1]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os                              
import pandas as pd                     
import sys                              
sys.path.append(os.path.abspath('..'))
from src import utils
import numpy as np
import datetime
import re

'machine learning library imports'
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
%reset?

[1;31mDocstring:[0m
Resets the namespace by removing all names defined by the user, if
called without arguments, or by removing some types of objects, such
as everything currently in IPython's In[] and Out[] containers (see
the parameters for details).

Parameters
----------
-f
    force reset without asking for confirmation.
-s
    'Soft' reset: Only clears your namespace, leaving history intact.
    References to objects may be kept. By default (without this option),
    we do a 'hard' reset, giving you a new session and removing all
    references to objects from the current session.
--aggressive
    Try to aggressively remove modules from sys.modules ; this
    may allow you to reimport Python modules that have been updated and
    pick up changes, but can have unintended consequences.

in
    reset input history
out
    reset output history
dhist
    reset directory history
array
    reset only variables that are NumPy arrays

See Also
--------
reset_selective : invoked as ``%re

### Load data

In [3]:
data_dir = os.path.abspath('data')

# Non-augmented dataset
df_train = pd.read_csv(os.path.join(data_dir, 'train_smotenc.csv'), low_memory=False)
AUGMENTATION = 'None'

# SMOTE augmented dataset
# df_train = pd.read_csv(os.path.join(data_dir, 'EdgeIIot_train_100k_SMOTE.csv'), low_memory=False)
# AUGMENTATION = 'SMOTE'

# SMOTE-NC augmented dataset
# df_train = pd.read_csv(os.path.join(data_dir, 'EdgeIIot_train_100k_SMOTE_NC.csv'), low_memory=False)
# AUGMENTATION = 'SMOTE-NC'

# RealTabFormer augmentation dataset
# df_train = pd.read_csv(os.path.join(data_dir, 'EdgeIIot_train_100k_RealTabFormer.csv'), low_memory=False)
# AUGMENTATION = 'RealTabFormer'

# GReaT augmentation dataset
# df_train = pd.read_csv(os.path.join(data_dir, 'EdgeIIot_train_100k_GReaT.csv'), low_memory=False)
# AUGMENTATION = 'GReaT'

# Test data for all datasets
df_test = pd.read_csv(os.path.join(data_dir, 'EdgeIIot_test.csv'), low_memory=False)

### Data preparation

In [4]:
# Drop columns mbtcp.unit_id and mbtcp.trans_id from train and test data    
#df_train = df_train.drop(['mbtcp.unit_id', 'mbtcp.trans_id'], axis=1)
#df_test = df_test.drop(['mbtcp.unit_id', 'mbtcp.trans_id'], axis=1)

# Creates X_train, y_train
X_train = df_train.drop([ 'Attack_type'], axis=1)
y_train = df_train['Attack_type']

# Creates X_test, y_test
X_test = df_test.drop(['Attack_label', 'Attack_type'], axis=1)
y_test = df_test['Attack_type']

#### Convert categorical features to one-hot encoded features

In [5]:
# Extract categorical features
categorical_features = X_train.select_dtypes(include="object").columns

# Concatenate X_train and X_test
X_comb = pd.concat([X_train[categorical_features], X_test[categorical_features]], axis=0)

# Apply one-hot encoding (get_dummies)
X_comb_enc = pd.get_dummies(X_comb, dtype='int8')

# Split back into X_train and X_test
X_train_enc, X_test_enc = train_test_split(
    X_comb_enc, test_size=len(X_test), random_state=42)

# Print the shape of X_train_enc and X_test_enc
print(f'X_train_enc shape: {X_train_enc.shape}, X_test_enc shape: {X_test_enc.shape}')

X_train_enc shape: (1500000, 53), X_test_enc shape: (381934, 53)


In [6]:
# Drop columns categorical_features from X_train and X_test 
X_train = X_train.drop(categorical_features, axis=1)
X_test = X_test.drop(categorical_features, axis=1)

# Concatenate X_train and X_train_enc
X_train = pd.concat([X_train.reset_index(drop=True), X_train_enc.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_enc.reset_index(drop=True)], axis=1)

# Print the shape of X_train and X_test
print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

X_train shape: (1500000, 92), X_test shape: (381934, 92)


#### Label encoding

In [7]:
# instantiate the label encoder
le = LabelEncoder()

# fit and encode the training labels
y_train = le.fit_transform(y_train)

# encode the test labels
y_test = le.transform(y_test)

print('Attack_type and encoded labels:\n')
for i, label in enumerate(le.classes_):
    print(f'{label:23s} {i:d}')

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model training

In [8]:
X_train.columns

Index(['arp.opcode', 'arp.hw.size', 'icmp.checksum', 'icmp.seq_le',
       'icmp.unused', 'http.content_length', 'http.response', 'http.tls_port',
       'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
       'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
       'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.stream',
       'udp.time_delta', 'dns.qry.name', 'dns.qry.qu', 'dns.qry.type',
       'dns.retransmission', 'dns.retransmit_request',
       'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags',
       'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msgtype',
       'mqtt.proto_len', 'mqtt.topic_len', 'mqtt.ver', 'mbtcp.len',
       'mbtcp.trans_id', 'mbtcp.unit_id', 'http.request.method_0',
       'http.request.method_0.0', 'http.request.method_GET',
       'http.request.method_OPTIONS', 'http.request.method_POST',
       'http.request.method_PROPFIND', 'http.request.method_PUT',
       'http.request.meth

In [10]:

#cleaned_columns

In [11]:
# define a regular expression pattern to match (),:; {}><[]$- 
pattern = r'[(),:;{}><\[\]\$\-]|_{2,}|\s+'

# clean X_train  column names using the regular expression pattern
cleaned_columns = [re.sub(pattern, '', col) for col in X_train.columns]

# check for duplicates and if they exist, add suffix to make them unique
unique_names = []
for name in cleaned_columns:
    if name in unique_names:
        # add suffix to make the name unique
        suffix = 1
        while f"{name}_{suffix}" in unique_names:
            suffix += 1
        unique_names.append(f"{name}_{suffix}")
    else:
        unique_names.append(name)

# Replace the column names with the cleaned and unique names
X_train.columns, X_test.columns = unique_names, unique_names

# print the unique feature names
print(unique_names)



['arp.opcode', 'arp.hw.size', 'icmp.checksum', 'icmp.seq_le', 'icmp.unused', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.seq', 'udp.stream', 'udp.time_delta', 'dns.qry.name', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msgtype', 'mqtt.proto_len', 'mqtt.topic_len', 'mqtt.ver', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id', 'http.request.method_0', 'http.request.method_0.0', 'http.request.method_GET', 'http.request.method_OPTIONS', 'http.request.method_POST', 'http.request.method_PROPFIND', 'http.request.method_PUT', 'http.request.method_SEARCH', 'http.request.method_TRACE', 'http.referer___echo93e4r0CVE20146278trueechoecho', 'http.refer

In [12]:
# check if any of X_train column names have [, ] or <, > characters
for name in X_train.columns:
    if '[' in name or ']' in name or '<' in name or '>' in name:
        print(name)

In [13]:
# Shuffle training data
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Instantiate model with 10 decision trees
xgb = XGBClassifier(max_depth=8,
                    learning_rate=0.1,
                    n_estimators=10,
                    verbosity=1,
                    silent=None,
                    #objective="multi:softmax",
                    objective="multi:softprob",
                    booster='gbtree',
                    n_jobs=1,
                    nthread=None,
                    gamma=0,
                    min_child_weight=1,
                    max_delta_step=0,
                    subsample=0.7,
                    colsample_bytree=1,
                    colsample_bylevel=1,
                    colsample_bynode=1,
                    reg_alpha=0,
                    reg_lambda=1,
                    base_score=0.5,
                    random_state=42,
                    seed=42,
                    num_class= (le.classes_).size)

# Train the model
xgb_clf =  xgb.fit(X_train, y_train)

### Model Evaluation

In [14]:
predictions = xgb_clf.predict(X_test)

In [15]:
accuracy = metrics.accuracy_score(y_test, predictions)
precisionw = metrics.precision_score(y_test, predictions, average='weighted', zero_division=1)
recallw = metrics.recall_score(y_test, predictions, average='weighted')
f1_scorew = metrics.f1_score(y_test, predictions, average='weighted')
precisionm = metrics.precision_score(y_test, predictions, average='macro', zero_division=1)
recallm = metrics.recall_score(y_test, predictions, average='macro')
f1_scorem = metrics.f1_score(y_test, predictions, average='macro')


print("Model Evaluation Metrics")
print("~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Accuracy: {:.2f}".format(accuracy))
print("~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Precision (Weighted): {:.2f}".format(precisionw))
print("Recall (Weighted): {:.2f}".format(recallw))
print("F1(Weighted): {:.2f}".format(f1_scorew))
print("~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Precision (Macro): {:.2f}".format(precisionm))
print("Recall (Macro): {:.2f}".format(recallm))
print("F1(Macro): {:.2f}".format(f1_scorem))
print("~~~~~~~~~~~~~~~~~~~~~~~~~")

Model Evaluation Metrics
~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: 0.94
~~~~~~~~~~~~~~~~~~~~~~~~~
Precision (Weighted): 0.96
Recall (Weighted): 0.94
F1(Weighted): 0.94
~~~~~~~~~~~~~~~~~~~~~~~~~
Precision (Macro): 0.85
Recall (Macro): 0.86
F1(Macro): 0.83
~~~~~~~~~~~~~~~~~~~~~~~~~


#### Save Metrics Results 

In [16]:
# create dictionary for results
results = {
    "model": "XGBoost",
    "augmentations": AUGMENTATION,
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": accuracy,
    "precision (weighted)": precisionw,
    "recall (weighted)": recallw,
    "f1 (weighted)": f1_scorew,
    "precision (macro)": precisionm,
    "recall (macro)": recallm,
    "f1 (macro)": f1_scorem
    }
# save results to csv   
utils.save_results_to_csv([results], 'results/metrics/xgb.csv')

#### Confusion Matrix

In [17]:
conf_mat = metrics.confusion_matrix(y_test, predictions)

attack_labels = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP', 
'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware', 
'SQL_injection', 'Uploading', 'Vulnerability_scanner', 'XSS']

# Create a dataframe from the confusion matrix
conf_mat_df = pd.DataFrame(conf_mat, 
                            index = attack_labels, 
                            columns = attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix
conf_mat_df.to_csv(f"results/conf_matrix/{results['model']}_{results['augmentations']}.csv")
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,4424,168,0,0,0,3,0,0,0,81,60,2,26,18,0
DDoS_HTTP,0,8665,0,0,0,29,0,370,120,0,0,145,135,41,123
DDoS_ICMP,0,0,13387,0,0,114,0,0,0,0,0,0,0,0,0
DDoS_TCP,0,0,0,10009,0,0,0,0,0,0,0,0,0,0,0
DDoS_UDP,0,0,0,0,24590,11,0,0,0,0,0,0,0,0,0
Fingerprinting,0,24,5,0,0,95,0,0,0,22,0,0,0,0,0
MITM,0,0,0,0,0,1,75,0,0,0,0,0,0,0,0
Normal,0,9412,0,0,2,164,0,263013,22,0,0,3,102,52,6
Password,0,906,0,0,0,12,0,1901,6746,0,0,0,324,0,219
Port_Scanning,0,0,0,0,0,9,0,0,0,4052,1,0,0,0,0
