### Libraries

In [1]:
%%capture
%reset -f                 # reset variables in the workspace
'generic imports'
import os
import pandas as pd
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)
import numpy as np
import datetime

'machine learning library imports'
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

### Load data

In [2]:
# Augmentation method handed to utils.load_dataset.
# Options: 'None', 'GReaT', 'SMOTE', 'SMOTE-NC' or 'RealTabFormer'
# NOTE(review): spelling of 'GReaT' follows the value used below, which loads
# successfully -- confirm the exact option strings against utils.load_dataset.
AUGMENTATION = 'GReaT'

# Absolute path of the data folder, resolved relative to the working directory
data_dir = os.path.abspath('../data')

# Columns passed as `ignore_columns` to the loader
ignored_columns = ['mbtcp.unit_id', 'mbtcp.trans_id']

# Load the train and test datasets
df_train, df_test = utils.load_dataset(
    data_directory=data_dir,
    augmentation=AUGMENTATION,
    ignore_columns=ignored_columns,
)

Loading complete.
Train data: 1500000 rows, 46 columns. 
Test data: 381934 rows, 46 columns.


### Data preparation

In [3]:
# Both label columns are removed from the feature matrices;
# 'Attack_type' alone is used as the prediction target.
label_columns = ['Attack_label', 'Attack_type']

# Feature matrices
X_train = df_train.drop(columns=label_columns)
X_test = df_test.drop(columns=label_columns)

# Target vectors
y_train = df_train['Attack_type']
y_test = df_test['Attack_type']

#### Convert categorical features to one-hot encoded features

In [4]:
# One-hot encode the categorical feature columns of the train and test sets
# (features, not labels -- the labels are encoded in the "Label encoding" section).
# NOTE(review): `info` is not used in the cells visible here; presumably it
# describes the encoding that was applied -- confirm against utils.encode_categorical.
X_train_enc, X_test_enc, info = utils.encode_categorical(X_train, X_test)

Categorical features to be encoded:

mqtt.conack.flags
mqtt.protoname
http.request.method
http.referer
dns.qry.name.len
mqtt.topic
http.request.version

Encoding complete.
No of features before encoding: 44
No of features after encoding: 100


#### Label encoding

In [5]:
# Encode the Attack_type string labels as integer class ids for both splits.
# `le` is kept because its `classes_` attribute supplies the class names for
# the confusion matrix later in the notebook.
# NOTE(review): presumably a fitted sklearn LabelEncoder -- confirm in utils.encode_labels.
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model training

In [6]:
# Shuffle the encoded training data with a fixed seed so the run is reproducible.
# The result is bound to new names instead of back onto X_train / y_train:
# those names hold the raw, un-encoded frame and labels from the data
# preparation cell, and overwriting them would make a re-run of the encoding
# cells operate on data that has already been encoded.
X_train_shuffled, y_train_shuffled = shuffle(X_train_enc, y_train_enc, random_state=42)

# Instantiate model (fixed seed for reproducible trees)
RF = RandomForestClassifier(random_state=42)

# Train the model. fit() returns the estimator itself, so RF_clf and RF refer
# to the same fitted forest; RF_clf is the name the evaluation cells use.
RF_clf = RF.fit(X_train_shuffled, y_train_shuffled)

### Model Evaluation

In [7]:
# Predict an encoded class id for every row of the encoded test set
predictions = RF_clf.predict(X_test_enc)

In [8]:
# Overall accuracy on the test set
accuracy = metrics.accuracy_score(y_test_enc, predictions)

# Macro averages: every class contributes equally, regardless of its size
precision_m = metrics.precision_score(
    y_test_enc, predictions, average='macro', zero_division=1
)
recall_m = metrics.recall_score(y_test_enc, predictions, average='macro')
f1_score_m = metrics.f1_score(y_test_enc, predictions, average='macro')

# Weighted averages: per-class scores weighted by the number of true samples
precision_w = metrics.precision_score(
    y_test_enc, predictions, average='weighted', zero_division=1
)
recall_w = metrics.recall_score(y_test_enc, predictions, average='weighted')
f1_score_w = metrics.f1_score(y_test_enc, predictions, average='weighted')

# NOTE(review): only the precision calls pass zero_division=1; recall and F1
# keep the library default. With zero_division=1 a class that is never
# predicted scores a precision of 1 -- confirm this asymmetry is intended.

In [9]:
# Moment at which this results record is assembled
run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# One record per run: identifying fields first, then accuracy,
# the macro-averaged scores and the weighted scores.
results = {
    "model": "Random Forest",
    "augmentations": AUGMENTATION,
    "timestamp": run_timestamp,
    "accuracy": accuracy,
    "precision_macro": precision_m,
    "recall_macro": recall_m,
    "f1_macro": f1_score_m,
    "precision_weighted": precision_w,
    "recall_weighted": recall_w,
    "f1_weighted": f1_score_w,
}

# Show the metrics as a table
utils.print_results_table(results)

╒══════════════════════╤═════════╕
│ Metric               │ Value   │
╞══════════════════════╪═════════╡
│ Accuracy             │ 97.34%  │
├──────────────────────┼─────────┤
│ Precision (macro)    │ 80.93%  │
├──────────────────────┼─────────┤
│ Recall (macro)       │ 89.65%  │
├──────────────────────┼─────────┤
│ F1 (macro)           │ 80.71%  │
├──────────────────────┼─────────┤
│ Precision (weighted) │ 98.27%  │
├──────────────────────┼─────────┤
│ Recall (weighted)    │ 97.34%  │
├──────────────────────┼─────────┤
│ F1 (weighted)        │ 97.75%  │
╘══════════════════════╧═════════╛


#### Save Metrics Results 

In [10]:
# Save this run's metrics record to the Random Forest results CSV.
# NOTE(review): whether the helper appends to or overwrites an existing file is
# not visible here -- check utils.save_results_to_csv before re-running.
utils.save_results_to_csv([results], '../results/metrics/randomforest.csv')

#### Confusion Matrix

In [11]:
# Class names in encoder order, and the matching integer ids.
# Assumes utils.encode_labels assigns ids 0..n-1 in le.classes_ order, as the
# mapping printed in the "Label encoding" section shows.
class_names = list(le.classes_)
class_ids = np.arange(len(class_names))

# Passing the ids explicitly keeps the matrix at n_classes x n_classes even
# when a class is missing from both y_test_enc and predictions. Without
# `labels`, that class would be dropped and the DataFrame constructor below
# would fail because the matrix no longer matches the list of class names.
conf_mat = metrics.confusion_matrix(y_test_enc, predictions, labels=class_ids)

# Create a dataframe from the confusion matrix (rows: actual, columns: predicted)
conf_mat_df = pd.DataFrame(conf_mat, index=class_names, columns=class_names)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix, creating the output folder if it does not exist yet
conf_mat_path = f"../results/conf_matrix/{results['model']}_{results['augmentations']}.csv"
os.makedirs(os.path.dirname(conf_mat_path), exist_ok=True)
conf_mat_df.to_csv(conf_mat_path)
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,4494,0,0,0,0,25,75,0,0,148,37,0,0,2,1
DDoS_HTTP,0,8422,0,0,0,13,0,0,0,0,0,0,0,175,1018
DDoS_ICMP,0,0,13495,0,0,6,0,0,0,0,0,0,0,0,0
DDoS_TCP,0,0,0,9892,0,0,117,0,0,0,0,0,0,0,0
DDoS_UDP,0,0,0,0,22736,0,1865,0,0,0,0,0,0,0,0
Fingerprinting,2,0,16,0,0,85,10,0,0,25,8,0,0,0,0
MITM,0,0,0,0,0,0,76,0,0,0,0,0,0,0,0
Normal,0,0,0,0,0,0,0,272776,0,0,0,0,0,0,0
Password,0,97,0,0,0,526,144,0,8044,0,0,811,486,0,0
Port_Scanning,0,0,0,1,0,1,12,0,0,4044,4,0,0,0,0
