### Libraries

In [1]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os
import pandas as pd
import datetime
import numpy as np
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)        

'machine learning imports'
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

### GPU

In [2]:
DEVICE = 'CUDA' if torch.cuda.is_available() else 'CPU'
print("Using {}".format(DEVICE))

# Info on the device available memory
if DEVICE == 'CUDA':
    gpu = torch.device('cuda')
    total_memory = torch.cuda.get_device_properties(gpu).total_memory / 1024**3
    current_memory = torch.cuda.memory_allocated(gpu) / 1024**3

    print(f'Total GPU memory: {total_memory:.1f} GB | Current usage: {current_memory:.1f} GB')

Using CPU


### Load data

In [3]:
AUGMENTATION = 'SMOTE-NC'

data_dir = os.path.abspath('../data')

# Load the train and test datasets
df_train, df_test = utils.load_dataset(data_directory=data_dir, 
                                       augmentation=AUGMENTATION, 
                                       ignore_columns=['mbtcp.unit_id', 
                                                       'mbtcp.trans_id']) 

Loading complete.
Train data: 1500000 rows, 46 columns. 
Test data: 381934 rows, 46 columns.


### Data preparation

In [4]:
# Creates X_train, y_train
X_train = df_train.drop(['Attack_label', 'Attack_type'], axis=1)
y_train = df_train['Attack_type']

# Creates X_test, y_test
X_test = df_test.drop(['Attack_label', 'Attack_type'], axis=1)
y_test = df_test['Attack_type']

#### Convert categorical features to one-hot encoded features

In [5]:
# Encode the training and test labels if needed
X_train_enc, X_test_enc, info = utils.encode_categorical(X_train, X_test, encoding='label')

Categorical features to be encoded:

mqtt.topic
dns.qry.name.len
http.request.method
http.referer
mqtt.protoname
http.request.version
mqtt.conack.flags

Encoding complete.
No of features before encoding: 44
No of features after encoding: 44


#### Label encoding

In [6]:
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model Training

In [7]:
# Shuffle training data
X_train, y_train = shuffle(X_train, y_train, random_state=42)

if AUGMENTATION == 'SMOTE'or AUGMENTATION == 'SMOTE-NC':
    # pytorch_tabnet default parameters
    tabnet = TabNetClassifier()
    
    tabnet.fit(X_train=X_train_enc.values, 
               y_train=y_train_enc,
               augmentations=None,
               max_epochs=100,
               )
else: # AUGMENTATION == 'None', 'RealTabFormer', 'GReaT'

    cat_idxs = [ i for i, f in enumerate(X_train_enc.columns) if f in cat_cols]
    cat_dims = [ cat_dims[f] for i, f in enumerate(X_train_enc.columns) if f in cat_dims]

    tabnet = TabNetClassifier(cat_idxs=cat_idxs,
                              cat_dims=cat_dims,
                              cat_emb_dim=10,    # categorical features embedding dimension
                              )
    tabnet.fit(X_train=X_train_enc.values, 
               y_train=y_train_enc,
               augmentations=None,
               max_epochs=100,
               )



epoch 0  | loss: 0.6073  |  0:01:02s
epoch 1  | loss: 0.49048 |  0:02:03s
epoch 2  | loss: 0.48044 |  0:03:05s
epoch 3  | loss: 0.47701 |  0:04:06s
epoch 4  | loss: 0.47755 |  0:05:08s
epoch 5  | loss: 0.48093 |  0:06:09s
epoch 6  | loss: 0.47259 |  0:07:10s
epoch 7  | loss: 0.46929 |  0:08:12s
epoch 8  | loss: 0.47726 |  0:09:13s
epoch 9  | loss: 0.4685  |  0:10:15s
epoch 10 | loss: 0.46693 |  0:11:16s
epoch 11 | loss: 0.46516 |  0:12:17s
epoch 12 | loss: 0.46731 |  0:13:18s
epoch 13 | loss: 0.4785  |  0:14:19s
epoch 14 | loss: 0.47082 |  0:15:21s
epoch 15 | loss: 0.46604 |  0:16:23s
epoch 16 | loss: 0.46504 |  0:17:24s
epoch 17 | loss: 0.46357 |  0:18:26s
epoch 18 | loss: 0.46389 |  0:19:28s
epoch 19 | loss: 0.46316 |  0:20:30s
epoch 20 | loss: 0.46177 |  0:21:31s
epoch 21 | loss: 0.46151 |  0:22:33s
epoch 22 | loss: 0.4604  |  0:23:35s
epoch 23 | loss: 0.46093 |  0:24:37s
epoch 24 | loss: 0.46072 |  0:25:40s
epoch 25 | loss: 0.45794 |  0:26:41s
epoch 26 | loss: 0.45805 |  0:27:44s
e

#### Save model

In [8]:
saved_filename = tabnet.save_model(f'checkpoints/tabnet/tabnet_{AUGMENTATION}')

Successfully saved model at checkpoints/tabnet/tabnet_SMOTE-NC.zip


### Model Evaluation

In [9]:
predictions = tabnet.predict(X_test_enc.values)

#### Metrics

In [10]:
accuracy = metrics.accuracy_score(y_test_enc, predictions)
precision_w = metrics.precision_score(y_test_enc, predictions, average='weighted', zero_division=1)
recall_w = metrics.recall_score(y_test_enc, predictions, average='weighted')
f1_score_w = metrics.f1_score(y_test_enc, predictions, average='weighted')
precision_m = metrics.precision_score(y_test_enc, predictions, average='macro', zero_division=1)
recall_m = metrics.recall_score(y_test_enc, predictions, average='macro')
f1_score_m = metrics.f1_score(y_test_enc, predictions, average='macro')

In [11]:
# Create dictionary for results
results = {
    "model": "TabNet",
    "augmentations": AUGMENTATION,
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": accuracy,
    "precision_macro": precision_m,
    "recall_macro": recall_m,
    "f1_macro": f1_score_m,
    "precision_weighted": precision_w,
    "recall_weighted": recall_w,
    "f1_weighted": f1_score_w
    }

utils.print_results_table(results)

╒══════════════════════╤═════════╕
│ Metric               │ Value   │
╞══════════════════════╪═════════╡
│ Accuracy             │ 90.85%  │
├──────────────────────┼─────────┤
│ Precision (macro)    │ 81.50%  │
├──────────────────────┼─────────┤
│ Recall (macro)       │ 63.00%  │
├──────────────────────┼─────────┤
│ F1 (macro)           │ 60.37%  │
├──────────────────────┼─────────┤
│ Precision (weighted) │ 94.02%  │
├──────────────────────┼─────────┤
│ Recall (weighted)    │ 90.85%  │
├──────────────────────┼─────────┤
│ F1 (weighted)        │ 90.04%  │
╘══════════════════════╧═════════╛


#### Save Metrics Results 

In [12]:
# save results to csv   
utils.save_results_to_csv([results], '../results/metrics/tabnet.csv')

#### Confusion matrix

In [13]:
conf_mat = metrics.confusion_matrix(y_test_enc, predictions)

attack_labels = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP', 
'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware', 
'SQL_injection', 'Uploading', 'Vulnerability_scanner', 'XSS']

# Create a dataframe from the confusion matrix
conf_mat_df = pd.DataFrame(conf_mat, 
                           index = attack_labels, 
                           columns = attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix
conf_mat_df.to_csv(f"../results/conf_matrix/{results['model']}_{results['augmentations']}.csv")
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,4453,0,1,169,0,0,0,0,0,86,73,0,0,0,0
DDoS_HTTP,0,3290,0,0,0,0,0,0,0,0,0,0,0,6334,4
DDoS_ICMP,0,0,13501,0,0,0,0,0,0,0,0,0,0,0,0
DDoS_TCP,0,0,0,5816,0,0,0,0,0,4193,0,0,0,0,0
DDoS_UDP,0,0,4129,0,20472,0,0,0,0,0,0,0,0,0,0
Fingerprinting,0,0,99,24,0,0,0,0,0,23,0,0,0,0,0
MITM,0,0,0,0,0,0,76,0,0,0,0,0,0,0,0
Normal,0,0,0,0,0,0,7,272743,0,0,0,0,26,0,0
Password,0,0,0,0,0,0,0,0,9884,0,0,224,0,0,0
Port_Scanning,0,0,0,2035,0,0,0,0,0,2027,0,0,0,0,0
