### Libraries

In [1]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os
import pandas as pd
import datetime
import numpy as np
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)        

'machine learning imports'
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

### GPU

In [2]:
DEVICE = 'CUDA' if torch.cuda.is_available() else 'CPU'
print("Using {}".format(DEVICE))

# Info on the device available memory
if DEVICE == 'CUDA':
    gpu = torch.device('cuda')
    total_memory = torch.cuda.get_device_properties(gpu).total_memory / 1024**3
    current_memory = torch.cuda.memory_allocated(gpu) / 1024**3

    print(f'Total GPU memory: {total_memory:.1f} GB | Current usage: {current_memory:.1f} GB')

Using CPU


### Load data

In [3]:
AUGMENTATION = 'GReaT' # options: 'None', 'GReat', 'SMOTE', 'SMOTE-NC' or 'RealTabFormer'

data_dir = os.path.abspath('../data')

# Load the train and test datasets
df_train, df_test = utils.load_dataset(data_directory=data_dir, 
                                       augmentation=AUGMENTATION, 
                                       ignore_columns=['mbtcp.unit_id', 
                                                       'mbtcp.trans_id']) 

Loading complete.
Train data: 1500000 rows, 46 columns. 
Test data: 381934 rows, 46 columns.


### Data preparation

In [4]:
# Creates X_train, y_train
X_train = df_train.drop(['Attack_label', 'Attack_type'], axis=1)
y_train = df_train['Attack_type']

# Creates X_test, y_test
X_test = df_test.drop(['Attack_label', 'Attack_type'], axis=1)
y_test = df_test['Attack_type']

#### Convert categorical features to one-hot encoded features

In [5]:
# Encode the training and test labels if needed
X_train_enc, X_test_enc, info = utils.encode_categorical(X_train, X_test, encoding='label')

Categorical features to be encoded:

http.referer
mqtt.topic
dns.qry.name.len
http.request.version
mqtt.protoname
http.request.method
mqtt.conack.flags

Encoding complete.
No of features before encoding: 44
No of features after encoding: 44


#### Label encoding

In [6]:
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model Training

In [7]:
# Shuffle training data
X_train_enc, y_train_enc = shuffle(X_train_enc, y_train_enc, random_state=42)

if AUGMENTATION == 'SMOTE'or AUGMENTATION == 'SMOTE-NC':
    # pytorch_tabnet default parameters
    tabnet = TabNetClassifier()
    
    tabnet.fit(X_train=X_train_enc.values, 
               y_train=y_train_enc,
               augmentations=None,
               max_epochs=100,
               )
else: # AUGMENTATION == 'None', 'RealTabFormer', 'GReaT'
    
    # retrieve the categorical features indexes and their dimension
    cat_cols = info['categorical_columns']
    cat_idxs = [ i for i, f in enumerate(X_train_enc.columns) if f in cat_cols]
    print(cat_idxs)
    # using X_train_enc.values instead of X_train_enc to avoid error
    cat_dims = [ len(set(X_train_enc.values[:, i])) for i in cat_idxs] 
    print(cat_dims)

    tabnet = TabNetClassifier(cat_idxs=cat_idxs,
                              cat_dims=cat_dims,
                              cat_emb_dim=10,    # categorical features embedding dimension
                              )
    tabnet.fit(X_train=X_train_enc.values, 
               y_train=y_train_enc,
               augmentations=None,
               max_epochs=100,
               )

[6, 7, 8, 25, 31, 39, 40]
[12, 14, 18, 8, 7, 3, 3]




IndexError: index out of range in self

In [113]:
# retrieve the categorical features indexes and their dimension
cat_cols = info['categorical_columns']
cat_idxs = [ i for i, f in enumerate(X_train_enc.columns) if f in cat_cols]
print(cat_idxs)
# using X_train_enc.values instead of X_train_enc to avoid error
cat_dims = [ len(set(X_train_enc.values[:, i])) for i in cat_idxs] 
print(cat_dims)

[6, 7, 8, 25, 31, 39, 40]
[9, 5, 13, 8, 7, 3, 3]


In [115]:
# calculate the categorical features indexes and their dimension from X_train
cat_idxs = [ i for i, f in enumerate(X_train.columns) if f in cat_cols]
print(cat_idxs)
# count unique values in each categorical feature
cat_dims = [ len(set(X_train.values[:, i])) for i in cat_idxs]
print(cat_dims)

[6, 7, 8, 25, 31, 39, 40]
[9, 5, 13, 8, 7, 3, 3]


In [None]:
# check if X_train_enc, y_train_enc, X_test_enc, y_test_enc have NaN values in one line
print(f"X_train_enc has NaN values? {np.isnan(X_train_enc.values).any()}\ny_train_enc has NaN values? {np.isnan(y_train_enc).any()}")
print(f"X_test_enc  has NaN values? {np.isnan(X_test_enc.values).any()}\ny_test_enc  has NaN values? {np.isnan(y_test_enc).any()}")

X_train_enc has NaN values? False
y_train_enc has NaN values? False
X_test_enc  has NaN values? False
y_test_enc  has NaN values? False


In [None]:
# check if X_train_enc, y_train_enc, X_test_enc, y_test_enc have any categorical values
print(f"X_train_enc has categorical values? {X_train_enc.select_dtypes(include=['object']).any().any()}")
print(f"X_test_enc  has categorical values? {X_test_enc.select_dtypes(include=['object']).any().any()}")

X_train_enc has categorical values? False
X_test_enc  has categorical values? False


In [112]:
# print the datatypes from the categorical columns
print(f"X_train_enc categorical columns datatypes:\n{X_train_enc[cat_cols].dtypes}")


X_train_enc categorical columns datatypes:
http.request.version    int64
mqtt.protoname          int64
mqtt.conack.flags       int64
http.referer            int64
http.request.method     int64
dns.qry.name.len        int64
mqtt.topic              int64
dtype: object


In [None]:
# Print the number of unique values for each categorical column
print("Number of unique values in each categorical column:")
for idx in cat_idxs:
    print(f"{X_train_enc.columns[idx]}: {len(X_train_enc.iloc[:, idx].unique())}")

Number of unique values in each categorical column:
http.request.method: 9
http.referer: 5
http.request.version: 13
dns.qry.name.len: 8
mqtt.conack.flags: 7
mqtt.protoname: 3
mqtt.topic: 3


#### Save model

In [None]:
saved_filename = tabnet.save_model(f'checkpoints/tabnet/tabnet_{AUGMENTATION}')

Successfully saved model at checkpoints/tabnet/tabnet_SMOTE-NC.zip


### Model Evaluation

In [None]:
predictions = tabnet.predict(X_test_enc.values)

#### Metrics

In [None]:
accuracy = metrics.accuracy_score(y_test_enc, predictions)
precision_w = metrics.precision_score(y_test_enc, predictions, average='weighted', zero_division=1)
recall_w = metrics.recall_score(y_test_enc, predictions, average='weighted')
f1_score_w = metrics.f1_score(y_test_enc, predictions, average='weighted')
precision_m = metrics.precision_score(y_test_enc, predictions, average='macro', zero_division=1)
recall_m = metrics.recall_score(y_test_enc, predictions, average='macro')
f1_score_m = metrics.f1_score(y_test_enc, predictions, average='macro')

In [None]:
# Create dictionary for results
results = {
    "model": "TabNet",
    "augmentations": AUGMENTATION,
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": accuracy,
    "precision_macro": precision_m,
    "recall_macro": recall_m,
    "f1_macro": f1_score_m,
    "precision_weighted": precision_w,
    "recall_weighted": recall_w,
    "f1_weighted": f1_score_w
    }

utils.print_results_table(results)

╒══════════════════════╤═════════╕
│ Metric               │ Value   │
╞══════════════════════╪═════════╡
│ Accuracy             │ 90.85%  │
├──────────────────────┼─────────┤
│ Precision (macro)    │ 81.50%  │
├──────────────────────┼─────────┤
│ Recall (macro)       │ 63.00%  │
├──────────────────────┼─────────┤
│ F1 (macro)           │ 60.37%  │
├──────────────────────┼─────────┤
│ Precision (weighted) │ 94.02%  │
├──────────────────────┼─────────┤
│ Recall (weighted)    │ 90.85%  │
├──────────────────────┼─────────┤
│ F1 (weighted)        │ 90.04%  │
╘══════════════════════╧═════════╛


#### Save Metrics Results 

In [None]:
# save results to csv   
utils.save_results_to_csv([results], '../results/metrics/tabnet.csv')

#### Confusion matrix

In [None]:
conf_mat = metrics.confusion_matrix(y_test_enc, predictions)

attack_labels = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP', 
'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware', 
'SQL_injection', 'Uploading', 'Vulnerability_scanner', 'XSS']

# Create a dataframe from the confusion matrix
conf_mat_df = pd.DataFrame(conf_mat, 
                           index = attack_labels, 
                           columns = attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix
conf_mat_df.to_csv(f"../results/conf_matrix/{results['model']}_{results['augmentations']}.csv")
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,4453,0,1,169,0,0,0,0,0,86,73,0,0,0,0
DDoS_HTTP,0,3290,0,0,0,0,0,0,0,0,0,0,0,6334,4
DDoS_ICMP,0,0,13501,0,0,0,0,0,0,0,0,0,0,0,0
DDoS_TCP,0,0,0,5816,0,0,0,0,0,4193,0,0,0,0,0
DDoS_UDP,0,0,4129,0,20472,0,0,0,0,0,0,0,0,0,0
Fingerprinting,0,0,99,24,0,0,0,0,0,23,0,0,0,0,0
MITM,0,0,0,0,0,0,76,0,0,0,0,0,0,0,0
Normal,0,0,0,0,0,0,7,272743,0,0,0,0,26,0,0
Password,0,0,0,0,0,0,0,0,9884,0,0,224,0,0,0
Port_Scanning,0,0,0,2035,0,0,0,0,0,2027,0,0,0,0,0
