### Libraries

In [31]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os
import pandas as pd
import datetime
import numpy as np
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)        

'machine learning imports'
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

### GPU

In [32]:
# Ask PyTorch once whether a CUDA device is visible and derive the label from it.
cuda_available = torch.cuda.is_available()
DEVICE = 'CUDA' if cuda_available else 'CPU'
print("Using {}".format(DEVICE))

# On GPU only: report total and currently allocated device memory.
if cuda_available:
    BYTES_PER_GIB = 1024**3  # divisor used for the "GB" figures printed below
    gpu = torch.device('cuda')
    total_memory = torch.cuda.get_device_properties(gpu).total_memory / BYTES_PER_GIB
    current_memory = torch.cuda.memory_allocated(gpu) / BYTES_PER_GIB

    print(f'Total GPU memory: {total_memory:.1f} GB | Current usage: {current_memory:.1f} GB')

Using CPU


### Load data

In [33]:
# Which (augmented) training set to load.
# Options: 'None', 'GReaT', 'SMOTE', 'SMOTE-NC' or 'RealTabFormer'
# NOTE(review): 'GReaT' spelling taken from the checkpoint name written by an
# earlier run of this notebook — confirm against utils.load_dataset.
AUGMENTATION = 'RealTabFormer'

data_dir = os.path.abspath('../data')

# Columns passed to the loader as `ignore_columns`.
ignored_columns = ['mbtcp.unit_id', 'mbtcp.trans_id']

# Load the train and test datasets
df_train, df_test = utils.load_dataset(
    data_directory=data_dir,
    augmentation=AUGMENTATION,
    ignore_columns=ignored_columns,
)

Loading complete.
Train data: 1500000 rows, 46 columns. 
Test data: 381934 rows, 46 columns.


### Data preparation

In [34]:
# Both target columns are removed from the feature matrices; only
# 'Attack_type' (multi-class) is kept as the prediction target.
target_columns = ['Attack_label', 'Attack_type']

# Training split
X_train = df_train.drop(columns=target_columns)
y_train = df_train['Attack_type']

# Test split
X_test = df_test.drop(columns=target_columns)
y_test = df_test['Attack_type']

#### Encode categorical features (label encoding)

In [35]:
# Label-encode the categorical *feature* columns of both splits (the targets
# are encoded separately). `info` is the helper's metadata object —
# presumably a dict exposing 'categorical_columns'; confirm in utils.
X_train_enc, X_test_enc, info = utils.encode_categorical(
    X_train,
    X_test,
    encoding='label',
)

Categorical features to be encoded:

mqtt.conack.flags
http.request.method
dns.qry.name.len
http.referer
http.request.version
mqtt.topic
mqtt.protoname

Encoding complete.
No of features before encoding: 44
No of features after encoding: 44


#### Label encoding

In [36]:
# Encode the Attack_type targets of both splits as integer class ids; the
# name -> id mapping is printed in this cell's output.
# `le` is presumably the fitted encoder needed to invert the mapping —
# confirm in utils.encode_labels.
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model Training

In [37]:
# Baseline model: TabNet built with the library's default constructor arguments.
tabnet = TabNetClassifier()

# Upper bound on the number of training epochs.
MAX_EPOCHS = 100

# Train on the NumPy array behind the encoded frame. No eval_set is passed,
# so the log shows the training loss only.
tabnet.fit(
    X_train=X_train_enc.values,
    y_train=y_train_enc,
    max_epochs=MAX_EPOCHS,
    augmentations=None,
)



epoch 0  | loss: 0.67829 |  0:01:04s
epoch 1  | loss: 0.56153 |  0:02:07s
epoch 2  | loss: 0.53293 |  0:03:11s
epoch 3  | loss: 0.48137 |  0:04:16s
epoch 4  | loss: 0.46756 |  0:05:22s
epoch 5  | loss: 0.4614  |  0:06:26s
epoch 6  | loss: 0.45605 |  0:07:29s
epoch 7  | loss: 0.45245 |  0:08:33s
epoch 8  | loss: 0.44822 |  0:09:38s
epoch 9  | loss: 0.44803 |  0:10:42s
epoch 10 | loss: 0.44524 |  0:11:45s
epoch 11 | loss: 0.44361 |  0:12:49s
epoch 12 | loss: 0.44244 |  0:13:53s
epoch 13 | loss: 0.44187 |  0:14:58s
epoch 14 | loss: 0.44064 |  0:16:01s
epoch 15 | loss: 0.44201 |  0:17:05s
epoch 16 | loss: 0.43827 |  0:18:09s
epoch 17 | loss: 0.43906 |  0:19:12s
epoch 18 | loss: 0.43927 |  0:20:16s
epoch 19 | loss: 0.43854 |  0:21:20s
epoch 20 | loss: 0.43878 |  0:22:23s
epoch 21 | loss: 0.43729 |  0:23:27s
epoch 22 | loss: 0.43593 |  0:24:31s
epoch 23 | loss: 0.43584 |  0:25:36s
epoch 24 | loss: 0.43632 |  0:26:40s
epoch 25 | loss: 0.43319 |  0:27:44s
epoch 26 | loss: 0.43361 |  0:28:48s
e

In [None]:
# # Shuffle training data
# # X_train_enc, y_train_enc = shuffle(X_train_enc, y_train_enc, random_state=42)

# if AUGMENTATION == 'SMOTE'or AUGMENTATION == 'SMOTE-NC':
#     # pytorch_tabnet default parameters
#     tabnet = TabNetClassifier()
    
#     tabnet.fit(X_train=X_train_enc.values, 
#                y_train=y_train_enc,
#                augmentations=None,
#                max_epochs=100,
#                )
# else: # AUGMENTATION == 'None', 'RealTabFormer', 'GReaT'


#     # retrieve the categorical features indexes and their dimension
#     cat_cols = info['categorical_columns']

#     cat_idxs = [X_train_enc.columns.get_loc(col) for col in cat_cols]
#     cat_dims = [X_train_enc[col].nunique() for col in cat_cols]
    
#     # cat_idxs = [X_train_enc.columns.get_loc(col) for col in cat_cols]
    
#     # # retrieve unique values of the categorical columns in X_train_enc
#     # cat_dims = [len(X_train_enc[col].unique()) for col in cat_cols]
#     print(f"cat_cols: {cat_cols}")
#     print(f"cat_idxs: {cat_idxs}")
#     print(f"cat_dims: {cat_dims}")
#     # cat_emb_dim = [min(50, (x + 1) // 2) for x in cat_dims]
#     # Check if the indices are within the range of your dataset
#     max_index = X_train_enc.shape[1] - 1  # Assuming X_train_enc is your training dataset
#     for idx in cat_idxs:
#         if idx > max_index:
#             print(f"Index {idx} is out of range for the dataset.")

#     # Ensure that the dimensions align with the indices
#     if len(cat_idxs) != len(cat_dims):
#         print("The length of cat_idxs and cat_dims should be the same.")

#     # If everything is correct, initialize the TabNet model
#     tabnet = TabNetClassifier(cat_idxs=cat_idxs,
#                             cat_dims=cat_dims,
#                             cat_emb_dim=[min(50, (dim + 1) // 2) for dim in cat_dims],
#                             )
     
#     # tabnet = TabNetClassifier(cat_idxs=cat_idxs,
#     #                           cat_dims=cat_dims,
#     #                           cat_emb_dim=cat_emb_dim,    # categorical features embedding dimension
#     #                           )
#     tabnet.fit(X_train=X_train_enc.values, 
#                y_train=y_train_enc,
#                augmentations=None,
#                max_epochs=100,
#                )

In [None]:
# import pandas as pd

# # Assuming X_train_enc is your training dataset
# X_train_unique_values = X_train_enc.iloc[:, cat_idxs].nunique()

# # Check if unique values match specified dimensions
# for idx, dim, unique_values in zip(cat_idxs, cat_dims, X_train_unique_values):
#     print(f"Index: {idx} | Dimension: {dim} | Unique values: {unique_values}")

# # Check if the indices are within the range of your dataset
# max_index = X_train_enc.shape[1] - 1
# for idx in cat_idxs:
#     if idx > max_index:
#         print(f"Index {idx} is out of range for the dataset.")

# # Ensure that the dimensions align with the indices
# if len(cat_idxs) != len(cat_dims):
#     print("The length of cat_idxs and cat_dims should be the same.")


In [None]:
# # find columns indexes for categorical columns in X_train
# cat_cols = info['categorical_columns']
# cat_cols
# # retrieve indexes of the categorical columns in X_train_enc
# cat_idxs = [X_train_enc.columns.get_loc(col) for col in cat_cols]
# # print column idx, name and number of unique values
# for col in cat_cols:
#     print(f"Index: {X_train_enc.columns.get_loc(col)} | Column: {col} | Unique values: {X_train_enc[col].nunique()}")


In [None]:
# Sanity check: report whether any of the encoded feature/label arrays contains NaN.
train_features_nan = np.isnan(X_train_enc.values).any()
train_labels_nan = np.isnan(y_train_enc).any()
print(f"X_train_enc has NaN values? {train_features_nan}\ny_train_enc has NaN values? {train_labels_nan}")

test_features_nan = np.isnan(X_test_enc.values).any()
test_labels_nan = np.isnan(y_test_enc).any()
print(f"X_test_enc  has NaN values? {test_features_nan}\ny_test_enc  has NaN values? {test_labels_nan}")

X_train_enc has NaN values? False
y_train_enc has NaN values? False
X_test_enc  has NaN values? False
y_test_enc  has NaN values? False


In [None]:
# Check whether any non-numeric (object / category dtype) feature columns
# remain in X_train_enc / X_test_enc after encoding.
# The test is on the *presence of such columns*, not on their values:
# `.any().any()` on the selected frame asks whether any value is truthy, so an
# object column holding only empty strings or zeros would be reported as False.
non_numeric_dtypes = ['object', 'category']
train_non_numeric_cols = X_train_enc.select_dtypes(include=non_numeric_dtypes).columns
test_non_numeric_cols = X_test_enc.select_dtypes(include=non_numeric_dtypes).columns

print(f"X_train_enc has categorical values? {len(train_non_numeric_cols) > 0}")
print(f"X_test_enc  has categorical values? {len(test_non_numeric_cols) > 0}")

X_train_enc has categorical values? False
X_test_enc  has categorical values? False


In [None]:
# # print the datatypes from the categorical columns
# print(f"X_train_enc categorical columns datatypes:\n{X_train_enc[cat_cols].dtypes}")


#### Save model

In [None]:
# Persist the trained model under a name that records the augmentation used.
# The library adds the '.zip' suffix itself (see the message it prints).
checkpoint_stem = f'checkpoints/tabnet/tabnet_{AUGMENTATION}'
saved_filename = tabnet.save_model(checkpoint_stem)

Successfully saved model at checkpoints/tabnet/tabnet_GReaT.zip


### Model Evaluation

In [None]:
# Predict an encoded class id for every test row (compared against y_test_enc
# in the metrics below); the model is fed the NumPy array behind the frame.
predictions = tabnet.predict(X_test_enc.values)

#### Metrics

In [None]:
# One zero-division policy for every averaged metric.
# A class that is never predicted has undefined precision (0/0). Scoring it as
# 0 keeps precision consistent with recall/F1 and avoids inflating the macro
# and weighted precision; the previous `zero_division=1` (precision only)
# counted each such class as a perfect 1.0.
ZERO_DIVISION = 0
weighted_avg = dict(average='weighted', zero_division=ZERO_DIVISION)
macro_avg = dict(average='macro', zero_division=ZERO_DIVISION)

accuracy = metrics.accuracy_score(y_test_enc, predictions)

# Support-weighted averages over the classes.
precision_w = metrics.precision_score(y_test_enc, predictions, **weighted_avg)
recall_w = metrics.recall_score(y_test_enc, predictions, **weighted_avg)
f1_score_w = metrics.f1_score(y_test_enc, predictions, **weighted_avg)

# Unweighted (macro) averages: every class counts equally.
precision_m = metrics.precision_score(y_test_enc, predictions, **macro_avg)
recall_m = metrics.recall_score(y_test_enc, predictions, **macro_avg)
f1_score_m = metrics.f1_score(y_test_enc, predictions, **macro_avg)

In [None]:
# Time at which this results record is assembled.
run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# One flat record per run: identification fields first, then the scores.
results = dict(
    model="TabNet",
    augmentations=AUGMENTATION,
    timestamp=run_timestamp,
    accuracy=accuracy,
    precision_macro=precision_m,
    recall_macro=recall_m,
    f1_macro=f1_score_m,
    precision_weighted=precision_w,
    recall_weighted=recall_w,
    f1_weighted=f1_score_w,
)

utils.print_results_table(results)

╒══════════════════════╤═════════╕
│ Metric               │ Value   │
╞══════════════════════╪═════════╡
│ Accuracy             │ 88.33%  │
├──────────────────────┼─────────┤
│ Precision (macro)    │ 58.06%  │
├──────────────────────┼─────────┤
│ Recall (macro)       │ 55.17%  │
├──────────────────────┼─────────┤
│ F1 (macro)           │ 48.08%  │
├──────────────────────┼─────────┤
│ Precision (weighted) │ 93.70%  │
├──────────────────────┼─────────┤
│ Recall (weighted)    │ 88.33%  │
├──────────────────────┼─────────┤
│ F1 (weighted)        │ 89.41%  │
╘══════════════════════╧═════════╛


#### Save Metrics Results 

In [None]:
# Hand this run's record (wrapped in a list, as the helper is called with a
# list of records) to the CSV writer for the TabNet metrics file.
metrics_csv_path = '../results/metrics/tabnet.csv'
utils.save_results_to_csv([results], metrics_csv_path)

#### Confusion matrix

In [None]:
# Class names in encoded-label order: position i is the name of encoded label i
# (same order as the mapping printed by the label-encoding cell).
attack_labels = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP',
'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware',
'SQL_injection', 'Uploading', 'Vulnerability_scanner', 'XSS']

# Pin the label set so the matrix is always len(attack_labels) x len(attack_labels).
# Without `labels`, scikit-learn sizes the matrix from the values it sees, so a
# class missing from both y_test_enc and predictions would shrink it and the
# DataFrame construction below would fail on a shape mismatch.
conf_mat = metrics.confusion_matrix(y_test_enc,
                                    predictions,
                                    labels=list(range(len(attack_labels))))

# Create a dataframe from the confusion matrix (rows: actual, columns: predicted)
conf_mat_df = pd.DataFrame(conf_mat,
                           index = attack_labels,
                           columns = attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix; create the target directory first because
# to_csv does not create missing parent directories.
conf_mat_path = f"../results/conf_matrix/{results['model']}_{results['augmentations']}.csv"
os.makedirs(os.path.dirname(conf_mat_path), exist_ok=True)
conf_mat_df.to_csv(conf_mat_path)
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,1595,0,0,0,0,0,2160,0,0,329,698,0,0,0,0
DDoS_HTTP,0,3278,0,0,0,0,0,0,0,2499,0,0,0,16,3835
DDoS_ICMP,0,0,7566,0,0,5607,138,0,0,190,0,0,0,0,0
DDoS_TCP,0,0,0,5816,0,0,0,0,0,4193,0,0,0,0,0
DDoS_UDP,0,0,0,0,24346,0,48,0,0,207,0,0,0,0,0
Fingerprinting,1,0,0,0,0,62,9,0,0,74,0,0,0,0,0
MITM,0,33,0,1,9,0,29,0,0,4,0,0,0,0,0
Normal,7,0,0,0,0,0,39,272730,0,0,0,0,0,0,0
Password,0,0,0,0,0,0,0,0,3058,893,0,2583,3574,0,0
Port_Scanning,0,0,0,0,0,0,0,0,0,4062,0,0,0,0,0
