### Libraries

In [5]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os
import pandas as pd
import datetime
import numpy as np
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)        

'machine learning imports'
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

### GPU

In [6]:
# Report which backend PyTorch can use on this machine.
if torch.cuda.is_available():
    DEVICE = 'CUDA'
else:
    DEVICE = 'CPU'
print(f"Using {DEVICE}")

# On CUDA machines, also show the total and the currently allocated device memory.
if DEVICE == 'CUDA':
    gpu = torch.device('cuda')
    bytes_per_gb = 1024**3  # byte counts are scaled by 1024**3 before printing
    total_memory = torch.cuda.get_device_properties(gpu).total_memory / bytes_per_gb
    current_memory = torch.cuda.memory_allocated(gpu) / bytes_per_gb

    print(f'Total GPU memory: {total_memory:.1f} GB | Current usage: {current_memory:.1f} GB')

Using CPU


### Load data

In [7]:
# Augmentation strategy for this run. Besides being passed to the loader, it
# selects the training branch and is used in the saved artifact names below.
AUGMENTATION = 'SMOTE'

data_dir = os.path.abspath('../data')

# Columns handed to the loader as `ignore_columns`
# (presumably dropped from both datasets — confirm in src/utils).
ignored_columns = ['mbtcp.unit_id', 'mbtcp.trans_id']

# Load the train and test datasets
df_train, df_test = utils.load_dataset(
    data_directory=data_dir,
    augmentation=AUGMENTATION,
    ignore_columns=ignored_columns,
)

Loading complete.
Train data: 1500000 rows, 85 columns. 
Test data: 381934 rows, 85 columns.


### Data preparation

In [8]:
# Both label columns are removed from the feature matrices;
# 'Attack_type' is the column used as the prediction target.
label_columns = ['Attack_label', 'Attack_type']

# Training features / target
X_train = df_train.drop(columns=label_columns)
y_train = df_train['Attack_type']

# Test features / target
X_test = df_test.drop(columns=label_columns)
y_test = df_test['Attack_type']

#### (Not necessary) Convert categorical features to one-hot encoded features

In [9]:
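# One-hot encode the categorical features of the training and test sets, if needed
# X_train_enc, X_test_enc = utils.encode_categorical(X_train, X_test)

# TabNet does not need one-hot encoded features: categorical columns are described through `cat_idxs` / `cat_dims` in the training cell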

No categorical features found. Returning original datasets.


In [11]:
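# Disabled sanity check: lists the unique row-wise value combinations of selected column groups across train and test.
# valid = False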

# if valid == True:
#     def get_unique_values(df, columns):
#         unique_values = df[columns].apply(lambda x: ''.join(x.astype(str)), axis=1).unique()
#         unique_values.sort()
#         return unique_values

#     http_referer_columns = [col for col in X_train.columns if 'http.referer' in col]
#     http_request_version_columns = [col for col in X_train.columns if 'http.request.version' in col]
#     dns_qry_name_len_columns = [col for col in X_train.columns if 'dns.qry.name.len' in col]
#     mqtt_conack_flags_columns = [col for col in X_train.columns if 'mqtt.conack.flags' in col]
#     mqtt_protoname_columns = [col for col in X_train.columns if 'mqtt.protoname' in col]
#     http_request_method_columns = [col for col in X_train.columns if 'http.request.method' in col]

#     unique_values_http_referer = get_unique_values(X_train, http_referer_columns)
#     unique_values_http_request_version = get_unique_values(X_train, http_request_version_columns)
#     unique_values_dns_qry_name_len = get_unique_values(X_train, dns_qry_name_len_columns)
#     unique_values_mqtt_conack_flags = get_unique_values(X_train, mqtt_conack_flags_columns)
#     unique_values_mqtt_protoname = get_unique_values(X_train, mqtt_protoname_columns)

#     # Apply the same function to X_test
#     unique_values_http_referer_test = get_unique_values(X_test, http_referer_columns)
#     unique_values_http_request_version_test = get_unique_values(X_test, http_request_version_columns)
#     unique_values_dns_qry_name_len_test = get_unique_values(X_test, dns_qry_name_len_columns)
#     unique_values_mqtt_conack_flags_test = get_unique_values(X_test, mqtt_conack_flags_columns)
#     unique_values_mqtt_protoname_test = get_unique_values(X_test, mqtt_protoname_columns)
    
#     # Join the unique values from both train and test data and print them
#     unique_values_http_referer = np.unique(np.concatenate((unique_values_http_referer, unique_values_http_referer_test)))
#     unique_values_http_request_version = np.unique(np.concatenate((unique_values_http_request_version, unique_values_http_request_version_test)))
#     unique_values_dns_qry_name_len = np.unique(np.concatenate((unique_values_dns_qry_name_len, unique_values_dns_qry_name_len_test)))
#     unique_values_mqtt_conack_flags = np.unique(np.concatenate((unique_values_mqtt_conack_flags, unique_values_mqtt_conack_flags_test)))
#     unique_values_mqtt_protoname = np.unique(np.concatenate((unique_values_mqtt_protoname, unique_values_mqtt_protoname_test)))

#     print(f'Unique values for http.referer: \n{unique_values_http_referer}\n')
#     print(f'Unique values for http.request.version: \n{unique_values_http_request_version}\n')
#     print(f'Unique values for dns.qry.name.len: \n{unique_values_dns_qry_name_len}\n')
#     print(f'Unique values for mqtt.conack.flags: \n{unique_values_mqtt_conack_flags}\n')
#     print(f'Unique values for mqtt.protoname: \n{unique_values_mqtt_protoname}\n')
    
#     # check if X_train and X_test have categorical features
#     print(f'X_train categorical features: {X_train.select_dtypes(include="object").columns}')
#     print(f'X_test categorical features: {X_test.select_dtypes(include="object").columns}')

In [21]:
# # NOTE (resume here): categorical columns in df_train
# categorical_columns = [f for f in features if f in df_train.select_dtypes(include="object").columns]

# # Concatenate X_train and X_test
# X_comb = pd.concat([X_train[categorical_columns], X_test[categorical_columns]], axis=0)

# # Apply one-hot encoding (get_dummies)
# X_comb_enc = pd.get_dummies(X_comb, dtype='int8')

# # Split back into X_train and X_test
# X_train_enc, X_test_enc = train_test_split(
#     X_comb_enc, test_size=len(X_test), random_state=42)

# # Print the shape of X_train_enc and X_test_enc
# print(f'X_train_enc shape: {X_train_enc.shape}, X_test_enc shape: {X_test_enc.shape}')


# # selects the feature columns (X_train) and the target (y_train) from df_train
# X_train = df_train[features]
# y_train = df_train["Attack_type"]

# # selects the feature columns (X_test) and the target (y_test) from df_test
# X_test = df_test[features]
# y_test = df_test["Attack_type"]

# # size of X_train, y_train  X_test, y_test
# print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
# print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')
# print(cat_idxs)

#### Label encoding

In [10]:
# Encode the attack-type labels of the train and test targets.
# NOTE(review): `encode_labels` lives in src/utils and is not visible here; it
# presumably returns the encoded train/test targets plus the fitted encoder (`le`) — confirm.
# NOTE(review): the training and metric cells in this notebook use the raw
# `y_train` / `y_test`, not `y_train_enc` / `y_test_enc`.
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model Training

In [None]:
# Reference only: pytorch_tabnet constructor and fit() signatures with their default values
# TabModel(n_d: int = 8,
#         n_a: int = 8, 
#         n_steps: int = 3, 
#         gamma: float = 1.3, 
#         cat_idxs: List[int] = <factory>, 
#         cat_dims: List[int] = <factory>, 
#         cat_emb_dim: int = 1, 
#         n_independent: int = 2, 
#         n_shared: int = 2, 
#         epsilon: float = 1e-15, 
#         momentum: float = 0.02, 
#         lambda_sparse: float = 0.001, 
#         seed: int = 0, 
#         clip_value: int = 1, 
#         verbose: int = 1, 
#         optimizer_fn: Any = <class 'torch.optim.adam.Adam'>, 
#         optimizer_params: Dict = <factory>, 
#         scheduler_fn: Any = None, 
#         scheduler_params: Dict = <factory>, 
#         mask_type: str = 'sparsemax', 
#         input_dim: int = None, 
#         output_dim: int = None, 
#         device_name: str = 'auto', 
#         n_shared_decoder: int = 1, 
#         n_indep_decoder: int = 1, 
#         grouped_features: List[List[int]] = <factory>
#         )

# fit(X_train, 
#     y_train, 
#     eval_set=None, 
#     eval_name=None, 
#     eval_metric=None, 
#     loss_fn=None, 
#     weights=0, 
#     max_epochs=100, 
#     patience=10, 
#     batch_size=1024, 
#     virtual_batch_size=128, 
#     num_workers=0, 
#     drop_last=True, 
#     callbacks=None, 
#     pin_memory=True, 
#     from_unsupervised=None, 
#     warm_start=False, 
#     augmentations=None, 
#     compute_importance=True)

In [13]:
# Constructor arguments for TabNet; left empty to use the pytorch_tabnet defaults
# (the SMOTE / SMOTE-NC case).
tabnet_params = {}

if AUGMENTATION not in ('SMOTE', 'SMOTE-NC'):
    # AUGMENTATION == 'None', 'RealTabFormer', GReaT
    # Object-dtype columns are treated as categorical and described to TabNet
    # by their position and number of distinct values.
    # NOTE(review): X_train.values is passed to fit() as-is; confirm that these
    # columns are already integer-coded in a form TabNet accepts.
    categorical_columns = X_train.select_dtypes(include="object").columns

    # indices of the categorical features
    cat_idxs = [X_train.columns.get_loc(col) for col in categorical_columns]
    # number of unique values in each categorical feature
    cat_dims = [len(X_train[col].unique()) for col in categorical_columns]

    tabnet_params = {
        'cat_idxs': cat_idxs,
        'cat_dims': cat_dims,
        'cat_emb_dim': 10,    # categorical features embedding dimension
    }

tabnet = TabNetClassifier(**tabnet_params)

# Same fit call for every augmentation: no eval_set, no augmentations, up to 100 epochs.
tabnet.fit(X_train=X_train.values,
           y_train=y_train,
           max_epochs=100,
           )



epoch 0  | loss: 0.55689 |  0:02:18s
epoch 1  | loss: 0.47827 |  0:04:43s
epoch 2  | loss: 0.46003 |  0:07:05s
epoch 3  | loss: 0.45115 |  0:09:35s
epoch 4  | loss: 0.43671 |  0:11:59s


#### Save model

In [None]:
# `save_model` appends the ".zip" extension itself and returns the final path,
# so the extension must not be part of the argument (otherwise the archive is
# written as "tabnet_<AUGMENTATION>.zip.zip").
saved_filename = tabnet.save_model(f'checkpoints/tabnet_{AUGMENTATION}')

Successfully saved model at checkpoints/tabnet_none.zip


### Model Evaluation

In [None]:
# Predict on the raw test features, mirroring the `X_train.values` used for training.
# `X_test_enc` is never defined in this notebook (the encoding cell is commented
# out), so referencing it raised a NameError on a fresh kernel.
predictions = tabnet.predict(X_test.values)

#### Metrics

In [None]:
def _averaged_scores(average):
    """Return (precision, recall, F1) on the test set for one averaging mode.

    Precision is computed with ``zero_division=1``; recall and F1 keep
    scikit-learn's default handling of undefined scores.
    """
    precision = metrics.precision_score(y_test, predictions, average=average, zero_division=1)
    recall = metrics.recall_score(y_test, predictions, average=average)
    f1 = metrics.f1_score(y_test, predictions, average=average)
    return precision, recall, f1


accuracy = metrics.accuracy_score(y_test, predictions)
# Weighted averages first, then macro averages
precision_w, recall_w, f1_score_w = _averaged_scores('weighted')
precision_m, recall_m, f1_score_m = _averaged_scores('macro')

Model Evaluation Metrics
~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: 0.83
Precision (macro): 0.88
Recall (macro): 0.30
F1 (macro): 0.22
~~~~~~~~~~~~~~~~~~~~~~~~~


In [None]:
# Collect the evaluation metrics of this run in a single record.
# The "model" label is also used in the confusion-matrix file name, so it must
# name the model actually trained in this notebook (TabNet, not "Decision Tree").
results = {
    "model": "TabNet",
    "augmentations": AUGMENTATION,
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": accuracy,
    "precision_macro": precision_m,
    "recall_macro": recall_m,
    "f1_macro": f1_score_m,
    "precision_weighted": precision_w,
    "recall_weighted": recall_w,
    "f1_weighted": f1_score_w
    }

utils.print_results_table(results)

#### Save Metrics Results 

In [None]:
# Write this run's metrics record (wrapped in a one-element list) to the TabNet metrics CSV
metrics_csv_path = '../results/metrics/tabnet.csv'
utils.save_results_to_csv([results], metrics_csv_path)

#### Confusion matrix

In [None]:
# Class names, in the order used for both axes of the confusion matrix
attack_labels = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP',
'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware',
'SQL_injection', 'Uploading', 'Vulnerability_scanner', 'XSS']

# Passing `labels` pins the matrix to one row/column per known class, in this
# order, even when a class is absent from both y_test and the predictions.
# Without it the matrix can be smaller than `attack_labels`, and building the
# labelled DataFrame below would fail.
conf_mat = metrics.confusion_matrix(y_test, predictions, labels=attack_labels)

# Create a dataframe from the confusion matrix
conf_mat_df = pd.DataFrame(conf_mat,
                           index=attack_labels,
                           columns=attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix, creating the output folder if it does not exist yet
conf_matrix_dir = '../results/conf_matrix'
os.makedirs(conf_matrix_dir, exist_ok=True)
conf_mat_df.to_csv(f"{conf_matrix_dir}/{results['model']}_{results['augmentations']}.csv")
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,0,0,0,0,4782,0,0,0,0,0,0,0,0,0,0
DDoS_HTTP,0,0,0,0,0,0,0,0,0,0,0,0,0,9628,0
DDoS_ICMP,0,0,0,0,13501,0,0,0,0,0,0,0,0,0,0
DDoS_TCP,0,0,0,0,10009,0,0,0,0,0,0,0,0,0,0
DDoS_UDP,0,0,0,0,24601,0,0,0,0,0,0,0,0,0,0
Fingerprinting,0,0,0,0,146,0,0,0,0,0,0,0,0,0,0
MITM,0,0,0,0,0,0,33,0,0,0,0,0,0,43,0
Normal,0,0,0,0,0,0,0,272775,0,0,0,0,0,1,0
Password,0,0,0,0,0,0,0,0,0,0,0,10108,0,0,0
Port_Scanning,0,0,0,0,4062,0,0,0,0,0,0,0,0,0,0
