### Libraries

In [50]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os
import pandas as pd
import datetime
import numpy as np
import sys
sys.path.append(os.path.abspath('..'))
from src import utils
import importlib
importlib.reload(utils)        

'machine learning imports'
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

### GPU

In [51]:
DEVICE = 'CUDA' if torch.cuda.is_available() else 'CPU'
print("Using {}".format(DEVICE))

# Info on the device available memory
if DEVICE == 'CUDA':
    gpu = torch.device('cuda')
    total_memory = torch.cuda.get_device_properties(gpu).total_memory / 1024**3
    current_memory = torch.cuda.memory_allocated(gpu) / 1024**3

    print(f'Total GPU memory: {total_memory:.1f} GB | Current usage: {current_memory:.1f} GB')

Using CPU


### Load data

In [52]:
AUGMENTATION = 'None'

data_dir = os.path.abspath('../data')

# Load the train and test datasets
df_train, df_test = utils.load_dataset(data_directory=data_dir, 
                                       augmentation=AUGMENTATION, 
                                       ignore_columns=['mbtcp.unit_id', 
                                                       'mbtcp.trans_id']) 

Loading complete.
Train data: 536515 rows, 46 columns. 
Test data: 381934 rows, 46 columns.


### Data preparation

In [53]:
# Creates X_train, y_train
X_train = df_train.drop(['Attack_label', 'Attack_type'], axis=1)
y_train = df_train['Attack_type']

# Creates X_test, y_test
X_test = df_test.drop(['Attack_label', 'Attack_type'], axis=1)
y_test = df_test['Attack_type']

#### Convert categorical features to one-hot encoded features

In [54]:
# Encode the training and test labels if needed
X_train_enc, X_test_enc, cat_cols, cat_dims = utils.encode_categorical(X_train, X_test, encoding='label')

Categorical features to be encoded:

dns.qry.name.len
mqtt.conack.flags
http.request.version
http.request.method
mqtt.topic
http.referer
mqtt.protoname

Encoding complete.
No of features before encoding: 44
No of features after encoding: 44


In [55]:
X_train_enc.columns

Index(['arp.opcode', 'arp.hw.size', 'icmp.checksum', 'icmp.seq_le',
       'icmp.unused', 'http.content_length', 'http.request.method',
       'http.referer', 'http.request.version', 'http.response',
       'http.tls_port', 'tcp.ack', 'tcp.ack_raw', 'tcp.checksum',
       'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn',
       'tcp.connection.synack', 'tcp.flags', 'tcp.flags.ack', 'tcp.len',
       'tcp.seq', 'udp.stream', 'udp.time_delta', 'dns.qry.name',
       'dns.qry.name.len', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission',
       'dns.retransmit_request', 'dns.retransmit_request_in',
       'mqtt.conack.flags', 'mqtt.conflag.cleansess', 'mqtt.conflags',
       'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mqtt.msgtype',
       'mqtt.proto_len', 'mqtt.protoname', 'mqtt.topic', 'mqtt.topic_len',
       'mqtt.ver', 'mbtcp.len'],
      dtype='object')

In [58]:
# valid = False

# if valid == True:
#     def get_unique_values(df, columns):
#         unique_values = df[columns].apply(lambda x: ''.join(x.astype(str)), axis=1).unique()
#         unique_values.sort()
#         return unique_values

#     http_referer_columns = [col for col in X_train.columns if 'http.referer' in col]
#     http_request_version_columns = [col for col in X_train.columns if 'http.request.version' in col]
#     dns_qry_name_len_columns = [col for col in X_train.columns if 'dns.qry.name.len' in col]
#     mqtt_conack_flags_columns = [col for col in X_train.columns if 'mqtt.conack.flags' in col]
#     mqtt_protoname_columns = [col for col in X_train.columns if 'mqtt.protoname' in col]
#     http_request_method_columns = [col for col in X_train.columns if 'http.request.method' in col]

#     unique_values_http_referer = get_unique_values(X_train, http_referer_columns)
#     unique_values_http_request_version = get_unique_values(X_train, http_request_version_columns)
#     unique_values_dns_qry_name_len = get_unique_values(X_train, dns_qry_name_len_columns)
#     unique_values_mqtt_conack_flags = get_unique_values(X_train, mqtt_conack_flags_columns)
#     unique_values_mqtt_protoname = get_unique_values(X_train, mqtt_protoname_columns)

#     # Apply the same function to X_test
#     unique_values_http_referer_test = get_unique_values(X_test, http_referer_columns)
#     unique_values_http_request_version_test = get_unique_values(X_test, http_request_version_columns)
#     unique_values_dns_qry_name_len_test = get_unique_values(X_test, dns_qry_name_len_columns)
#     unique_values_mqtt_conack_flags_test = get_unique_values(X_test, mqtt_conack_flags_columns)
#     unique_values_mqtt_protoname_test = get_unique_values(X_test, mqtt_protoname_columns)
    
#     # Join the unique values from both train and test data and print them
#     unique_values_http_referer = np.unique(np.concatenate((unique_values_http_referer, unique_values_http_referer_test)))
#     unique_values_http_request_version = np.unique(np.concatenate((unique_values_http_request_version, unique_values_http_request_version_test)))
#     unique_values_dns_qry_name_len = np.unique(np.concatenate((unique_values_dns_qry_name_len, unique_values_dns_qry_name_len_test)))
#     unique_values_mqtt_conack_flags = np.unique(np.concatenate((unique_values_mqtt_conack_flags, unique_values_mqtt_conack_flags_test)))
#     unique_values_mqtt_protoname = np.unique(np.concatenate((unique_values_mqtt_protoname, unique_values_mqtt_protoname_test)))

#     print(f'Unique values for http.referer: \n{unique_values_http_referer}\n')
#     print(f'Unique values for http.request.version: \n{unique_values_http_request_version}\n')
#     print(f'Unique values for dns.qry.name.len: \n{unique_values_dns_qry_name_len}\n')
#     print(f'Unique values for mqtt.conack.flags: \n{unique_values_mqtt_conack_flags}\n')
#     print(f'Unique values for mqtt.protoname: \n{unique_values_mqtt_protoname}\n')
    
#     # check if X_train and X_test have categorical features
#     print(f'X_train categorical features: {X_train.select_dtypes(include="object").columns}')
#     print(f'X_test categorical features: {X_test.select_dtypes(include="object").columns}')

In [59]:
# # AQUI Categorical columns in df_train
# categorical_columns = [f for f in features if f in df_train.select_dtypes(include="object").columns]

# # Concatenate X_train and X_test
# X_comb = pd.concat([X_train[categorical_columns], X_test[categorical_columns]], axis=0)

# # Apply one-hot encoding (get_dummies)
# X_comb_enc = pd.get_dummies(X_comb, dtype='int8')

# # Split back into X_train and X_test
# X_train_enc, X_test_enc = train_test_split(
#     X_comb_enc, test_size=len(X_test), random_state=42)

# # Print the shape of X_train_enc and X_test_enc
# print(f'X_train_enc shape: {X_train_enc.shape}, X_test_enc shape: {X_test_enc.shape}')


# # converts X_train and y_train to numpy arrays
# X_train = df_train[features]
# y_train = df_train["Attack_type"]

# # converts X_test and y_test to numpy arrays
# X_test = df_test[features]
# y_test = df_test["Attack_type"]

# # size of X_train, y_train  X_test, y_test
# print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
# print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')
# print(cat_idxs)

#### Label encoding

In [60]:
y_train_enc, y_test_enc, le = utils.encode_labels(y_train, y_test)

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model Training

In [61]:
# Shuffle training data
X_train, y_train = shuffle(X_train, y_train, random_state=42)

if AUGMENTATION == 'SMOTE'or AUGMENTATION == 'SMOTE-NC':
    # pytorch_tabnet default parameters
    tabnet = TabNetClassifier()
    
    tabnet.fit(X_train=X_train_enc.values, 
               y_train=y_train_enc,
               augmentations=None,
               max_epochs=100,
               )
else: # AUGMENTATION == 'None', 'RealTabFormer', 'GReaT'

    cat_idxs = [ i for i, f in enumerate(X_train_enc.columns) if f in cat_cols]
    cat_dims = [ cat_dims[f] for i, f in enumerate(X_train_enc.columns) if f in cat_dims]

    tabnet = TabNetClassifier(cat_idxs=cat_idxs,
                              cat_dims=cat_dims,
                              cat_emb_dim=10,    # categorical features embedding dimension
                              )
    tabnet.fit(X_train=X_train_enc.values, 
               y_train=y_train_enc,
               augmentations=None,
               max_epochs=100,
               )



epoch 0  | loss: 0.56117 |  0:00:24s
epoch 1  | loss: 0.40527 |  0:00:48s
epoch 2  | loss: 0.38657 |  0:01:13s
epoch 3  | loss: 0.3736  |  0:01:38s
epoch 4  | loss: 0.36308 |  0:02:05s
epoch 5  | loss: 0.33899 |  0:02:30s
epoch 6  | loss: 0.32266 |  0:02:56s
epoch 7  | loss: 0.32742 |  0:03:21s
epoch 8  | loss: 0.32761 |  0:03:47s
epoch 9  | loss: 0.3219  |  0:04:12s
epoch 10 | loss: 0.3194  |  0:04:36s
epoch 11 | loss: 0.31666 |  0:05:02s
epoch 12 | loss: 0.31499 |  0:05:26s
epoch 13 | loss: 0.31924 |  0:05:50s
epoch 14 | loss: 0.33445 |  0:06:14s
epoch 15 | loss: 0.31655 |  0:06:38s
epoch 16 | loss: 0.31622 |  0:07:01s
epoch 17 | loss: 0.31345 |  0:07:25s
epoch 18 | loss: 0.3128  |  0:07:49s
epoch 19 | loss: 0.31225 |  0:08:13s
epoch 20 | loss: 0.31113 |  0:08:37s
epoch 21 | loss: 0.31141 |  0:09:00s
epoch 22 | loss: 0.30943 |  0:09:24s
epoch 23 | loss: 0.32371 |  0:09:47s
epoch 24 | loss: 0.3125  |  0:10:11s
epoch 25 | loss: 0.31088 |  0:10:35s
epoch 26 | loss: 0.31137 |  0:10:58s
e

#### Save model

In [66]:
saved_filename = tabnet.save_model(f'checkpoints/tabnet/tabnet_{AUGMENTATION}')

Successfully saved model at checkpoints/tabnet/tabnet_None.zip


### Model Evaluation

In [67]:
predictions = tabnet.predict(X_test_enc.values)

#### Metrics

In [68]:
accuracy = metrics.accuracy_score(y_test_enc, predictions)
precision_w = metrics.precision_score(y_test_enc, predictions, average='weighted', zero_division=1)
recall_w = metrics.recall_score(y_test_enc, predictions, average='weighted')
f1_score_w = metrics.f1_score(y_test_enc, predictions, average='weighted')
precision_m = metrics.precision_score(y_test_enc, predictions, average='macro', zero_division=1)
recall_m = metrics.recall_score(y_test_enc, predictions, average='macro')
f1_score_m = metrics.f1_score(y_test_enc, predictions, average='macro')

In [69]:
# Create dictionary for results
results = {
    "model": "Decision Tree",
    "augmentations": AUGMENTATION,
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": accuracy,
    "precision_macro": precision_m,
    "recall_macro": recall_m,
    "f1_macro": f1_score_m,
    "precision_weighted": precision_w,
    "recall_weighted": recall_w,
    "f1_weighted": f1_score_w
    }

utils.print_results_table(results)

╒══════════════════════╤═════════╕
│ Metric               │ Value   │
╞══════════════════════╪═════════╡
│ Accuracy             │ 88.76%  │
├──────────────────────┼─────────┤
│ Precision (macro)    │ 80.70%  │
├──────────────────────┼─────────┤
│ Recall (macro)       │ 66.16%  │
├──────────────────────┼─────────┤
│ F1 (macro)           │ 63.71%  │
├──────────────────────┼─────────┤
│ Precision (weighted) │ 94.96%  │
├──────────────────────┼─────────┤
│ Recall (weighted)    │ 88.76%  │
├──────────────────────┼─────────┤
│ F1 (weighted)        │ 88.20%  │
╘══════════════════════╧═════════╛


#### Save Metrics Results 

In [70]:
# save results to csv   
utils.save_results_to_csv([results], '../results/metrics/tabnet.csv')

#### Confusion matrix

In [72]:
conf_mat = metrics.confusion_matrix(y_test_enc, predictions)

attack_labels = ['Backdoor', 'DDoS_HTTP', 'DDoS_ICMP', 'DDoS_TCP', 'DDoS_UDP', 
'Fingerprinting', 'MITM', 'Normal', 'Password', 'Port_Scanning', 'Ransomware', 
'SQL_injection', 'Uploading', 'Vulnerability_scanner', 'XSS']

# Create a dataframe from the confusion matrix
conf_mat_df = pd.DataFrame(conf_mat, 
                           index = attack_labels, 
                           columns = attack_labels)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'

# Save the confusion matrix
conf_mat_df.to_csv(f"../results/conf_matrix/{results['model']}_{results['augmentations']}.csv")
conf_mat_df

Predicted,Backdoor,DDoS_HTTP,DDoS_ICMP,DDoS_TCP,DDoS_UDP,Fingerprinting,MITM,Normal,Password,Port_Scanning,Ransomware,SQL_injection,Uploading,Vulnerability_scanner,XSS
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Backdoor,4527,0,0,83,0,0,0,0,0,172,0,0,0,0,0
DDoS_HTTP,0,8992,0,0,0,0,0,0,0,0,0,0,0,143,493
DDoS_ICMP,0,0,13224,0,0,0,0,0,0,277,0,0,0,0,0
DDoS_TCP,0,0,0,10009,0,0,0,0,0,0,0,0,0,0,0
DDoS_UDP,0,0,0,0,2285,0,0,0,0,22316,0,0,0,0,0
Fingerprinting,0,0,0,23,0,0,0,0,0,122,0,0,1,0,0
MITM,1,0,0,0,0,0,67,0,0,0,4,0,4,0,0
Normal,0,0,0,1,0,0,0,272768,0,0,0,0,6,0,1
Password,0,0,0,0,0,0,0,0,1306,0,0,7394,1169,239,0
Port_Scanning,0,0,0,2027,0,0,0,0,0,2035,0,0,0,0,0
