### Libraries

In [20]:
'generic imports'
import os
import pandas as pd
import sys
from datetime import datetime
sys.path.append(os.path.abspath('..'))
from src import utils

'machine learning imports'
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn import metrics

### GPU

In [3]:
# Train on the GPU when torch reports CUDA support; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using {DEVICE}")

Using cpu


### Load data

In [4]:
data_dir = os.path.abspath('../data')

# Training CSV for each augmentation strategy. Selecting the dataset through
# this lookup keeps AUGMENTATION (stored with the results) and the file that
# is actually read from drifting apart.
TRAIN_FILES = {
    'None': 'EdgeIIot_train_100k.csv',                        # non-augmented dataset
    'SMOTE': 'EdgeIIot_train_100k_SMOTE.csv',                 # SMOTE augmented dataset
    'SMOTE-NC': 'EdgeIIot_train_100k_SMOTE_NC.csv',           # SMOTE-NC augmented dataset
    'RealTabFormer': 'EdgeIIot_train_100k_RealTabFormer.csv', # RealTabFormer augmented dataset
    'GReaT': 'EdgeIIot_train_100k_GReaT.csv',                 # GReaT augmented dataset
}

# Change this single value to switch the training set (must be a key of TRAIN_FILES).
AUGMENTATION = 'None'

df_train = pd.read_csv(os.path.join(data_dir, TRAIN_FILES[AUGMENTATION]), low_memory=False)

# Test data for all datasets
df_test = pd.read_csv(os.path.join(data_dir, 'EdgeIIot_test.csv'), low_memory=False)

### Data preparation

In [None]:
# drop columns mbtcp.unit_id and mbtcp.trans_id from train and test data
dropped_cols = ['mbtcp.unit_id', 'mbtcp.trans_id']
df_train = df_train.drop(columns=dropped_cols)
df_test = df_test.drop(columns=dropped_cols)

# every column except the two label columns is a model feature
label_cols = {"Attack_label", "Attack_type"}
features = [col for col in df_train.columns if col not in label_cols]

# resolve the object-dtype (categorical) columns once instead of once per feature
object_cols = set(df_train.select_dtypes(include="object").columns)

# index of categorical features within `features`
cat_idxs = [i for i, f in enumerate(features) if f in object_cols]

# Integer-encode each categorical feature and record its cardinality.
# TabNet looks categorical values up in embedding tables, so they must be
# integer codes in [0, cat_dim). The vocabulary is built from train and test
# together so that every test value has a valid code.
cat_dims = []
for idx in cat_idxs:
    col = features[idx]
    train_values = df_train[col].astype(str)
    test_values = df_test[col].astype(str)
    categories = sorted(set(train_values) | set(test_values))
    df_train[col] = pd.Categorical(train_values, categories=categories).codes
    df_test[col] = pd.Categorical(test_values, categories=categories).codes
    cat_dims.append(len(categories))

# converts X_train and y_train to numpy arrays
X_train = df_train[features].values
y_train = df_train["Attack_type"].values

# converts X_test and y_test to numpy arrays
X_test = df_test[features].values
y_test = df_test["Attack_type"].values

# size of X_train, y_train, X_test, y_test
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

### Model Training

In [7]:
# embedding dimension for each categorical column
cat_emb_dim = 10 

# initialize embedder 
cat_embedder = TabNetPretrainer(cat_dims, cat_emb_dim, cat_idxs)

# instantiate TabNetClassifier model
tabnet = TabNetClassifier(device_name = DEVICE)



In [None]:
tabnet.fit(X_train=X_train, y_train=y_train,
           augmentations=None,
           max_epochs=100, 
           patience=10,
           batch_size=1024,
           virtual_batch_size=128,
           cat_emb_dim=cat_emb_dim,
           cat_idxs=cat_idxs,
           cat_dims=cat_dims,
           pretraining_ratio=0.5,
           pretrainer=cat_embedder,
           )

#### Save model

In [None]:
# Write the trained model to disk; the value returned by save_model is kept in saved_filename.
checkpoint_path = 'checkpoints/tabnet'
saved_filename = tabnet.save_model(checkpoint_path)

### Model Evaluation

In [None]:
# Predict on the held-out test features; these predictions feed the metric
# and confusion-matrix cells.
predictions = tabnet.predict(X_test)

#### Metrics

In [None]:
# Calculate and print a small board with accuracy, precision, recall, F1-score
# and AUC (without using classification_report).

# Attack_type is multiclass, so roc_auc_score needs one probability column per
# class plus an explicit multi_class strategy; hard label predictions are
# rejected. `labels` pins the column order to the classes seen during training.
probabilities = tabnet.predict_proba(X_test)
auc = metrics.roc_auc_score(
    y_test,
    probabilities,
    multi_class='ovr',
    average='weighted',
    labels=tabnet.classes_,
)

print("Model Evaluation Metrics")
print("~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Accuracy: {}".format(metrics.accuracy_score(y_test, predictions)))
print("Precision: {}".format(metrics.precision_score(y_test, predictions, average='weighted')))
print("Recall: {}".format(metrics.recall_score(y_test, predictions, average='weighted')))
print("F1: {}".format(metrics.f1_score(y_test, predictions, average='weighted')))
print("AUC: {}".format(auc))
print("~~~~~~~~~~~~~~~~~~~~~~~~~")
print("\n")

#### Confusion matrix

In [None]:
# Derive the class labels from the data, in sorted order, and pass them to
# confusion_matrix explicitly so the matrix rows/columns and the DataFrame
# labels are guaranteed to line up. (A hand-written list in order of
# appearance -- 'DDoS_UDP', 'Password', 'DDoS_TCP', ... -- would not match
# the sorted order scikit-learn uses by default and would mislabel the table.)
class_names = sorted(set(y_test) | set(predictions))
conf_mat = metrics.confusion_matrix(y_test, predictions, labels=class_names)

# rows are the true classes, columns the predicted classes
conf_mat_df = pd.DataFrame(conf_mat, index=class_names, columns=class_names)
conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'
print(conf_mat_df)

### Save Results

In [None]:
# Multiclass AUC needs per-class probabilities (one column per class, ordered
# like tabnet.classes_) and an explicit multi_class strategy; passing the hard
# label predictions makes roc_auc_score raise.
probabilities = tabnet.predict_proba(X_test)

# create dictionary for results
results = {
    "model": "TabNet",
    "augmentations": AUGMENTATION,
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": metrics.accuracy_score(y_test, predictions),
    "precision": metrics.precision_score(y_test, predictions, average='weighted'),
    "recall": metrics.recall_score(y_test, predictions, average='weighted'),
    "f1": metrics.f1_score(y_test, predictions, average='weighted'),
    "auc": metrics.roc_auc_score(
        y_test,
        probabilities,
        multi_class='ovr',
        average='weighted',
        labels=tabnet.classes_,
    ),
}

# save results to csv
utils.save_results([results], 'results/TabNet.csv')