### Libraries

In [69]:
%%capture
%reset -f                        # clear all variables from the workspace
'generic imports'
import os
import pandas as pd
import sys
sys.path.append(os.path.abspath('..'))
from src import utils   
import datetime        

'machine learning imports'
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### GPU

In [70]:
# Select the compute device once: CUDA when torch reports it as available, CPU otherwise.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {DEVICE}")

Using cpu


### Load data

In [71]:
data_dir = os.path.abspath('../data')

# Training CSV for every supported augmentation strategy.
# The dataset is chosen through AUGMENTATION alone, so the label stored with
# the results can never disagree with the file that was actually loaded
# (which was possible when lines had to be commented in and out by hand).
TRAIN_FILES = {
    'None': 'EdgeIIot_train_100k.csv',                            # non-augmented dataset
    'SMOTE': 'EdgeIIot_train_100k_SMOTE.csv',                     # SMOTE augmented dataset
    'SMOTE-NC': 'EdgeIIot_train_100k_SMOTE_NC.csv',               # SMOTE-NC augmented dataset
    'RealTabFormer': 'EdgeIIot_train_100k_RealTabFormer.csv',     # RealTabFormer augmented dataset
    'GReaT': 'EdgeIIot_train_100k_GReaT.csv',                     # GReaT augmented dataset
}

# Change only this value to switch the training set.
AUGMENTATION = 'None'

if AUGMENTATION not in TRAIN_FILES:
    raise ValueError(
        f"Unknown AUGMENTATION {AUGMENTATION!r}; expected one of {sorted(TRAIN_FILES)}"
    )

df_train = pd.read_csv(os.path.join(data_dir, TRAIN_FILES[AUGMENTATION]), low_memory=False)

# Test data is shared by all training variants
df_test = pd.read_csv(os.path.join(data_dir, 'EdgeIIot_test.csv'), low_memory=False)

### Data preparation

In [72]:
# Drop mbtcp.unit_id and mbtcp.trans_id from train and test data.
# errors='ignore' keeps the cell re-runnable: df_train/df_test are reassigned
# here, so a second execution would otherwise raise KeyError on the columns
# that were already removed.
df_train = df_train.drop(columns=['mbtcp.unit_id', 'mbtcp.trans_id'], errors='ignore')
df_test = df_test.drop(columns=['mbtcp.unit_id', 'mbtcp.trans_id'], errors='ignore')

# Features / target for training
X_train = df_train.drop(columns=['Attack_label', 'Attack_type'])
y_train = df_train['Attack_type']

# Features / target for testing
X_test = df_test.drop(columns=['Attack_label', 'Attack_type'])
y_test = df_test['Attack_type']

# Categorical features are the columns pandas loaded with object dtype
categorical_features = X_train.select_dtypes(include="object").columns

# Positions of the categorical features within X_train's column order
cat_idxs = [X_train.columns.get_loc(col) for col in categorical_features]

# Number of distinct values per categorical column, counted over train AND test
# (NaN counts as a value, as it did with len(unique())). Counting on train only
# would under-size the dimension for any category that appears only in the test
# set; the one-hot step of this notebook pools both splits for the same reason.
cat_dims = [
    pd.concat([X_train[col], X_test[col]], axis=0).nunique(dropna=False)
    for col in categorical_features
]

#### One-hot encoding categorical features

In [77]:
# Number of training rows; used below to cut the stacked frame back into its two parts
n_train_rows = X_train.shape[0]

# Stack the categorical columns of both splits (train first, test second) so the
# dummy columns cover every value that occurs in either split
X_comb = pd.concat(
    [X_train[categorical_features], X_test[categorical_features]],
    axis=0,
)

# One dummy column per (feature, value) pair
X_comb_enc = pd.get_dummies(X_comb, columns=categorical_features)

# Rows keep their stacking order, so a positional split recovers train and test
X_train_enc = X_comb_enc.iloc[:n_train_rows]
X_test_enc = X_comb_enc.iloc[n_train_rows:]

# NOTE(review): X_train_enc / X_test_enc contain ONLY the one-hot encoded
# categorical columns. The non-categorical columns of X_train / X_test are not
# merged back in (the merge step was disabled), so anything fitted on these
# frames does not see them - confirm this is intended.

# Report the resulting shapes
print(f'X_train_enc shape: {X_train_enc.shape}, X_test_enc shape: {X_test_enc.shape}')

X_train_enc shape: (536515, 53), X_test_enc shape: (381934, 53)


In [None]:
# # AQUI Categorical columns in df_train
# categorical_columns = [f for f in features if f in df_train.select_dtypes(include="object").columns]

# # Concatenate X_train and X_test
# X_comb = pd.concat([X_train[categorical_columns], X_test[categorical_columns]], axis=0)

# # Apply one-hot encoding (get_dummies)
# X_comb_enc = pd.get_dummies(X_comb, dtype='int8')

# # Split back into X_train and X_test
# X_train_enc, X_test_enc = train_test_split(
#     X_comb_enc, test_size=len(X_test), random_state=42)

# # Print the shape of X_train_enc and X_test_enc
# print(f'X_train_enc shape: {X_train_enc.shape}, X_test_enc shape: {X_test_enc.shape}')


# # converts X_train and y_train to numpy arrays
# X_train = df_train[features]
# y_train = df_train["Attack_type"]

# # converts X_test and y_test to numpy arrays
# X_test = df_test[features]
# y_test = df_test["Attack_type"]

# # size of X_train, y_train  X_test, y_test
# print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
# print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')
# print(cat_idxs)

#### Label encoding

In [78]:
# Encode the Attack_type strings as integer class ids.
# The encoder reads the labels from df_train / df_test rather than from
# y_train / y_test: this cell overwrites y_train / y_test with the encoded
# arrays, so fitting on them would make a second execution fit on integers
# (and the ':23s' format below would then fail).
le = LabelEncoder()

# fit on the training labels and encode them
y_train = le.fit_transform(df_train['Attack_type'])

# encode the test labels with the mapping learned from training;
# raises ValueError if the test set contains a label unseen in training
y_test = le.transform(df_test['Attack_type'])

# Show the label -> id mapping (id is the position in le.classes_)
print('Attack_type and encoded labels:\n')
for i, label in enumerate(le.classes_):
    print(f'{label:23s} {i:d}')

Attack_type and encoded labels:

Backdoor                0
DDoS_HTTP               1
DDoS_ICMP               2
DDoS_TCP                3
DDoS_UDP                4
Fingerprinting          5
MITM                    6
Normal                  7
Password                8
Port_Scanning           9
Ransomware              10
SQL_injection           11
Uploading               12
Vulnerability_scanner   13
XSS                     14


### Model Training

In [57]:
# # embedding dimension for each categorical column
# cat_emb_dim = 10 

# # initialize embedder 
# cat_embedder = TabNetPretrainer(cat_dims, cat_emb_dim, cat_idxs)

# # instantiate TabNetClassifier model
# tabnet = TabNetClassifier(device_name = DEVICE,
#                           cat_idxs=cat_idxs,
#                           cat_dims=cat_dims,
#                           cat_emb_dim=cat_emb_dim)

In [68]:
# Display the positions of the categorical columns within X_train's column order
# (cat_idxs is computed in the data-preparation cell; nothing is extracted here)
cat_idxs

[6, 7, 8, 25, 31, 39, 40]

In [79]:
# TabNet classifier for the one-hot encoded design matrix.
#
# cat_idxs / cat_dims / cat_emb_dim are deliberately NOT passed: those values
# describe column positions and cardinalities of the raw X_train frame, while
# the model is fitted on X_train_enc, whose columns are 0/1 dummies with a
# different layout. Passing them would make TabNet embed unrelated dummy
# columns as if they were integer-coded categoricals.
# Re-introduce the three arguments only if the model is fed a matrix in which
# the categorical columns are integer-coded and sit at positions cat_idxs.
tabnet = TabNetClassifier(
    n_d=64, n_a=64, n_steps=5,
    gamma=1.5, n_independent=2, n_shared=2,
    lambda_sparse=1e-4, momentum=0.3, clip_value=2.,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"gamma": 0.95, "step_size": 20},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, epsilon=1e-15,
    device_name=DEVICE,  # use the device reported in the GPU cell
)



In [81]:
# Hold out a stratified 10% of the training rows for validation.
# Without an eval_set TabNet has no metric to monitor, so `patience` is ignored
# and all max_epochs are always run; the validation split makes early stopping
# effective. The split is seeded so repeated runs use the same rows.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X_train_enc.values, y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

tabnet.fit(
    X_train=X_fit, y_train=y_fit,
    eval_set=[(X_valid, y_valid)],
    eval_name=['valid'],
    eval_metric=['accuracy'],   # metric monitored for early stopping
    augmentations=None,
    max_epochs=100,
    patience=10,                # stop after 10 epochs without improvement on 'valid'
    batch_size=1024,
    virtual_batch_size=128,
)



In [45]:
# NOTE: It is not necessary to drop the categorical columns from X_train and X_test if the indices of the categorical columns are identified and the parameter cat_idxs=cat_idxs is passed to the TabNetClassifier model.
# TODO: Review the other notebooks!

array([4, 8, 4, ..., 7, 7, 7])

#### Save model

In [None]:
# Persist the fitted model; save_model returns the name of the file it wrote
checkpoint_stem = 'checkpoints/tabnet'
saved_filename = tabnet.save_model(checkpoint_stem)

### Model Evaluation

In [None]:
# Predict on the same representation the model was fitted on (the one-hot
# matrix, as a NumPy array). The raw X_test frame still holds string columns
# and has a different column layout, so it cannot be passed to the model.
predictions = tabnet.predict(X_test_enc.values)

#### Metrics

In [None]:
# Calculate and print accuracy, precision, recall, F1-score and AUC
# (class-weighted averages) without using classification_report.

# Multiclass AUC is computed from per-class probabilities, not from hard
# labels: roc_auc_score rejects multiclass targets unless multi_class is set
# and y_score has one column per class.
probabilities = tabnet.predict_proba(X_test_enc.values)

print("Model Evaluation Metrics")
print("~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Accuracy: {}".format(metrics.accuracy_score(y_test, predictions)))
print("Precision: {}".format(metrics.precision_score(y_test, predictions, average='weighted')))
print("Recall: {}".format(metrics.recall_score(y_test, predictions, average='weighted')))
print("F1: {}".format(metrics.f1_score(y_test, predictions, average='weighted')))
print("AUC: {}".format(metrics.roc_auc_score(y_test, probabilities, multi_class='ovr', average='weighted')))
print("~~~~~~~~~~~~~~~~~~~~~~~~~")
print("\n")

#### Confusion matrix

In [None]:
# Integer ids of all classes, in the encoder's own order (0 .. n_classes-1)
class_ids = le.transform(le.classes_)

# Passing `labels` fixes the matrix to one row/column per known class, in
# encoder order, even if a class is missing from y_test or predictions.
conf_mat = metrics.confusion_matrix(y_test, predictions, labels=class_ids)

# Label rows/columns with the class names taken from the encoder, so the names
# always match the integer ids (a hand-written list can silently be in a
# different order than LabelEncoder's sorted classes).
conf_mat_df = pd.DataFrame(conf_mat, index=le.classes_, columns=le.classes_)

conf_mat_df.index.name = 'Actual'
conf_mat_df.columns.name = 'Predicted'
print(conf_mat_df)

### Save Results

In [None]:
# Per-class probabilities for the AUC: multiclass roc_auc_score needs scores
# with one column per class and an explicit multi_class strategy.
test_probabilities = tabnet.predict_proba(X_test_enc.values)

# create dictionary for results
results = {
    "model": "TabNet",
    "augmentations": AUGMENTATION,
    # `datetime` is imported as a module in this notebook, so the class must be
    # reached as datetime.datetime (datetime.now() raises AttributeError)
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "accuracy": metrics.accuracy_score(y_test, predictions),
    "precision": metrics.precision_score(y_test, predictions, average='weighted'),
    "recall": metrics.recall_score(y_test, predictions, average='weighted'),
    "f1": metrics.f1_score(y_test, predictions, average='weighted'),
    "auc": metrics.roc_auc_score(y_test, test_probabilities, multi_class='ovr', average='weighted'),
}

# save results to csv
utils.save_results([results], 'results/TabNet.csv')