In [None]:
!pip install pytorch-tabnet

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
from copy import deepcopy

# Reproducibility: a single seed is pushed into NumPy, PyTorch and Python's
# own random module.
SEED = 42
for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    seed_fn(SEED)

# cuDNN settings: benchmark mode off, deterministic mode on.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Two capture CSVs, given as relative paths (resolved against the current
# working directory).
FRIDAY_PATH = "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
WEDNESDAY_PATH = "Wednesday-workingHours.pcap_ISCX.csv"

df_fri, df_wed = (
    pd.read_csv(csv_path) for csv_path in (FRIDAY_PATH, WEDNESDAY_PATH)
)

# Stack both frames row-wise and renumber the index from zero.
data = pd.concat([df_fri, df_wed], ignore_index=True)
print(f"Combined shape: {data.shape}")

# Data preprocessing
# Header names may carry stray whitespace; normalise them before any lookup.
data.columns = data.columns.str.strip()
if 'Label' not in data.columns:
    raise ValueError("Expected 'Label' column not found.")

print("Unique labels before binary encoding:")
print(data['Label'].unique())

# Binary target: 0 for the exact label "BENIGN", 1 for everything else.
data['Label'] = (
    data['Label'].astype(str).str.strip() != "BENIGN"
).astype("int64")

print("\nBinary label distribution (0=BENIGN, 1=ATTACK):")
print(data['Label'].value_counts())

# Force every column to numeric; unparsable cells and +/-inf become NaN.
data = data.apply(pd.to_numeric, errors='coerce')
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# A column holding no numeric value at all (e.g. an identifier or timestamp
# stored as text) is entirely NaN after coercion. Left in place it would make
# the row-wise dropna() below discard every row, so remove such columns first.
all_nan_cols = [c for c in data.columns if c != 'Label' and data[c].isna().all()]
if all_nan_cols:
    print("Dropping columns with no numeric values:", all_nan_cols)
    data.drop(columns=all_nan_cols, inplace=True)

before = data.shape[0]
data.dropna(inplace=True)
after = data.shape[0]

print(f"\nDropped {before - after} rows due to NaN/inf values.")
print(f"Remaining rows: {after}")

# Fail here with a clear message rather than later inside the splitter.
if after == 0:
    raise ValueError("No rows left after removing NaN/inf values.")

# Constant features carry no information for the classifier.
zero_var_cols = [c for c in data.columns if c != 'Label' and data[c].nunique() <= 1]
if zero_var_cols:
    print("Dropping zero-variance columns:", zero_var_cols)
    data.drop(columns=zero_var_cols, inplace=True)

feature_names = [c for c in data.columns if c != 'Label']
X = data[feature_names].values
y = data['Label'].values

print(f"\nFinal feature count: {len(feature_names)}")
print(f"Final dataset size  : {data.shape[0]} rows")

# Model training: stratified train / validation / test split.
# Stage 1 holds out 30% of the rows; stage 2 cuts that hold-out in half,
# giving a validation part and a test part of equal size.
split_seed = {"random_state": SEED}

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, **split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, **split_seed
)

print(f"Shapes -> Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# Small hyperparameter search
# Constructor arguments shared by every TabNetClassifier in the search; the
# grid entries below override a subset of them per configuration.
base_params = dict(
    n_d=16,
    n_a=16,
    n_steps=5,
    gamma=1.5,
    n_independent=2,
    n_shared=2,
    momentum=0.02,
    mask_type="entmax",
    lambda_sparse=1e-4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-3},
    scheduler_params=dict(step_size=20, gamma=0.95),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    verbose=0,
)

# Every candidate starts from these values and changes at most one of them.
grid_defaults = {
    "n_d": 16,
    "n_a": 16,
    "n_steps": 5,
    "gamma": 1.5,
    "lambda_sparse": 1e-4,
    "lr": 2e-3,
    "mask_type": "entmax",
}

# (configuration name, values that differ from grid_defaults)
grid_overrides = [
    ("baseline_literature", {}),
    ("shallower_faster", {"n_steps": 3}),
    ("more_sparse_regularized", {"lambda_sparse": 1e-3}),
    ("sparsemax_masks", {"mask_type": "sparsemax"}),
    ("higher_lr_quick_convergence", {"lr": 3e-3}),
]

param_grid = [
    {"name": cfg_name, **grid_defaults, **cfg_changes}
    for cfg_name, cfg_changes in grid_overrides
]

# Running record of the best configuration seen so far.
best_val_acc = -1.0
best_name = None
best_params = None
best_model = None

# Grid keys that are copied unchanged into the classifier's constructor kwargs.
grid_keys = ("n_d", "n_a", "n_steps", "gamma", "lambda_sparse", "mask_type")

for i, pg in enumerate(param_grid, 1):
    print(f"\n=== Config {i}/{len(param_grid)}: {pg['name']} ===")

    # Start from the shared defaults, then apply this configuration's values.
    params = deepcopy(base_params)
    params.update({key: pg[key] for key in grid_keys})
    params["optimizer_params"] = dict(lr=pg["lr"])

    clf_tmp = TabNetClassifier(**params)

    clf_tmp.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_name=["train", "valid"],
        eval_metric=["accuracy"],
        max_epochs=80,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=0,
        drop_last=False,
    )

    # Score the configuration by its highest per-epoch validation accuracy.
    valid_acc_hist = clf_tmp.history["valid_accuracy"]
    best_epoch = int(np.argmax(valid_acc_hist))
    val_acc = valid_acc_hist[best_epoch]

    print(f"Best val accuracy for {pg['name']}: {val_acc:.6f} at epoch {best_epoch}")

    # Only a strict improvement replaces the current best (ties keep the
    # earlier configuration).
    if not val_acc > best_val_acc:
        continue
    best_val_acc, best_name = val_acc, pg["name"]
    best_params, best_model = params, clf_tmp

print("\n=== Best configuration selected ===")
print("Name :", best_name)
print("Params:", best_params)
print(f"Best validation accuracy: {best_val_acc:.6f}")

clf = best_model

# Evaluation of the selected model on the test split
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
# Precision / recall / F1 all use zero_division=0.
prec, rec, f1 = (
    score_fn(y_test, y_pred, zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# False alarm rate; defined as 0.0 when there are no actual negatives.
if (fp + tn) > 0:
    far = fp / (fp + tn)
else:
    far = 0.0

print("\n=== TabNet – Test Metrics (Binary: 0=BENIGN, 1=ATTACK) ===")
print(f"Accuracy : {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"FAR      : {far:.6f}  (False Alarm Rate = FP / (FP + TN))")

# Confusion-matrix heatmap with integer cell annotations.
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("TabNet – Confusion Matrix (Binary)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

# Global feature importance, one value per entry of feature_names,
# ordered from most to least important.
global_importance = clf.feature_importances_
feat_imp_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": global_importance}
)
feat_imp_df = feat_imp_df.sort_values(by="Importance", ascending=False)

print("\nTop 10 global features:")
print(feat_imp_df.head(10))

# Bar chart of the 15 highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(data=feat_imp_df.head(15), x="Importance", y="Feature")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
# Stratified 3-fold cross-validation of the selected configuration.
# Folds are drawn from the train + validation rows only; X_test / y_test are
# not touched in this cell.
X_cv = np.concatenate([X_train, X_val])
y_cv = np.concatenate([y_train, y_val])

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

cv_results = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X_cv, y_cv), start=1):
    X_tr, X_te = X_cv[train_idx], X_cv[test_idx]
    y_tr, y_te = y_cv[train_idx], y_cv[test_idx]

    # Carve an inner validation set (20%) out of the fold's training part so
    # early stopping never sees the fold's held-out rows.
    X_tr2, X_va, y_tr2, y_va = train_test_split(
        X_tr, y_tr,
        test_size=0.2,
        random_state=SEED,
        stratify=y_tr
    )

    print(f"\n--- Fold {fold} ---")

    clf_fold = TabNetClassifier(**best_params)

    clf_fold.fit(
        X_train=X_tr2, y_train=y_tr2,
        eval_set=[(X_tr2, y_tr2), (X_va, y_va)],
        eval_name=["train", "valid"],
        eval_metric=["accuracy"],
        max_epochs=100,
        patience=10,
        batch_size=1024,
        virtual_batch_size=128,
        num_workers=0,
        drop_last=False,
    )

    y_pred_fold = clf_fold.predict(X_te)

    acc  = accuracy_score(y_te, y_pred_fold)
    prec = precision_score(y_te, y_pred_fold, zero_division=0)
    rec  = recall_score(y_te, y_pred_fold, zero_division=0)
    f1   = f1_score(y_te, y_pred_fold, zero_division=0)

    tn, fp, fn, tp = confusion_matrix(y_te, y_pred_fold).ravel()
    # False alarm rate; 0.0 when the fold has no actual negatives.
    far = fp / (fp + tn) if (fp + tn) > 0 else 0.0

    print(
        f"Fold {fold} – "
        f"Acc:{acc:.4f}, Prec:{prec:.4f}, Rec:{rec:.4f}, "
        f"F1:{f1:.4f}, FAR:{far:.6f}"
    )

    cv_results.append([acc, prec, rec, f1, far])

# Rows = folds, columns = metrics in the order listed below.
cv_results = np.array(cv_results)
metrics = ["Accuracy", "Precision", "Recall", "F1-score", "FAR"]

print("\n=== Cross-Validation Summary (mean ± std) ===")
for m, fold_values in zip(metrics, cv_results.T):
    mean_val = fold_values.mean()
    std_val = fold_values.std()
    # Four decimals for both numbers, matching the per-fold report above.
    print(f"{m:<10}: {mean_val:.4f} ± {std_val:.4f}")

In [None]:
# Local + class-conditional explanations
# clf.explain returns the per-sample attribution matrix and the masks.
M_explain_test, masks = clf.explain(X_test)

print("M_explain_test shape:", np.array(M_explain_test).shape)
print("Number of steps in masks:", len(masks))
print("One mask shape:", np.array(masks[0]).shape if len(masks) else None)

# --- Local explanations ---
y_pred = clf.predict(X_test)

# Boolean views of predictions and ground truth, combined into outcome groups.
pred_attack, pred_benign = (y_pred == 1), (y_pred == 0)
true_attack, true_benign = (y_test == 1), (y_test == 0)

tp_indices = np.where(pred_attack & true_attack)[0]
fp_indices = np.where(pred_attack & true_benign)[0]
fn_indices = np.where(pred_benign & true_attack)[0]

# First test-row index of each outcome type, or None when the group is empty.
example_tp, example_fp, example_fn = (
    int(group[0]) if len(group) else None
    for group in (tp_indices, fp_indices, fn_indices)
)

def top_k_from_vector(vec, names, k=10):
    """Return the k entries of ``vec`` with the largest absolute value.

    ``vec`` and ``names`` are flattened to 1-D and indexed in parallel.
    The result is a DataFrame with columns "Feature" and "Attribution",
    ordered by decreasing absolute attribution; the attribution values keep
    their original sign.
    """
    values = np.asarray(vec).reshape(-1)
    labels = np.asarray(names).reshape(-1)

    # argsort is ascending, so reverse it before keeping the first k.
    by_magnitude = np.argsort(np.abs(values))[::-1]
    keep = by_magnitude[:k]

    table = {"Feature": labels[keep], "Attribution": values[keep]}
    return pd.DataFrame(table)

# Top-10 attribution table for one example of each outcome type; a table is
# skipped when no example of that type was found.

# True positive
if example_tp is not None:
    tp_attributions = M_explain_test[example_tp]
    top_tp_df = top_k_from_vector(tp_attributions, feature_names, k=10)
    print("\nLocal explanation – Example True Positive (Top 10):")
    print(top_tp_df)

# False positive
if example_fp is not None:
    fp_attributions = M_explain_test[example_fp]
    top_fp_df = top_k_from_vector(fp_attributions, feature_names, k=10)
    print("\nLocal explanation – Example False Positive (Top 10):")
    print(top_fp_df)

# False negative
if example_fn is not None:
    fn_attributions = M_explain_test[example_fn]
    top_fn_df = top_k_from_vector(fn_attributions, feature_names, k=10)
    print("\nLocal explanation – Example False Negative (Top 10):")
    print(top_fn_df)

# Class-conditional view: mean attribution per feature over the test rows of
# each true class, ranked by how far the two means are apart.
label_is_one = y_test == 1
label_is_zero = y_test == 0
mean_pos = M_explain_test[label_is_one].mean(axis=0)
mean_neg = M_explain_test[label_is_zero].mean(axis=0)

class_imp_df = pd.DataFrame({
    "Feature": feature_names,
    "Mean_Attribution_DoS_DDoS": mean_pos,
    "Mean_Attribution_Benign": mean_neg,
    "AbsDiff": np.abs(mean_pos - mean_neg)
})
class_imp_df = class_imp_df.sort_values("AbsDiff", ascending=False)

print("\nTop 10 class-conditional differences:")
print(class_imp_df.head(10))


In [None]:
## Improving the plots
sns.set_theme(style="whitegrid")

# Ask for the generic sans-serif family with Arial as first preference, so
# the fallback fonts are listed explicitly instead of relying on a single
# hard-coded family that may not be installed.
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans", "Liberation Sans"]
plt.rcParams.update({
    "font.size": 10,
    "axes.titlesize": 11,
    "axes.labelsize": 10,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "legend.fontsize": 9,
})

# Number of features shown in every chart and table of this cell.
top_n = 10


def _plot_top_bars(df, value_col, xlabel, figsize):
    """Draw and show a horizontal bar chart of ``value_col`` per feature.

    ``df`` must contain a "Feature" column and ``value_col``; rows are drawn
    in the order given. The x-axis starts at 0 and leaves 10% headroom above
    the largest value. Returns the matplotlib Axes.
    """
    plt.figure(figsize=figsize)
    ax = sns.barplot(
        data=df,
        x=value_col,
        y="Feature",
        hue="Feature",
        dodge=False,
        legend=False,
        palette="Blues_r"
    )

    # Skip the explicit limit when every value is 0 (or NaN): set_xlim(0, 0)
    # would request a zero-width axis.
    upper = df[value_col].max() * 1.1
    if upper > 0:
        ax.set_xlim(0, upper)

    sns.despine(top=True, right=True)
    plt.xlabel(xlabel)
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()
    return ax


def _show_local_explanation(example_idx, tag):
    """Print and plot the top attributions of one test sample.

    ``example_idx`` indexes a row of M_explain_test; ``tag`` is the short
    outcome label used in the printed heading (e.g. "TP"). Returns the
    DataFrame that was printed, sorted by decreasing attribution.
    """
    top_df = top_k_from_vector(M_explain_test[example_idx], feature_names, k=top_n)
    top_df = top_df.sort_values(by="Attribution", ascending=False)

    print(f"\nLocal explanation – Example {tag} (Top {top_n}):")
    print(top_df)

    _plot_top_bars(top_df, "Attribution", "Attribution", figsize=(8, 4))
    return top_df


# Global importance, recomputed here so this cell does not depend on the
# earlier plotting code having been run.
global_importance = clf.feature_importances_
feat_imp_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": global_importance
}).sort_values(by="Importance", ascending=False)

top_feat = feat_imp_df.head(top_n)
ax = _plot_top_bars(top_feat, "Importance", "Feature Importance", figsize=(8, 5))

# Local explanations; each one is skipped when no such example exists.
if example_tp is not None:
    top_tp_df = _show_local_explanation(example_tp, "TP")

if example_fp is not None:
    top_fp_df = _show_local_explanation(example_fp, "FP")

# Class-conditional differences (same number of rows as the other charts).
top_class_diff = class_imp_df.head(top_n)
ax = _plot_top_bars(
    top_class_diff,
    "AbsDiff",
    "Absolute difference in mean attribution",
    figsize=(8, 5),
)

if example_fn is not None:
    top_fn_df = _show_local_explanation(example_fn, "FN")