In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

# Seed the stdlib and NumPy global random generators with one shared value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Input CSV files, resolved relative to the working directory.
FRIDAY_PATH = "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
WEDNESDAY_PATH = "Wednesday-workingHours.pcap_ISCX.csv"

# Read both captures, then stack them row-wise under a fresh 0..n-1 index.
df_fri, df_wed = (
    pd.read_csv(csv_path) for csv_path in (FRIDAY_PATH, WEDNESDAY_PATH)
)
data = pd.concat((df_fri, df_wed), ignore_index=True)
print(f"Combined shape: {data.shape}")

# Data preprocessing
# Column headers may carry stray whitespace; normalise them before lookups.
data.columns = data.columns.str.strip()
if 'Label' not in data.columns:
    raise ValueError("Expected 'Label' column not found.")

print("Unique labels before binary encoding:")
print(data['Label'].unique())

# Binary target: 0 for the exact string "BENIGN", 1 for anything else.
data['Label'] = data['Label'].astype(str).str.strip()
data['Label'] = (data['Label'] != "BENIGN").astype("int64")

print("\nBinary label distribution (0=BENIGN, 1=ATTACK):")
print(data['Label'].value_counts())

# Force every column to numeric; unparseable cells become NaN, and +/-inf is
# treated as missing as well.
data = data.apply(pd.to_numeric, errors='coerce')
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# A column with no usable numeric value at all (e.g. a text column that was
# fully coerced to NaN) would make the row-wise dropna below delete every row,
# so remove such columns first.
all_nan_cols = [] if data.empty else [
    c for c in data.columns if c != 'Label' and data[c].isna().all()
]
if all_nan_cols:
    print("Dropping columns with no numeric values:", all_nan_cols)
    data.drop(columns=all_nan_cols, inplace=True)

before = data.shape[0]
data.dropna(inplace=True)
after = data.shape[0]

print(f"\nDropped {before - after} rows due to NaN/inf values.")
print(f"Remaining rows: {after}")

# Fail here with a clear message instead of deep inside the split/training.
if after == 0:
    raise ValueError("No rows left after removing NaN/inf values.")

# Constant features carry no signal for the classifier.
zero_var_cols = [c for c in data.columns if c != 'Label' and data[c].nunique() <= 1]
if zero_var_cols:
    print("Dropping zero-variance columns:", zero_var_cols)
    data.drop(columns=zero_var_cols, inplace=True)

# Feature matrix and target vector as plain NumPy arrays.
feature_names = [c for c in data.columns if c != 'Label']
X = data[feature_names].values
y = data['Label'].values

print(f"\nFinal feature count: {len(feature_names)}")
print(f"Final dataset size  : {data.shape[0]} rows")

# Model training
def _stratified_split(features, labels, holdout):
    """Split once with the shared seed, stratifying on ``labels``."""
    return train_test_split(
        features, labels, test_size=holdout, random_state=SEED, stratify=labels
    )


# Hold out 30% of the rows, then cut that holdout in half to get the
# validation and test sets.
X_train, X_temp, y_train, y_temp = _stratified_split(X, y, 0.30)
X_val, X_test, y_val, y_test = _stratified_split(X_temp, y_temp, 0.50)

print(f"Shapes -> Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
# Small hyperparameter search
# Defaults shared by every candidate; each grid entry overrides a subset.
xgb_base_params = dict(
    objective="binary:logistic",
    random_state=SEED,
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    n_jobs=-1,
)

# Hyperparameters varied by the search, in the column order used below.
_XGB_TUNED_KEYS = (
    "n_estimators",
    "max_depth",
    "learning_rate",
    "subsample",
    "colsample_bytree",
)

# One row per candidate: name followed by the values for _XGB_TUNED_KEYS.
_xgb_candidates = [
    ("baseline", 300, 6, 0.1, 0.9, 0.9),
    ("deeper_more_trees", 400, 8, 0.1, 0.9, 0.9),
    ("shallower_faster", 200, 4, 0.1, 0.9, 0.9),
    ("lower_lr_more_trees", 500, 6, 0.05, 0.9, 0.9),
    ("more_randomness", 300, 6, 0.1, 0.8, 0.8),
]

xgb_param_grid = [
    {"name": label, "params": dict(zip(_XGB_TUNED_KEYS, values))}
    for label, *values in _xgb_candidates
]

# Best candidate seen so far, ranked by accuracy on the validation split.
xgb_best_val_acc = -1.0
xgb_best_name = None
xgb_best_params = None
xgb_best_model = None

n_configs = len(xgb_param_grid)
for idx, candidate in enumerate(xgb_param_grid, start=1):
    cfg_name = candidate["name"]
    print(f"\n=== XGBoost Config {idx}/{n_configs}: {cfg_name} ===")

    # Candidate overrides are layered on top of the shared defaults.
    params = {**xgb_base_params, **candidate["params"]}

    xgb_tmp = XGBClassifier(**params)
    xgb_tmp.fit(X_train, y_train)

    y_val_pred = xgb_tmp.predict(X_val)
    val_acc = accuracy_score(y_val, y_val_pred)
    print(f"Validation accuracy for {cfg_name}: {val_acc:.6f}")

    # Only a strict improvement replaces the incumbent, so ties keep the
    # earlier configuration.
    if not val_acc > xgb_best_val_acc:
        continue
    xgb_best_val_acc, xgb_best_name = val_acc, cfg_name
    xgb_best_params, xgb_best_model = params, xgb_tmp

print("\n=== Best XGBoost configuration selected ===")
print("Name :", xgb_best_name)
print("Params:", xgb_best_params)
print(f"Best validation accuracy: {xgb_best_val_acc:.6f}")

# The winning fitted model is the one used for all downstream evaluation.
xgb_clf = xgb_best_model

# Evaluate the selected model on the held-out test split.
y_pred_xgb = xgb_clf.predict(X_test)

acc_xgb = accuracy_score(y_test, y_pred_xgb)
prec_xgb = precision_score(y_test, y_pred_xgb, zero_division=0)
rec_xgb = recall_score(y_test, y_pred_xgb, zero_division=0)
f1_xgb = f1_score(y_test, y_pred_xgb, zero_division=0)

# Pin the label order so the matrix is always 2x2 (rows/cols = 0, 1). Without
# `labels`, a split in which only one class appears yields a 1x1 matrix and
# the four-way unpacking below raises.
cm_xgb = confusion_matrix(y_test, y_pred_xgb, labels=[0, 1])
tn, fp, fn, tp = cm_xgb.ravel()
# False alarm rate: share of benign samples flagged as attacks.
far_xgb = fp / (fp + tn) if (fp + tn) > 0 else 0.0

print("\n=== XGBoost – Test Metrics (Binary: 0=BENIGN, 1=ATTACK) ===")
print(f"Accuracy : {acc_xgb:.4f}")
print(f"Precision: {prec_xgb:.4f}")
print(f"Recall   : {rec_xgb:.4f}")
print(f"F1-score : {f1_xgb:.4f}")
print(f"FAR      : {far_xgb:.6f}  (False Alarm Rate = FP / (FP + TN))")

# Confusion-matrix heatmap; rows are actual classes, columns are predictions.
plt.figure(figsize=(5, 4))
sns.heatmap(cm_xgb, annot=True, fmt="d", cmap="Blues")
plt.title("XGBoost – Confusion Matrix (Binary)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

# Global feature importance
# Pair each feature name with the importance reported by the fitted model,
# then rank from most to least important.
xgb_importances = xgb_clf.feature_importances_
xgb_feat_imp_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": xgb_importances}
)
xgb_feat_imp_df = xgb_feat_imp_df.sort_values(by="Importance", ascending=False)

top_features = xgb_feat_imp_df.head(10)

print("\nTop 10 XGBoost features:")
print(top_features)

# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=top_features)
plt.title("XGBoost – Top 10 Feature Importances")
plt.tight_layout()
plt.show()