In [None]:
# ─────────────────────────────────────────────────────────────
# 0. Imports
# ─────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score
)
from google.colab import drive
from tqdm import tqdm

# ─────────────────────────────────────────────────────────────
# 1. Mount Google Drive
# ─────────────────────────────────────────────────────────────
# Make Google Drive visible at /content/drive; the dataset CSV paths used
# below live under /content/drive/MyDrive.
drive.mount('/content/drive')

# ─────────────────────────────────────────────────────────────
# 2. Load UNSW-NB15 train/test
# ─────────────────────────────────────────────────────────────
# Locations of the pre-partitioned train / test CSV files on the mounted Drive.
train_path = "/content/drive/MyDrive/Research Project/UNSW_NB15_training-set.csv"
test_path = "/content/drive/MyDrive/Research Project/UNSW_NB15_testing-set.csv"

# Read both splits in one pass (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Quick sanity output: frame sizes and the first ten column names.
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
print("Columns:", list(train_df.columns)[:10], "...")

# ─────────────────────────────────────────────────────────────
# 3. Label conversion
# ─────────────────────────────────────────────────────────────
# Build a 0/1 target on both splits. The decision is made once, from the
# training frame: when it already has a `label` column, any non-zero value
# becomes 1; otherwise the target is derived from `attack_cat`, where every
# category other than "normal" (case-insensitive) becomes 1.
has_label = "label" in train_df.columns
for frame in (train_df, test_df):
    if has_label:
        is_attack = frame["label"] != 0
    else:
        is_attack = frame["attack_cat"].astype(str).str.lower() != "normal"
    frame["label"] = is_attack.astype(int)

# ─────────────────────────────────────────────────────────────
# 4. Basic feature engineering
# ─────────────────────────────────────────────────────────────
# Derived columns are added to both splits in place; each group of features
# is skipped for a frame that lacks the source columns it is built from.
for df in (train_df, test_df):
    present = set(df.columns)

    if {"sbytes", "dbytes"}.issubset(present):
        src_bytes, dst_bytes = df["sbytes"], df["dbytes"]
        # The +1 keeps the denominator non-zero when dbytes is 0.
        df["bytes_ratio"] = src_bytes / (dst_bytes + 1)
        df["log_sbytes"] = np.log1p(src_bytes)
        df["log_dbytes"] = np.log1p(dst_bytes)

    if "dur" in present:
        df["log_dur"] = np.log1p(df["dur"])

    if {"spkts", "dpkts"}.issubset(present):
        # Same +1 guard as for the byte ratio.
        df["pkt_ratio"] = df["spkts"] / (df["dpkts"] + 1)

# ─────────────────────────────────────────────────────────────
# 5. Encode categorical features safely
# ─────────────────────────────────────────────────────────────
# Integer-encode whichever of the known categorical columns the training frame
# has. Each encoder is fitted on the string values of train and test together,
# so a category that occurs only in the test split still gets a code instead
# of making `transform` fail.
categorical_cols = [c for c in ("proto", "service", "state") if c in train_df.columns]
for col in categorical_cols:
    train_values = train_df[col].astype(str)
    test_values = test_df[col].astype(str)
    combined = pd.concat([train_values, test_values], axis=0)

    le = LabelEncoder()
    le.fit(combined)

    train_df[col] = le.transform(train_values)
    test_df[col] = le.transform(test_values)

# ─────────────────────────────────────────────────────────────
# 6. Select numeric columns
# ─────────────────────────────────────────────────────────────
# Columns that must never reach the model:
#   - "id": a record number, not a property of the network flow. It is numeric,
#     so without this entry it is silently picked up as a feature and the tree
#     can split on it.
#     NOTE(review): presumably each CSV numbers its records independently, so
#     splits on `id` cannot transfer from train to test - confirm.
#   - "srcip" / "dstip": endpoint addresses (ignored if absent from the frame).
#   - "attack_cat" / "label": the target and the category it is derived from.
drop_cols = ["id", "srcip", "dstip", "attack_cat", "label"]
num_cols = [c for c in train_df.columns if c not in drop_cols and pd.api.types.is_numeric_dtype(train_df[c])]
print(f"Using {len(num_cols)} numeric features")

# Index both frames with the same column list so the train and test matrices
# have identical column order.
X_train = train_df[num_cols].values
X_test  = test_df[num_cols].values
y_train = train_df["label"].values
y_test  = test_df["label"].values

# ─────────────────────────────────────────────────────────────
# 7. Normalize features
# ─────────────────────────────────────────────────────────────
# Learn the scaling statistics from the training matrix only, then apply that
# same fitted transform to both matrices.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# ─────────────────────────────────────────────────────────────
# 8. Train Decision Tree baseline
# ─────────────────────────────────────────────────────────────
print("\nTraining Decision Tree baseline...")

# Hyper-parameters of the baseline tree; random_state is fixed so repeated
# runs build the same tree.
tree_params = {
    "criterion": "entropy",
    "max_depth": 10,
    "min_samples_leaf": 5,
    "random_state": 42,
}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(X_train, y_train)

# ─────────────────────────────────────────────────────────────
# 9. Evaluate
# ─────────────────────────────────────────────────────────────
y_pred = clf.predict(X_test)

print("\nBaseline Decision Performance:")
# Headline metrics, printed in a fixed order; each label is padded so the
# colons line up in the output.
headline_metrics = (
    ("Accuracy ", accuracy_score),
    ("Precision", precision_score),
    ("Recall   ", recall_score),
    ("F1 Score ", f1_score),
)
for metric_label, metric_fn in headline_metrics:
    print(f"{metric_label}: {metric_fn(y_test, y_pred):.3f}")

# Per-class breakdown followed by the raw confusion matrix.
report = classification_report(y_test, y_pred, target_names=["normal", "attack"])
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train shape: (175341, 45), Test shape: (82332, 45)
Columns: ['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate'] ...
Using 48 numeric features

Training Decision Tree baseline...

Baseline Decision Performance:
Accuracy : 0.504
Precision: 0.572
Recall   : 0.395
F1 Score : 0.468

Classification Report:
               precision    recall  f1-score   support

      normal       0.46      0.64      0.54     37000
      attack       0.57      0.40      0.47     45332

    accuracy                           0.50     82332
   macro avg       0.52      0.52      0.50     82332
weighted avg       0.52      0.50      0.50     82332

Confusion Matrix:
 [[23601 13399]
 [27408 17924]]


In [None]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``predict``; ``predict_proba`` is used for
        ROC-AUC when the estimator has it.
    X_test
        Feature matrix handed unchanged to ``clf.predict`` /
        ``clf.predict_proba``.
    y_test
        Ground-truth labels, with 0 as the negative and 1 as the positive
        class.

    Returns
    -------
    dict
        ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1"``: the
        corresponding metric of ``clf.predict(X_test)`` against ``y_test``.
        ``"ROC-AUC"``: area under the ROC curve of the positive-class
        probability, or ``None`` when it cannot be computed (no
        ``predict_proba``, no two-column probability output, or ``y_test``
        does not contain exactly two classes).
        ``"FPR"``: ``fp / (fp + tn)``, or ``0.0`` when there are no negatives.
        ``"ConfusionMatrix"``: nested list for label order ``[0, 1]``.
        ``"Latency_cls_ms"``: wall-clock time of the single ``predict`` call
        divided by the number of samples, in milliseconds.
    """
    import time

    import numpy as np

    # Time only the predict call; probabilities are computed separately below
    # so they do not inflate the latency figure.
    start = time.perf_counter()
    y_pred = clf.predict(X_test)
    end = time.perf_counter()
    cls_latency = (end - start) / len(y_test) * 1000  # ms per sample

    # ROC-AUC is only defined when both classes occur in y_test and the model
    # yields a positive-class score. Report None otherwise instead of letting
    # one undefined metric abort the whole evaluation.
    roc_auc = None
    if hasattr(clf, "predict_proba"):
        proba = np.asarray(clf.predict_proba(X_test))
        has_positive_column = proba.ndim == 2 and proba.shape[1] == 2
        if has_positive_column and len(np.unique(y_test)) == 2:
            roc_auc = roc_auc_score(y_test, proba[:, 1])

    # Confusion matrix + FPR. Passing labels=[0, 1] pins the matrix to 2x2
    # even when a class is missing from y_test / y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

    results = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc,
        "FPR": fpr,
        "ConfusionMatrix": cm.tolist(),
        "Latency_cls_ms": cls_latency
    }
    return results

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 6) Save to file (consistent with your NSL-KDD structure)
# ──────────────────────────────────────────────────────────────────────────────
import json
from pathlib import Path

# Evaluate the fitted baseline and persist its metrics as indented JSON.
results = evaluate_model(clf, X_test, y_test)

results_path = Path("/content/drive/MyDrive/Results/UNSW/DecisionTree/Baseline_Decision_Tree_Results.txt")
# Opening a file for writing does not create missing folders. Create the
# results hierarchy first so a fresh Drive does not fail with
# FileNotFoundError after the model has already been trained and evaluated.
results_path.parent.mkdir(parents=True, exist_ok=True)
with results_path.open("w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)