In [None]:
# ─────────────────────────────────────────────────────────────
# 0. Imports
# ─────────────────────────────────────────────────────────────
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.preprocessing import LabelEncoder, normalize
from torch.utils.data import Dataset, DataLoader
from sklearn.tree import DecisionTreeClassifier
from sentence_transformers import SentenceTransformer
import torch
import time
from google.colab import drive

# ─────────────────────────────────────────────────────────────
# 1. Mount Google Drive
# ─────────────────────────────────────────────────────────────
drive.mount("/content/drive")

# ─────────────────────────────────────────────────────────────
# 2. Load UNSW-NB15 train/test
# ─────────────────────────────────────────────────────────────
unsw_train_path = "/content/drive/MyDrive/Research Project/UNSW_NB15_training-set.csv"
unsw_test_path  = "/content/drive/MyDrive/Research Project/UNSW_NB15_testing-set.csv"

# Read both CSV splits from Drive, train first, then test.
unsw_train, unsw_test = (
    pd.read_csv(csv_path) for csv_path in (unsw_train_path, unsw_test_path)
)

# Quick sanity check: first ten column names plus the total column count.
train_columns = list(unsw_train.columns)
print("UNSW-NB15 columns:", train_columns[:10], "... total:", len(train_columns))

# ─────────────────────────────────────────────────────────────
# 3. Binary labels
# ─────────────────────────────────────────────────────────────
def _binlab(df):
    """Return a 0/1 integer Series for one split (1 = attack, 0 = normal).

    An existing ``label`` column is preferred and collapsed to binary
    (any non-zero value becomes 1). Otherwise the label is derived from
    ``attack_cat``, where every category other than "normal" counts as an
    attack; the comparison ignores case and surrounding whitespace.

    Raises:
        ValueError: if the frame has neither ``label`` nor ``attack_cat``.
    """
    if "label" in df.columns:
        return (df["label"] != 0).astype(int)
    if "attack_cat" in df.columns:
        category = df["attack_cat"].astype(str).str.strip().str.lower()
        return (category != "normal").astype(int)
    raise ValueError("Missing 'label' or 'attack_cat'")


# Decide per frame: the two CSVs are loaded independently, so the presence of
# `label` in the training split says nothing about the test split.
unsw_train["label"] = _binlab(unsw_train)
unsw_test["label"]  = _binlab(unsw_test)

# ─────────────────────────────────────────────────────────────
# 4. Feature engineering
# ─────────────────────────────────────────────────────────────
def _add_flow_features(frame):
    """Add the derived ratio and log1p columns to *frame* in place.

    Each group of features is only created when all of its source
    columns are present in the frame.
    """
    available = set(frame.columns)

    if {"sbytes", "dbytes"} <= available:
        # +1 in the denominator avoids division by zero for one-way flows.
        frame["bytes_ratio"] = frame["sbytes"] / (frame["dbytes"] + 1)
        frame["log_sbytes"] = np.log1p(frame["sbytes"])
        frame["log_dbytes"] = np.log1p(frame["dbytes"])

    if "dur" in available:
        frame["log_dur"] = np.log1p(frame["dur"])

    if {"spkts", "dpkts"} <= available:
        frame["pkt_ratio"] = frame["spkts"] / (frame["dpkts"] + 1)


for df in (unsw_train, unsw_test):
    _add_flow_features(df)

# ─────────────────────────────────────────────────────────────
# 5. Encode categorical features
# ─────────────────────────────────────────────────────────────
categorical_cols = [
    name for name in ("proto", "service", "state") if name in unsw_train.columns
]
for col in categorical_cols:
    # Work on string views so mixed / missing values encode consistently.
    train_text = unsw_train[col].astype(str)
    test_text = unsw_test[col].astype(str)

    # Fit on the union of both splits so transform() never meets an unseen value.
    le = LabelEncoder().fit(pd.concat([train_text, test_text], axis=0))

    unsw_train[col] = le.transform(train_text)
    unsw_test[col] = le.transform(test_text)

# ─────────────────────────────────────────────────────────────
# 6. Numeric columns
# ─────────────────────────────────────────────────────────────
# Columns that must never reach the classifier:
#   - "id" is a per-file row index, not a property of the flow; leaving it in
#     lets the model split on row position, which does not transfer to the
#     test file.
#   - "srcip"/"dstip" are excluded as identifiers when present.
#   - "label"/"attack_cat" are the targets.
drop_cols = ["id", "srcip", "dstip", "label", "attack_cat"]
num_cols = [
    c for c in unsw_train.columns
    if c not in drop_cols and pd.api.types.is_numeric_dtype(unsw_train[c])
]

# Same column list (and therefore same column order) for both splits.
X_train_num = unsw_train[num_cols].values
X_test_num  = unsw_test[num_cols].values
y_train = unsw_train["label"].values
y_test  = unsw_test["label"].values

# ─────────────────────────────────────────────────────────────
# 7. Flow summaries
# ─────────────────────────────────────────────────────────────
def make_summary(row):
    """Render one flow record as a short text summary for the sentence encoder.

    Args:
        row: Any mapping-like object with a ``.get(key, default)`` method
            (a pandas row Series or a plain dict).

    Returns:
        A single-line string listing byte counts, duration, protocol,
        service, state and ports. Missing fields fall back to 0 (bytes,
        duration), "NA" (proto/service/state) or -1 (ports).
    """
    lookup = row.get
    # Byte counters may appear under either naming scheme.
    src_bytes = lookup("sbytes", lookup("srcbytes", 0))
    dst_bytes = lookup("dbytes", lookup("dstbytes", 0))
    duration = lookup("dur", 0.0)
    proto = lookup("proto", "NA")
    service = lookup("service", "NA")
    state = lookup("state", "NA")
    # A single lookup suffices; the former nested fallback re-queried the same key.
    src_port = lookup("sport", -1)
    dst_port = lookup("dsport", -1)
    return (f"Flow: src_bytes={src_bytes}, dst_bytes={dst_bytes}, "
            f"duration={float(duration):.2f}s, proto={proto}, service={service}, "
            f"state={state}, sport={src_port}, dsport={dst_port}")

# One summary string per row, kept in row order so it lines up with the
# numeric feature matrix and the label vector.
# NOTE(review): proto/service/state have already been replaced by their
# label-encoded integers at this point, so the summaries contain codes rather
# than names — confirm this is intended.
train_summaries = list(unsw_train.apply(make_summary, axis=1))
test_summaries  = list(unsw_test.apply(make_summary, axis=1))

# ─────────────────────────────────────────────────────────────
# 8. Load MiniLM sentence-embedding model
# ─────────────────────────────────────────────────────────────
# The checkpoint loaded below is all-MiniLM-L6-v2, so the log line says so.
print("\nLoading MiniLM model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
# The variable keeps the name `bge_model` because compute_bge_embeddings()
# reads it under that name.
bge_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

# ─────────────────────────────────────────────────────────────
# 9. Compute embeddings
# ─────────────────────────────────────────────────────────────
def compute_bge_embeddings(texts, batch_size=32):
    """Encode *texts* with the module-level sentence encoder, batch by batch.

    Args:
        texts: Sequence of strings (must support ``len`` and slicing).
        batch_size: Number of texts passed to each ``encode`` call.

    Returns:
        A 2-D NumPy array with one L2-normalised embedding per input text,
        in input order. For empty input an array with zero rows is
        returned instead of raising from ``np.vstack``.
    """
    if len(texts) == 0:
        # np.vstack([]) raises, so build the empty result explicitly.
        dim = bge_model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    batch_starts = range(0, len(texts), batch_size)
    all_embs = [
        bge_model.encode(
            texts[start:start + batch_size],
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,  # the outer tqdm bar already reports progress
        )
        for start in tqdm(batch_starts, desc="Embedding with Mini_LM")
    ]
    return np.vstack(all_embs)

# Embed the training summaries and report the mean wall-clock cost per sample.
embed_start = time.perf_counter()
train_emb = compute_bge_embeddings(train_summaries)
train_elapsed = time.perf_counter() - embed_start
print(f"Train embedding time/sample: {train_elapsed/len(train_summaries)*1000:.2f} ms")

# Same measurement for the test summaries.
embed_start = time.perf_counter()
test_emb = compute_bge_embeddings(test_summaries)
test_elapsed = time.perf_counter() - embed_start
print(f"Test embedding time/sample: {test_elapsed/len(test_summaries)*1000:.2f} ms")

# ─────────────────────────────────────────────────────────────
# 10. Combine numeric + MiniLM embeddings
# ─────────────────────────────────────────────────────────────
# Place the numeric columns and the text embeddings side by side (column-wise).
X_train = np.concatenate((X_train_num, train_emb), axis=1)
X_test  = np.concatenate((X_test_num, test_emb), axis=1)
print("Final training shape:", X_train.shape)

# ─────────────────────────────────────────────────────────────
# 11. Train & Evaluate Decision Tree
# ─────────────────────────────────────────────────────────────
# Hyper-parameters gathered in one place; random_state is fixed for repeatable runs.
tree_params = {
    "criterion": "entropy",
    "max_depth": 10,
    "min_samples_leaf": 5,
    "random_state": 42,
}
clf = DecisionTreeClassifier(**tree_params)

print("\nTraining Decision Trees model...")
fit_start = time.perf_counter()
clf.fit(X_train, y_train)
fit_seconds = time.perf_counter() - fit_start
print(f"Training time: {fit_seconds:.2f}s")

# ─────────────────────────────────────────────────────────────
# 12. Evaluation
# ─────────────────────────────────────────────────────────────
y_pred = clf.predict(X_test)

# Headline metrics, each computed and printed in turn.
print("\n Performance Report (MiniLM + Decision Trees):")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy : {accuracy:.3f}")
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.3f}")
recall = recall_score(y_test, y_pred)
print(f"Recall   : {recall:.3f}")
f1 = f1_score(y_test, y_pred)
print(f"F1 Score : {f1:.3f}")

# Per-class breakdown followed by the raw confusion matrix.
report_text = classification_report(y_test, y_pred, target_names=["normal", "attack"])
print("\nClassification Report:\n", report_text)
matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", matrix)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
UNSW-NB15 columns: ['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate'] ... total: 45

Loading BGE model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Embedding with Mini_LM: 100%|██████████| 5480/5480 [00:52<00:00, 103.96it/s]


Train embedding time/sample: 0.30 ms


Embedding with Mini_LM: 100%|██████████| 2573/2573 [00:24<00:00, 106.99it/s]


Test embedding time/sample: 0.29 ms
Final training shape: (175341, 432)

Training Extra Trees model...
Training time: 38.12s

 Performance Report (MiniLM + ExtraTrees):
Accuracy : 0.519
Precision: 0.603
Recall   : 0.368
F1 Score : 0.457

Classification Report:
               precision    recall  f1-score   support

      normal       0.48      0.70      0.57     37000
      attack       0.60      0.37      0.46     45332

    accuracy                           0.52     82332
   macro avg       0.54      0.54      0.51     82332
weighted avg       0.55      0.52      0.51     82332

Confusion Matrix:
 [[26005 10995]
 [28635 16697]]


In [None]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Args:
        clf: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Feature matrix handed to ``clf.predict``.
        y_test: True labels; 0 and 1 are the labels used for the
            confusion matrix.

    Returns:
        dict with the keys ``Accuracy``, ``Precision``, ``Recall``, ``F1``,
        ``ROC-AUC`` (``None`` when it cannot be computed), ``FPR``,
        ``ConfusionMatrix`` (nested list ordered ``[[tn, fp], [fn, tp]]``)
        and ``Latency_cls_ms`` (mean ``predict`` wall-clock time per sample
        in milliseconds).
    """
    import time

    n_samples = len(y_test)
    start = time.perf_counter()
    y_pred = clf.predict(X_test)
    elapsed = time.perf_counter() - start
    # Guard the division so an empty evaluation set cannot raise here.
    cls_latency = elapsed / n_samples * 1000 if n_samples else 0.0

    # ROC-AUC needs probability scores and both classes present in y_test;
    # otherwise roc_auc_score raises, so report None instead.
    roc_auc = None
    if hasattr(clf, "predict_proba") and len(np.unique(y_test)) > 1:
        # Locate the positive-class column instead of assuming it is column 1.
        classes = list(getattr(clf, "classes_", (0, 1)))
        if 1 in classes:
            y_proba = clf.predict_proba(X_test)[:, classes.index(1)]
            roc_auc = roc_auc_score(y_test, y_proba)

    # Confusion matrix + FPR; labels=[0, 1] forces a 2x2 matrix even when a
    # class is absent from the predictions.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    fpr = float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0

    results = {
        "Accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 yields 0.0 without a warning when a denominator is empty.
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1": f1_score(y_test, y_pred, zero_division=0),
        "ROC-AUC": roc_auc,
        "FPR": fpr,
        "ConfusionMatrix": cm.tolist(),
        "Latency_cls_ms": cls_latency
    }
    return results


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

def save_confusion_matrix(cm, run_name, out_dir="/content/drive/MyDrive/Results/UNSW/DecisionTree"):
    """Write a confusion matrix to disk as both a CSV and an annotated PNG.

    Args:
        cm: 2-D confusion matrix (NumPy array or nested list) with true
            classes on the rows and predicted classes on the columns.
        run_name: Prefix for the output files ``<run_name>_cm.csv`` and
            ``<run_name>_cm.png``.
        out_dir: Destination directory; created if it does not exist.
    """
    # Accept plain nested lists as well as arrays: .shape and cm[i, j] need an ndarray.
    cm = np.asarray(cm)

    os.makedirs(out_dir, exist_ok=True)
    # Save CSV
    np.savetxt(os.path.join(out_dir, f"{run_name}_cm.csv"), cm, fmt="%d", delimiter=",")

    # Save image
    fig, ax = plt.subplots(figsize=(4, 4))
    try:
        im = ax.imshow(cm, cmap="Blues")
        ax.set_title(f"Confusion Matrix - {run_name}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_xticks([0, 1])
        ax.set_xticklabels(["Normal", "Attack"])
        ax.set_yticks([0, 1])
        ax.set_yticklabels(["Normal", "Attack"])

        # Write each count into its cell (x = predicted column, y = true row).
        n_rows, n_cols = cm.shape
        for i in range(n_rows):
            for j in range(n_cols):
                ax.text(j, i, cm[i, j], ha="center", va="center", color="red")

        fig.colorbar(im, ax=ax)
        fig.tight_layout()
        fig.savefig(os.path.join(out_dir, f"{run_name}_cm.png"))
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)


In [None]:
import json

# Score the fitted tree once, then persist the confusion matrix and the metrics.
results = evaluate_model(clf, X_test, y_test)

cm = np.array(results["ConfusionMatrix"])
save_confusion_matrix(cm, run_name="MiniLM_DT")

# The metrics are stored as indented JSON text; save_confusion_matrix() above
# has already created the target directory.
results_path = "/content/drive/MyDrive/Results/UNSW/DecisionTree/MiniLM_Results.txt"
serialized = json.dumps(results, indent=2)
with open(results_path, "w") as results_file:
    results_file.write(serialized)
