In [None]:
# ─────────────────────────────────────────────────────────────
# 0. Imports
# ─────────────────────────────────────────────────────────────
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.preprocessing import LabelEncoder, normalize
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import ExtraTreesClassifier
import torch
import time
from google.colab import drive

# ─────────────────────────────────────────────────────────────
# 1. Mount Google Drive
# ─────────────────────────────────────────────────────────────
# Both dataset files are read from Drive paths below, so mount it first.
drive.mount('/content/drive')

# ─────────────────────────────────────────────────────────────
# 2. Load NSL-KDD train/test
# ─────────────────────────────────────────────────────────────
train_path = "/content/drive/MyDrive/Research Project/KDDTrain+.txt"
test_path  = "/content/drive/MyDrive/Research Project/KDDTest+.txt"

# The files are read with header=None, so the column names are supplied here.
# Order matters: names are assigned to the file's columns positionally.
columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "attack_type", "difficulty",
]

# Same reader settings for both splits.
train_df, test_df = (
    pd.read_csv(csv_path, header=None, names=columns)
    for csv_path in (train_path, test_path)
)

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

# ─────────────────────────────────────────────────────────────
# 3. Binary labels
# ─────────────────────────────────────────────────────────────
# label = 0 when attack_type is exactly "normal", 1 for every other value.
for frame in (train_df, test_df):
    frame["label"] = frame["attack_type"].ne("normal").astype(int)

# ─────────────────────────────────────────────────────────────
# 4. Encode categorical features safely
# ─────────────────────────────────────────────────────────────
# Encode into copies instead of overwriting train_df/test_df: the text
# summaries built later in this script read protocol_type/service/flag from
# train_df/test_df, and they should see the original string values rather
# than the integer codes produced here (which carry no meaning as text).
categorical_cols = ["protocol_type", "service", "flag"]
train_enc = train_df.copy()
test_enc  = test_df.copy()
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on train + test together so a category present in only one split
    # still has a code and transform() cannot fail on an unseen value.
    combined = pd.concat([train_df[col].astype(str), test_df[col].astype(str)], axis=0)
    le.fit(combined)
    train_enc[col] = le.transform(train_df[col].astype(str))
    test_enc[col]  = le.transform(test_df[col].astype(str))

# ─────────────────────────────────────────────────────────────
# 5. Select numeric columns
# ─────────────────────────────────────────────────────────────
# Targets / metadata are excluded from the feature matrix; everything else
# that is numeric after encoding is kept.
drop_cols = ["attack_type", "difficulty", "label"]
num_cols = [c for c in train_enc.columns if c not in drop_cols and pd.api.types.is_numeric_dtype(train_enc[c])]
X_train_num = train_enc[num_cols].values
X_test_num  = test_enc[num_cols].values
y_train = train_enc["label"].values
y_test  = test_enc["label"].values
print(f"Using {len(num_cols)} numeric features")

# ─────────────────────────────────────────────────────────────
# 6. Flow-style summaries for embedding
# ─────────────────────────────────────────────────────────────
def make_summary(row):
    """Render one connection record as a single-line text description.

    Args:
        row: Any object exposing a dict-style ``get(key, default)`` (for
            example a DataFrame row passed by ``apply(..., axis=1)``).
            Missing keys fall back to ``"NA"`` for the three categorical
            fields and to zero for the numeric ones.

    Returns:
        A string of the form ``"Connection: proto=..., service=..., ..."``
        with duration and the two error rates formatted to two decimals.
    """
    field = row.get
    # Each entry is already formatted; the final order of the parts defines
    # the layout of the summary string.
    parts = [
        f"proto={field('protocol_type', 'NA')}",
        f"service={field('service', 'NA')}",
        f"flag={field('flag', 'NA')}",
        f"duration={field('duration', 0.0):.2f}s",
        f"src_bytes={field('src_bytes', 0)}",
        f"dst_bytes={field('dst_bytes', 0)}",
        f"serror_rate={field('serror_rate', 0.0):.2f}",
        f"rerror_rate={field('rerror_rate', 0.0):.2f}",
        f"count={field('count', 0)}",
        f"srv_count={field('srv_count', 0)}",
    ]
    return "Connection: " + ", ".join(parts)

# One summary string per record, in row order, for each split.
train_summaries, test_summaries = (
    frame.apply(make_summary, axis=1).tolist()
    for frame in (train_df, test_df)
)
print("Example summary:", train_summaries[0][:120], "...")

# ─────────────────────────────────────────────────────────────
# 7. Load BGE model
# ─────────────────────────────────────────────────────────────
print("\nLoading BGE model...")
# Use the GPU when torch reports one is available, otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
bge_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device=device)

# ─────────────────────────────────────────────────────────────
# 8. Compute embeddings
# ─────────────────────────────────────────────────────────────
def compute_bge_embeddings(texts, batch_size=32):
    """Embed a sequence of strings with the module-level ``bge_model``.

    The texts are encoded in slices of ``batch_size`` (with a tqdm progress
    bar over the slices) and the per-slice arrays are stacked row-wise.
    Embeddings are requested L2-normalised (``normalize_embeddings=True``).

    Args:
        texts: Sliceable sequence of strings supporting ``len()``.
        batch_size: Number of texts sent to ``bge_model.encode`` per call;
            must be a positive integer.

    Returns:
        A 2-D numpy array with one row per input text. For empty ``texts``
        an array with zero rows is returned instead of letting ``np.vstack``
        fail on an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if len(texts) == 0:
        # Keep the column count consistent with real output so callers can
        # still hstack the result. The float32 dtype is an assumption about
        # what encode() normally returns — TODO confirm.
        dim = bge_model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32)

    all_embs = [
        bge_model.encode(
            texts[start:start + batch_size],
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,  # the outer tqdm already reports progress
        )
        for start in tqdm(range(0, len(texts), batch_size), desc="Embedding with BGE")
    ]
    return np.vstack(all_embs)

# Embed the training summaries and report the average wall-clock cost per sample.
embed_started = time.perf_counter()
train_emb = compute_bge_embeddings(train_summaries)
embed_finished = time.perf_counter()
train_ms_per_sample = (embed_finished - embed_started) / len(train_summaries) * 1000
print(f"Train embedding time/sample: {train_ms_per_sample:.2f} ms")

# Same measurement for the test summaries.
embed_started = time.perf_counter()
test_emb = compute_bge_embeddings(test_summaries)
embed_finished = time.perf_counter()
test_ms_per_sample = (embed_finished - embed_started) / len(test_summaries) * 1000
print(f"Test embedding time/sample: {test_ms_per_sample:.2f} ms")

# ─────────────────────────────────────────────────────────────
# 9. Combine numeric + BGE embeddings
# ─────────────────────────────────────────────────────────────
# Column-wise join: numeric features first, embedding dimensions after.
X_train = np.concatenate((X_train_num, train_emb), axis=1)
X_test  = np.concatenate((X_test_num, test_emb), axis=1)
print("Final training shape:", X_train.shape)

# ─────────────────────────────────────────────────────────────
# 10. Train & Evaluate Extra Trees
# ─────────────────────────────────────────────────────────────
# Hyper-parameters gathered in one place; random_state is fixed so repeated
# runs build the same forest.
et_params = dict(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=5,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)
clf = ExtraTreesClassifier(**et_params)

print("\nTraining Extra Trees model...")
fit_started = time.perf_counter()
clf.fit(X_train, y_train)
fit_seconds = time.perf_counter() - fit_started
print(f"Training time: {fit_seconds:.2f}s")

# ─────────────────────────────────────────────────────────────
# 11. Evaluation
# ─────────────────────────────────────────────────────────────
y_pred = clf.predict(X_test)
print("\nPerformance Report (BGE + ExtraTrees):")

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy : {accuracy:.3f}")

precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.3f}")

recall = recall_score(y_test, y_pred)
print(f"Recall   : {recall:.3f}")

f1 = f1_score(y_test, y_pred)
print(f"F1 Score : {f1:.3f}")

# Label 0 is reported as "normal" and label 1 as "attack".
report_text = classification_report(y_test, y_pred, target_names=["normal","attack"])
print("\nClassification Report:\n", report_text)

conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_mat)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train shape: (125973, 43), Test shape: (22544, 43)
Using 41 numeric features
Example summary: Connection: proto=1, service=20, flag=9, duration=0.00s, src_bytes=491, dst_bytes=0, serror_rate=0.00, rerror_rate=0.00, ...

Loading BGE model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Embedding with BGE: 100%|██████████| 3937/3937 [01:05<00:00, 60.45it/s]


Train embedding time/sample: 0.52 ms


Embedding with BGE: 100%|██████████| 705/705 [00:11<00:00, 60.77it/s]


Test embedding time/sample: 0.52 ms
Final training shape: (125973, 425)

Training Extra Trees model...
Training time: 25.56s

Performance Report (BGE + ExtraTrees on NSL-KDD):
Accuracy : 0.815
Precision: 0.969
Recall   : 0.697
F1 Score : 0.811

Classification Report:
               precision    recall  f1-score   support

      normal       0.71      0.97      0.82      9711
      attack       0.97      0.70      0.81     12833

    accuracy                           0.81     22544
   macro avg       0.84      0.83      0.81     22544
weighted avg       0.86      0.81      0.81     22544

Confusion Matrix:
 [[9421  290]
 [3889 8944]]


In [None]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier and time its predictions.

    Args:
        clf: Fitted estimator with ``predict``; ``predict_proba`` is used for
            ROC-AUC when the estimator provides it.
        X_test: Feature matrix passed to ``clf.predict``.
        y_test: True labels; the confusion matrix is built over labels 0 and 1.

    Returns:
        dict with keys ``Accuracy``, ``Precision``, ``Recall``, ``F1``,
        ``ROC-AUC`` (``None`` when the estimator has no ``predict_proba``),
        ``FPR``, ``ConfusionMatrix`` (nested lists) and ``Latency_cls_ms``
        (average ``predict`` wall-clock time per sample, in milliseconds).
    """
    import time

    # Only the predict() call is timed.
    tic = time.perf_counter()
    y_pred = clf.predict(X_test)
    toc = time.perf_counter()
    cls_latency = (toc - tic) / len(y_test) * 1000  # ms per sample

    # Probability of class index 1, when the estimator can provide it.
    y_proba = None
    if hasattr(clf, "predict_proba"):
        y_proba = clf.predict_proba(X_test)[:, 1]

    # labels=[0, 1] pins the matrix layout to [[tn, fp], [fn, tp]].
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    negatives = fp + tn
    if negatives > 0:
        fpr = fp / negatives
    else:
        fpr = 0.0  # no negative samples: report 0 rather than dividing by zero

    # Keys are inserted in the order they should appear when serialised.
    results = {}
    results["Accuracy"] = accuracy_score(y_test, y_pred)
    results["Precision"] = precision_score(y_test, y_pred)
    results["Recall"] = recall_score(y_test, y_pred)
    results["F1"] = f1_score(y_test, y_pred)
    if y_proba is not None:
        results["ROC-AUC"] = roc_auc_score(y_test, y_proba)
    else:
        results["ROC-AUC"] = None
    results["FPR"] = fpr
    results["ConfusionMatrix"] = cm.tolist()
    results["Latency_cls_ms"] = cls_latency
    return results


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

def save_confusion_matrix(cm, run_name, out_dir="/content/drive/MyDrive/Results/NSB/ExtraTreesClassifier"):
    """Write a confusion matrix to disk as both CSV and a PNG heat-map.

    Args:
        cm: 2-D numpy array of counts (indexed as ``cm[true, predicted]`` on
            the plot). The tick labels assume a 2x2 Normal/Attack matrix.
        run_name: Prefix for the output files and part of the plot title.
        out_dir: Destination directory; created if it does not exist.

    Produces ``{run_name}_cm.csv`` and ``{run_name}_cm.png`` inside ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f"{run_name}_cm.csv")
    png_path = os.path.join(out_dir, f"{run_name}_cm.png")

    # Raw counts as integer CSV.
    np.savetxt(csv_path, cm, fmt="%d", delimiter=",")

    # Heat-map rendering of the same matrix.
    class_names = ["Normal","Attack"]
    tick_positions = [0,1]
    fig, ax = plt.subplots(figsize=(4,4))
    heatmap = ax.imshow(cm, cmap="Blues")
    ax.set_title(f"Confusion Matrix - {run_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(class_names)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(class_names)

    # Annotate every cell with its count; x is the column, y is the row.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row_idx, col_idx in np.ndindex(n_rows, n_cols):
        ax.text(col_idx, row_idx, cm[row_idx, col_idx], ha="center", va="center", color="red")

    fig.colorbar(heatmap, ax=ax)
    fig.tight_layout()
    fig.savefig(png_path)
    # Close explicitly so the figure is not kept open after saving.
    plt.close(fig)


In [None]:
import json

# Score the trained classifier on the held-out split.
results = evaluate_model(clf, X_test, y_test)

# The matrix comes back as nested lists; turn it into an array for saving/plotting.
# NOTE(review): the run name says "DT" while the model saved under the
# ExtraTreesClassifier folder is an Extra Trees ensemble — confirm the label is intended.
cm = np.array(results["ConfusionMatrix"])
save_confusion_matrix(cm, run_name="BAAI_DT")

# Metrics are stored as indented JSON text.
results_path = "/content/drive/MyDrive/Results/NSB/ExtraTreesClassifier/BAAI_Results.txt"
with open(results_path, "w") as f:
    f.write(json.dumps(results, indent=2))
