In [None]:
# ============================================================
# TON-IoT Network Intrusion Detection
#     Model: Random Forest + Gemma Embeddings
# ============================================================

# ─────────────────────────────────────────────────────────────
# 0. Imports
# ─────────────────────────────────────────────────────────────
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.ensemble import RandomForestClassifier
import torch
import time
from google.colab import drive
from huggingface_hub import login

# ─────────────────────────────────────────────────────────────
# 1. Mount Google Drive
# ─────────────────────────────────────────────────────────────
drive.mount('/content/drive')

# ─────────────────────────────────────────────────────────────
# 2. Load TON-IoT dataset
# ─────────────────────────────────────────────────────────────
ton_path = "/content/drive/MyDrive/Research Project/train_test_network.csv"
ton = pd.read_csv(ton_path)

# Canonicalise the headers: trimmed, lower-case, underscores instead of spaces.
normalized_names = []
for raw_name in ton.columns:
    normalized_names.append(raw_name.strip().lower().replace(" ", "_"))
ton.columns = normalized_names

print("TON-IoT columns:", ton.columns[:10].tolist(), "... total:", ton.shape[1])

# ─────────────────────────────────────────────────────────────
# 3. Binary labels
# ─────────────────────────────────────────────────────────────
has_label = "label" in ton.columns
has_type = "type" in ton.columns
if not (has_label or has_type):
    raise ValueError("TON-IoT must have 'label' or 'type' column")

if has_label:
    # Any non-zero label counts as an attack (1); zero stays normal (0).
    ton["label"] = ton["label"].ne(0).astype(int)
else:
    # No numeric label: derive it from the textual type, "normal" -> 0, anything else -> 1.
    ton["label"] = ton["type"].apply(lambda x: 0 if str(x).lower() == "normal" else 1)

print("Label distribution:\n", ton["label"].value_counts())

# ─────────────────────────────────────────────────────────────
# 4. Stratified randomized 80/20 train-test split
# ─────────────────────────────────────────────────────────────
ton_train, ton_test = train_test_split(
    ton,
    test_size=0.2,
    random_state=42,
    stratify=ton["label"],
)
print(f"Train: {len(ton_train)}  |  Test: {len(ton_test)}")

# ─────────────────────────────────────────────────────────────
# 5. Feature engineering
# ─────────────────────────────────────────────────────────────
# The split frames are derived from `ton`; take explicit copies so the column
# assignments below act on independent frames and cannot raise pandas'
# SettingWithCopyWarning.
ton_train = ton_train.copy()
ton_test = ton_test.copy()

for df in [ton_train, ton_test]:
    if "src_bytes" in df.columns and "dst_bytes" in df.columns:
        # +1 in the denominator avoids division by zero for one-way flows.
        df["bytes_ratio"] = df["src_bytes"] / (df["dst_bytes"] + 1)
        df["log_src_bytes"] = np.log1p(df["src_bytes"])
        df["log_dst_bytes"] = np.log1p(df["dst_bytes"])
    if "duration" in df.columns:
        df["log_dur"] = np.log1p(df["duration"])
    if "src_pkts" in df.columns and "dst_pkts" in df.columns:
        df["pkt_ratio"] = df["src_pkts"] / (df["dst_pkts"] + 1)

# ─────────────────────────────────────────────────────────────
# 6. Encode categorical features
# ─────────────────────────────────────────────────────────────
# The integer codes go into new "<col>_enc" columns. The original text columns
# are kept untouched because make_summary() reads "proto" and "service" when it
# builds the sentences fed to Gemma; overwriting them would turn the text into
# opaque ids such as "proto=2".
categorical_cols = [c for c in ["proto", "service"] if c in ton_train.columns]
for col in categorical_cols:
    le = LabelEncoder()
    train_str = ton_train[col].astype(str)
    test_str = ton_test[col].astype(str)
    # Fit on train + test so transform() never meets an unseen category.
    le.fit(pd.concat([train_str, test_str], axis=0))
    ton_train[f"{col}_enc"] = le.transform(train_str)
    ton_test[f"{col}_enc"] = le.transform(test_str)

# ─────────────────────────────────────────────────────────────
# 7. Numeric columns
# ─────────────────────────────────────────────────────────────
drop_cols = ["label", "type", "src_ip", "dst_ip", "date_time"]
# The raw categorical columns are represented by their "_enc" counterparts,
# so exclude them explicitly alongside the target/identifier columns.
excluded_cols = set(drop_cols) | set(categorical_cols)
num_cols = [
    c for c in ton_train.columns
    if c not in excluded_cols and pd.api.types.is_numeric_dtype(ton_train[c])
]

X_train_num = ton_train[num_cols].values
X_test_num  = ton_test[num_cols].values
y_train = ton_train["label"].values
y_test  = ton_test["label"].values

print(f"Numeric features used: {len(num_cols)}")

# ─────────────────────────────────────────────────────────────
# 8. Flow summaries for Gemma
# ─────────────────────────────────────────────────────────────
def make_summary(row):
    """Render one flow record as a single-line text description.

    Args:
        row: Object exposing ``.get(key, default)`` (e.g. a DataFrame row or a
            dict). Missing fields fall back to 0 / 0.0 / "NA".

    Returns:
        A string of the form ``"Flow: src_bytes=..., ..., dst_pkts=..."`` with
        the duration formatted to two decimals.
    """
    src_bytes = row.get("src_bytes", 0)
    dst_bytes = row.get("dst_bytes", 0)
    duration = row.get("duration", 0.0)
    proto = row.get("proto", "NA")
    service = row.get("service", "NA")
    src_pkts = row.get("src_pkts", 0)
    dst_pkts = row.get("dst_pkts", 0)

    fields = [
        f"src_bytes={src_bytes}",
        f"dst_bytes={dst_bytes}",
        f"duration={float(duration):.2f}s",
        f"proto={proto}",
        f"service={service}",
        f"src_pkts={src_pkts}",
        f"dst_pkts={dst_pkts}",
    ]
    return "Flow: " + ", ".join(fields)

train_summaries = ton_train.apply(make_summary, axis=1).tolist()
test_summaries  = ton_test.apply(make_summary, axis=1).tolist()

# ─────────────────────────────────────────────────────────────
# 9. Load Gemma model
# ─────────────────────────────────────────────────────────────
# Never store an access token in the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset, login() receives None and
# falls back to its interactive prompt.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "google/gemma-3-270m"
# `token=True` reuses the credentials saved by login(); it replaces the
# deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name, token=True, output_hidden_states=True
).to(device)
# Inference only: switch off dropout and other training-time behaviour.
model.eval()

# ─────────────────────────────────────────────────────────────
# 10. Embedding function
# ─────────────────────────────────────────────────────────────
class SummaryDataset(Dataset):
    """Map-style dataset that serves the given summaries by integer index."""

    def __init__(self, summaries):
        # Kept by reference; items are returned exactly as stored.
        self.summaries = summaries

    def __len__(self):
        return len(self.summaries)

    def __getitem__(self, idx):
        return self.summaries[idx]

def collate_fn(batch):
    """Tokenize a list of summary strings into one padded PyTorch batch.

    Sequences are padded to a common length and truncated to at most
    128 tokens; the tokenizer output is returned unchanged.
    """
    tokenizer_options = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    return tokenizer(batch, **tokenizer_options)

def compute_embeddings(summaries, batch_size=32):
    """Embed every summary with the loaded Gemma model.

    Each summary is represented by the attention-mask-weighted mean of the
    final-layer hidden states. The first position must not be used as a
    sentence vector here: Gemma is a causal (decoder-only) model, so position 0
    holds the BOS token (or a padding token when the tokenizer pads on the
    left) and cannot attend to any of the flow text, which makes that vector
    essentially identical for every input.

    Args:
        summaries: Sequence of text summaries to embed.
        batch_size: Number of summaries tokenized and run through the model
            per forward pass.

    Returns:
        A 2-D NumPy array with one row per summary, in input order.

    Raises:
        ValueError: If ``summaries`` is empty (there is nothing to stack).
    """
    ds = SummaryDataset(summaries)
    loader = DataLoader(ds, batch_size=batch_size, collate_fn=collate_fn)
    all_embs = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Embedding with Gemma"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Cast to float32 so the pooling is numerically stable and the
            # result can be converted to NumPy even for half-precision models.
            last_hidden = outputs.hidden_states[-1].float()
            # (batch, seq, 1): 1.0 for real tokens, 0.0 for padding.
            mask = batch["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
            token_sum = (last_hidden * mask).sum(dim=1)
            # clamp guards against a division by zero for an all-padding row.
            token_count = mask.sum(dim=1).clamp(min=1.0)
            pooled = (token_sum / token_count).cpu().numpy()
            all_embs.append(pooled)
    return np.vstack(all_embs)

# ─────────────────────────────────────────────────────────────
# 11. Compute embeddings
# ─────────────────────────────────────────────────────────────
# Time the embedding pass over the training summaries.
t0 = time.perf_counter()
train_emb = compute_embeddings(train_summaries)
t1 = time.perf_counter()
train_ms_per_sample = (t1 - t0) / len(train_summaries) * 1000
print(f"Train embedding time/sample: {train_ms_per_sample:.2f} ms")

# Same measurement for the test summaries.
t0 = time.perf_counter()
test_emb = compute_embeddings(test_summaries)
t1 = time.perf_counter()
test_ms_per_sample = (t1 - t0) / len(test_summaries) * 1000
print(f"Test embedding time/sample: {test_ms_per_sample:.2f} ms")

# Scale every embedding row to unit norm.
train_emb = normalize(train_emb, axis=1)
test_emb  = normalize(test_emb, axis=1)

# ─────────────────────────────────────────────────────────────
# 12. Combine numeric + embedding features
# ─────────────────────────────────────────────────────────────
# Column-wise join: numeric features first, embedding dimensions after.
X_train = np.hstack((X_train_num, train_emb))
X_test  = np.hstack((X_test_num, test_emb))
print("Final training shape:", X_train.shape)

# ─────────────────────────────────────────────────────────────
# 13. Train & Evaluate Random Forest
# ─────────────────────────────────────────────────────────────
rf_params = {
    "n_estimators": 200,
    "max_depth": None,
    "min_samples_leaf": 5,
    "n_jobs": -1,
    "random_state": 42,
}
clf = RandomForestClassifier(**rf_params)

print("\nTraining Random Forest model...")
t0 = time.perf_counter()
clf.fit(X_train, y_train)
t1 = time.perf_counter()
fit_seconds = t1 - t0
print(f"Training time: {fit_seconds:.2f}s")

# ─────────────────────────────────────────────────────────────
# 14. Evaluation
# ─────────────────────────────────────────────────────────────
y_pred = clf.predict(X_test)

print("\n Performance Report (Gemma + Random Forest):")
# Labels are pre-padded so the colons line up in the printed report.
for metric_name, metric_fn in (
    ("Accuracy ", accuracy_score),
    ("Precision", precision_score),
    ("Recall   ", recall_score),
    ("F1 Score ", f1_score),
):
    print(f"{metric_name}: {metric_fn(y_test, y_pred):.3f}")

report_text = classification_report(y_test, y_pred, target_names=["normal", "attack"])
print("\nClassification Report:\n", report_text)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Args:
        clf: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Feature matrix passed to the estimator.
        y_test: True binary labels (0 = normal, 1 = attack).

    Returns:
        dict with keys "Accuracy", "Precision", "Recall", "F1", "ROC-AUC"
        (None when the estimator has no ``predict_proba``), "FPR",
        "ConfusionMatrix" (nested list, label order [0, 1]) and
        "Latency_cls_ms" (average ``predict`` time per sample in milliseconds).
    """
    import time

    # Only the predict() call is timed; probabilities are computed afterwards.
    t_begin = time.perf_counter()
    y_pred = clf.predict(X_test)
    t_finish = time.perf_counter()
    cls_latency = (t_finish - t_begin) / len(y_test) * 1000

    if hasattr(clf, "predict_proba"):
        y_proba = clf.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    # Fixing labels=[0, 1] guarantees a 2x2 matrix that unpacks as tn, fp, fn, tp.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    if (fp + tn) > 0:
        fpr = fp / (fp + tn)
    else:
        fpr = 0.0

    results = {}
    results["Accuracy"] = accuracy_score(y_test, y_pred)
    results["Precision"] = precision_score(y_test, y_pred)
    results["Recall"] = recall_score(y_test, y_pred)
    results["F1"] = f1_score(y_test, y_pred)
    results["ROC-AUC"] = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    results["FPR"] = fpr
    results["ConfusionMatrix"] = cm.tolist()
    results["Latency_cls_ms"] = cls_latency
    return results

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

def save_confusion_matrix(cm, run_name, out_dir="/content/drive/MyDrive/Results/TON-IoT/Random Forest"):
    """Write a confusion matrix to disk as both CSV and an annotated PNG.

    Args:
        cm: 2-D array of counts, indexed as ``cm[true, predicted]``.
        run_name: Prefix for the output files ``<run_name>_cm.csv`` and
            ``<run_name>_cm.png``.
        out_dir: Destination directory; created if it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f"{run_name}_cm.csv")
    png_path = os.path.join(out_dir, f"{run_name}_cm.png")

    np.savetxt(csv_path, cm, fmt="%d", delimiter=",")

    class_names = ["Normal", "Attack"]
    fig, ax = plt.subplots(figsize=(4, 4))
    heatmap = ax.imshow(cm, cmap="Blues")
    ax.set_title(f"Confusion Matrix - {run_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_xticks([0, 1])
    ax.set_xticklabels(class_names)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(class_names)

    # Write each count into its cell; x is the column (predicted), y the row (true).
    for i, j in np.ndindex(cm.shape[0], cm.shape[1]):
        ax.text(j, i, cm[i, j], ha="center", va="center", color="red")

    fig.colorbar(heatmap, ax=ax)
    fig.tight_layout()
    fig.savefig(png_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [None]:
import json
# Score the fitted classifier on the held-out split.
results = evaluate_model(clf, X_test, y_test)

# Store the confusion matrix as CSV + PNG.
cm = np.array(results["ConfusionMatrix"])
save_confusion_matrix(cm, run_name="Gemma_Results")

# Dump all metrics as indented JSON; save_confusion_matrix above has already
# created the output directory.
results_path = "/content/drive/MyDrive/Results/TON-IoT/Random Forest/Gemma_Results.txt"
with open(results_path, "w") as f:
    json.dump(results, f, indent=2)