In [None]:
# ─────────────────────────────────────────────────────────────
# 0. Imports
# ─────────────────────────────────────────────────────────────
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, normalize
from imblearn.combine import SMOTEENN
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time
from google.colab import drive
from huggingface_hub import login

# ─────────────────────────────────────────────────────────────
# 1. Mount Google Drive
# ─────────────────────────────────────────────────────────────
drive.mount('/content/drive')

# ─────────────────────────────────────────────────────────────
# 2. Load UNSW-NB15 train/test
# ─────────────────────────────────────────────────────────────
unsw_train_path = "/content/drive/MyDrive/Research Project/UNSW_NB15_training-set.csv"
unsw_test_path  = "/content/drive/MyDrive/Research Project/UNSW_NB15_testing-set.csv"

unsw_train = pd.read_csv(unsw_train_path)
unsw_test  = pd.read_csv(unsw_test_path)

print("UNSW-NB15 columns:", list(unsw_train.columns)[:10], "... total:", len(unsw_train.columns))

# ─────────────────────────────────────────────────────────────
# 3. Binary labels
# ─────────────────────────────────────────────────────────────
# Either derive a 0/1 label from `attack_cat`, or coerce an existing `label`
# column to strict 0/1 (any non-zero value becomes 1).
if "label" not in unsw_train.columns:
    def _binlab(df):
        """Return 1 for every row whose `attack_cat` is not "normal" (case-insensitive), else 0.

        Raises:
            ValueError: if the frame has no `attack_cat` column to derive from.
        """
        if "attack_cat" in df.columns:
            return (df["attack_cat"].astype(str).str.lower() != "normal").astype(int)
        raise ValueError("Missing 'label' or 'attack_cat'")
    unsw_train["label"] = _binlab(unsw_train)
    unsw_test["label"]  = _binlab(unsw_test)
else:
    unsw_train["label"] = (unsw_train["label"] != 0).astype(int)
    unsw_test["label"]  = (unsw_test["label"]  != 0).astype(int)

# ─────────────────────────────────────────────────────────────
# 4. Enhanced feature engineering
# ─────────────────────────────────────────────────────────────
# Ratios use "+ 1" in the denominator to avoid division by zero; log1p
# compresses the heavy-tailed byte/duration counts.
for df in [unsw_train, unsw_test]:
    if "sbytes" in df.columns and "dbytes" in df.columns:
        df["bytes_ratio"] = df["sbytes"] / (df["dbytes"] + 1)
        df["log_sbytes"] = np.log1p(df["sbytes"])
        df["log_dbytes"] = np.log1p(df["dbytes"])
    if "dur" in df.columns:
        df["log_dur"] = np.log1p(df["dur"])
    if "spkts" in df.columns and "dpkts" in df.columns:
        df["pkt_ratio"] = df["spkts"] / (df["dpkts"] + 1)

# ─────────────────────────────────────────────────────────────
# 5. Encode categorical features safely
# ─────────────────────────────────────────────────────────────
# The integer codes are written to parallel "<col>_enc" columns instead of
# overwriting proto/service/state. The text summaries built later in this
# script read those three columns, and overwriting them would feed the
# language model arbitrary integer codes rather than the values read from
# the CSV.
categorical_cols = [c for c in ["proto", "service", "state"] if c in unsw_train.columns]
for col in categorical_cols:
    le = LabelEncoder()
    train_text = unsw_train[col].astype(str)
    test_text  = unsw_test[col].astype(str)
    # Fit on train+test together so transform() never meets an unseen category.
    le.fit(pd.concat([train_text, test_text], axis=0))
    unsw_train[f"{col}_enc"] = le.transform(train_text)
    unsw_test[f"{col}_enc"]  = le.transform(test_text)

# ─────────────────────────────────────────────────────────────
# 6. Select numeric columns
# ─────────────────────────────────────────────────────────────
# NOTE(review): "id" is excluded because the UNSW-NB15 partition CSVs are
# understood to carry it as a row identifier, which says nothing about the
# flow itself - confirm against the column list printed above.
drop_cols = ["id", "srcip", "dstip", "label", "attack_cat"]
# The raw categorical columns are represented by their "<col>_enc" twins, so
# they are excluded explicitly even if a CSV happens to store them as numbers.
excluded = set(drop_cols) | set(categorical_cols)
num_cols = [c for c in unsw_train.columns if c not in excluded and pd.api.types.is_numeric_dtype(unsw_train[c])]

X_train_num = unsw_train[num_cols].values
X_test_num  = unsw_test[num_cols].values
y_train = unsw_train["label"].values
y_test  = unsw_test["label"].values

# ─────────────────────────────────────────────────────────────
# 7. Text summarization for Gemma embeddings
# ─────────────────────────────────────────────────────────────
def make_summary(row):
    """Render one flow record as a single-line text description.

    Args:
        row: Any mapping-like object exposing ``get(key, default)``
            (the script passes DataFrame rows).

    Returns:
        A string of the form ``"Flow: src_bytes=..., dst_bytes=..., ..."``.
        Missing fields fall back to 0 (byte counts), 0.0 (duration),
        ``"NA"`` (proto/service/state) and -1 (ports).
    """
    lookup = row.get

    # Byte counters may appear under either of two column spellings.
    src_bytes = lookup("sbytes", lookup("srcbytes", 0))
    dst_bytes = lookup("dbytes", lookup("dstbytes", 0))
    duration = lookup("dur", 0.0)

    protocol = lookup("proto", "NA")
    svc = lookup("service", "NA")
    conn_state = lookup("state", "NA")

    src_port = lookup("sport", -1)
    dst_port = lookup("dsport", -1)

    return (f"Flow: src_bytes={src_bytes}, dst_bytes={dst_bytes}, "
            f"duration={float(duration):.2f}s, proto={protocol}, service={svc}, "
            f"state={conn_state}, sport={src_port}, dsport={dst_port}")

# One text summary per flow, in the same row order as the feature matrices.
train_summaries = unsw_train.apply(make_summary, axis=1).tolist()
test_summaries  = unsw_test.apply(make_summary, axis=1).tolist()

# ─────────────────────────────────────────────────────────────
# 8. Load Gemma model
# ─────────────────────────────────────────────────────────────
# Credentials are read from the HF_TOKEN environment variable rather than
# being written into the notebook. When the variable is unset, login()
# receives None and falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "google/gemma-3-270m"
# `token=True` reuses the credentials stored by login(); it replaces the
# deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(model_name, token=True, output_hidden_states=True).to(device)
# Inference only: switch off dropout and other training-time behaviour.
model.eval()

# ─────────────────────────────────────────────────────────────
# 9. Embedding dataset + improved pooling
# ─────────────────────────────────────────────────────────────
class SummaryDataset(Dataset):
    """Map-style dataset that serves the given summaries by integer index."""

    def __init__(self, summaries):
        # The sequence is stored by reference; it is neither copied nor validated.
        self.summaries = summaries

    def __len__(self):
        """Return how many summaries the dataset holds."""
        count = len(self.summaries)
        return count

    def __getitem__(self, idx):
        """Return the summary stored at position ``idx``."""
        item = self.summaries[idx]
        return item

def collate_fn(batch):
    """Tokenize a list of summary strings into one padded batch.

    Uses the module-level ``tokenizer``. Sequences are padded to the longest
    item in the batch and truncated to at most 128 tokens; the result is
    returned as PyTorch tensors.
    """
    encoded = tokenizer(
        batch,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    return encoded

def compute_embeddings(summaries, batch_size=32):
    """Embed each summary string with the module-level Gemma ``model``.

    The last hidden layer is averaged over the real (non-padding) tokens of
    every sequence. Position 0 alone is not used: in a causal language model
    that position can only attend to itself, and it holds the first token of
    the padded sequence (typically BOS or a pad token), so its hidden state
    carries little or no information about the summary.

    Args:
        summaries: Sequence of summary strings.
        batch_size: Number of summaries per forward pass.

    Returns:
        A 2-D NumPy array with one row per summary, in input order.

    Raises:
        ValueError: from ``np.vstack`` when ``summaries`` is empty.
    """
    ds = SummaryDataset(summaries)
    loader = DataLoader(ds, batch_size=batch_size, collate_fn=collate_fn)
    all_embs = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Embedding with Gemma"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            last_hidden = outputs.hidden_states[-1]
            # Mask-weighted mean: padded positions contribute nothing, and
            # the result does not depend on the tokenizer's padding side.
            mask = batch["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against 0-length rows
            pooled = (last_hidden * mask).sum(dim=1) / token_counts
            # .float() keeps the NumPy conversion valid for half-precision models.
            all_embs.append(pooled.float().cpu().numpy())
    return np.vstack(all_embs)

# ─────────────────────────────────────────────────────────────
# 10. Compute & normalize embeddings
# ─────────────────────────────────────────────────────────────
# Wall-clock time of the full embedding pass, reported per sample in ms.
t0 = time.perf_counter()
train_emb = compute_embeddings(train_summaries)
t1 = time.perf_counter()
print(f"Train embedding time/sample: {(t1 - t0)/len(train_summaries)*1000:.2f} ms")

t0 = time.perf_counter()
test_emb = compute_embeddings(test_summaries)
t1 = time.perf_counter()
print(f"Test embedding time/sample: {(t1 - t0)/len(test_summaries)*1000:.2f} ms")

# Row-wise normalization (axis=1): each embedding vector is rescaled on its
# own, using sklearn's default norm (L2).
train_emb = normalize(train_emb, axis=1)
test_emb  = normalize(test_emb, axis=1)

# ─────────────────────────────────────────────────────────────
# 11. Combine features
# ─────────────────────────────────────────────────────────────
# Column layout: the tabular numeric features first, then the embedding dims.
X_train = np.hstack([X_train_num, train_emb])
X_test  = np.hstack([X_test_num,  test_emb])

# ─────────────────────────────────────────────────────────────
# 12. Balance data
# ─────────────────────────────────────────────────────────────
# Only the training split is resampled; X_test / y_test are left untouched.
# X_train and y_train are rebound to the resampled arrays.
smote_enn = SMOTEENN(random_state=42)
X_train, y_train = smote_enn.fit_resample(X_train, y_train)
print("After SMOTE+ENN:", np.bincount(y_train))

# ─────────────────────────────────────────────────────────────
# 13. Train & Evaluate Random Forest
# ─────────────────────────────────────────────────────────────
# Fixed random_state for repeatable runs; n_jobs=-1 uses all available cores.
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=5,
    n_jobs=-1,
    random_state=42
)
clf.fit(X_train, y_train)
# Hard class predictions on the original (un-resampled) test split.
y_pred = clf.predict(X_test)

# ─────────────────────────────────────────────────────────────
# 14. Performance report
# ─────────────────────────────────────────────────────────────
# precision/recall/F1 are called with sklearn defaults (binary averaging,
# positive class = 1, i.e. "attack").
print("\n Performance Report (Gemma + RF Enhanced):")
print(f"Accuracy : {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall   : {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score : {f1_score(y_test, y_pred):.3f}")
# target_names are given in label order: 0 -> "normal", 1 -> "attack".
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["normal", "attack"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Args:
        clf: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Feature matrix to predict on.
        y_test: True 0/1 labels aligned with ``X_test``.

    Returns:
        A dict with the keys ``Accuracy``, ``Precision``, ``Recall``, ``F1``,
        ``ROC-AUC`` (``None`` when the estimator has no ``predict_proba``),
        ``FPR``, ``ConfusionMatrix`` (nested list, label order ``[0, 1]``)
        and ``Latency_cls_ms`` (mean ``predict`` time per sample, in ms).
    """
    import time

    # Time only the hard-label prediction call.
    tic = time.perf_counter()
    y_pred = clf.predict(X_test)
    toc = time.perf_counter()
    per_sample_ms = (toc - tic) / len(y_test) * 1000

    # Positive-class scores are needed for ROC-AUC and are optional.
    if hasattr(clf, "predict_proba"):
        y_proba = clf.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    # Fixing the label order keeps the matrix 2x2 even if a class is absent.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    negatives = fp + tn
    fpr = fp / negatives if negatives > 0 else 0.0

    results = {}
    results["Accuracy"] = accuracy_score(y_test, y_pred)
    results["Precision"] = precision_score(y_test, y_pred)
    results["Recall"] = recall_score(y_test, y_pred)
    results["F1"] = f1_score(y_test, y_pred)
    if y_proba is not None:
        results["ROC-AUC"] = roc_auc_score(y_test, y_proba)
    else:
        results["ROC-AUC"] = None
    results["FPR"] = fpr
    results["ConfusionMatrix"] = cm.tolist()
    results["Latency_cls_ms"] = per_sample_ms
    return results

In [None]:
import json
import os

# Evaluate the fitted classifier and persist the metrics as indented JSON.
results = evaluate_model(clf, X_test, y_test)

results_path = "/content/drive/MyDrive/Results/UNSW/Random Forest/Gemma_Results.txt"
# open(..., "w") does not create parent folders; make sure they exist so a
# fresh Drive does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w") as f:
    json.dump(results, f, indent=2)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

def save_confusion_matrix(cm, run_name, out_dir="/content/drive/MyDrive/Results/UNSW/Random Forest"):
    """Write a confusion matrix to disk as a CSV file and a PNG heatmap.

    Args:
        cm: 2-D NumPy array of counts (indexed as ``cm[true, predicted]``
            for the axis labels used below).
        run_name: Prefix for the output files ``<run_name>_cm.csv`` and
            ``<run_name>_cm.png``.
        out_dir: Destination directory; created if it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f"{run_name}_cm.csv")
    png_path = os.path.join(out_dir, f"{run_name}_cm.png")

    # Raw counts as comma-separated integers.
    np.savetxt(csv_path, cm, fmt="%d", delimiter=",")

    tick_positions = [0, 1]
    class_names = ["Normal", "Attack"]

    fig, ax = plt.subplots(figsize=(4, 4))
    heatmap = ax.imshow(cm, cmap="Blues")
    ax.set_title(f"Confusion Matrix - {run_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(class_names)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(class_names)

    # Annotate every cell with its count; x is the column, y is the row.
    for row_idx, col_idx in np.ndindex(cm.shape[0], cm.shape[1]):
        ax.text(col_idx, row_idx, cm[row_idx, col_idx], ha="center", va="center", color="red")

    fig.colorbar(heatmap, ax=ax)
    fig.tight_layout()
    fig.savefig(png_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [None]:
# Re-score the classifier, then store the confusion matrix (CSV + PNG) and
# the full metrics dictionary (JSON text) under the shared results folder.
results = evaluate_model(clf, X_test, y_test)

cm = np.array(results["ConfusionMatrix"])
save_confusion_matrix(cm, run_name="Gemma_Results")

with open("/content/drive/MyDrive/Results/UNSW/Random Forest/Gemma_Results.txt", "w") as f:
    f.write(json.dumps(results, indent=2))
