In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Mount Google Drive at /content/drive; the dataset and result paths used by
# later cells all live under /content/drive/MyDrive. Requires the google.colab
# package, i.e. a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ─────────────────────────────────────────────────────────────
# 1. Load Dataset
# ─────────────────────────────────────────────────────────────
#HF_TOKEN   = os.getenv("HF_TOKEN", "your token hugging face")
# Column names for the header-less KDDTrain+/KDDTest+ files: 41 flow features
# followed by the attack name and a difficulty value.
columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "attack_type", "difficulty",
]

# Both files are read without a header row, then given the shared column names.
train_df = pd.read_csv("/content/drive/MyDrive/Research Project/KDDTrain+.txt", header=None)
test_df = pd.read_csv("/content/drive/MyDrive/Research Project/KDDTest+.txt", header=None)
train_df.columns = test_df.columns = columns

# Binary target: 0 when attack_type is "normal", 1 for every other value.
train_df["label"] = train_df["attack_type"].ne("normal").astype(int)
test_df["label"] = test_df["attack_type"].ne("normal").astype(int)

In [None]:
# ─────────────────────────────────────────────────────────────
# 2. Extract Numeric
# ─────────────────────────────────────────────────────────────
# The three categorical fields, the raw attack name, the difficulty value and
# the derived label are excluded; every remaining column is a model input.
drop_cols = ["protocol_type", "service", "flag", "attack_type", "difficulty"]
feature_cols = [
    name for name in train_df.columns
    if name not in (*drop_cols, "label")
]

# Feature matrices and label vectors as NumPy arrays, one pair per split.
X_train_num, y_train = train_df[feature_cols].to_numpy(), train_df["label"].to_numpy()
X_test_num, y_test = test_df[feature_cols].to_numpy(), test_df["label"].to_numpy()

def make_summary(row):
    """Render one flow record as a short text sentence.

    Args:
        row: Record supporting key lookup (for example a DataFrame row passed
            as a ``pandas.Series``) with ``src_bytes``, ``dst_bytes``,
            ``duration``, ``count`` and ``flag`` entries.

    Returns:
        str: ``"Flow: src_bytes=..., dst_bytes=..., duration=...s, count=..., flag=..."``.

    Raises:
        KeyError: if one of the five fields is missing from ``row``.
    """
    # Key lookup instead of attribute access: on a Series, ``row.count`` is the
    # ``Series.count`` method, not the "count" column, so attribute access
    # would put a bound-method repr (including a dump of the whole row) into
    # the summary text.
    return (
        f"Flow: src_bytes={row['src_bytes']}, dst_bytes={row['dst_bytes']}, "
        f"duration={row['duration']}s, count={row['count']}, flag={row['flag']}"
    )

# One summary string per record, in row order, for the train split and then the test split.
train_summaries, test_summaries = (
    split_df.apply(make_summary, axis=1).tolist()
    for split_df in (train_df, test_df)
)

In [None]:
from huggingface_hub import login

# Keep the access token out of the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset or empty, None is passed
# and login() asks for the token interactively.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# ─────────────────────────────────────────────────────────────
# 3. Load Gemma Model
# ─────────────────────────────────────────────────────────────
# Use the GPU when one is visible to torch, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "google/gemma-3-270m"

# Define the token in this cell so it does not rely on a variable assigned
# elsewhere. With None, from_pretrained falls back to the credentials stored
# by a previous huggingface_hub login.
HF_TOKEN = os.getenv("HF_TOKEN") or None

# `token` is the current name of the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    output_hidden_states=True,  # forward passes then return `hidden_states`
).to(device)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

In [None]:
# ─────────────────────────────────────────────────────────────
# 4. Dataset + Embedding Function
# ─────────────────────────────────────────────────────────────
class SummaryDataset(Dataset):
    """Dataset that serves summary strings by integer index."""

    def __init__(self, summaries):
        # Holds a reference to the caller's sequence; nothing is copied or tokenized here.
        self.summaries = summaries

    def __len__(self):
        n_items = len(self.summaries)
        return n_items

    def __getitem__(self, idx):
        item = self.summaries[idx]
        return item

def collate_fn(batch):
    """Tokenize one batch of summary strings with the module-level tokenizer.

    Args:
        batch: The items collected by the DataLoader for one step.

    Returns:
        The tokenizer output as PyTorch tensors, with padding enabled and
        truncation at 128 tokens.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch, **tokenizer_options)

def compute_embeddings(summaries, batch_size=32):
    """Embed each summary as the mean of the model's last hidden-state layer.

    The mean is taken over real tokens only. Batches are padded to a common
    length, and averaging over the padded positions as well would make a
    summary's embedding depend on the other items in its batch.

    Args:
        summaries: Sequence of summary strings.
        batch_size: Number of summaries tokenized and run through the model per step.

    Returns:
        numpy.ndarray with one row per summary, in input order (the loader
        does not shuffle).

    Raises:
        ValueError: raised by ``np.vstack`` when ``summaries`` is empty.
    """
    ds = SummaryDataset(summaries)
    loader = DataLoader(ds, batch_size=batch_size, collate_fn=collate_fn)
    all_embs = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Embedding with Gemma"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            last_hidden = outputs.hidden_states[-1]

            attention_mask = batch.get("attention_mask")
            if attention_mask is None:
                # No mask available: fall back to the plain mean over dim 1.
                pooled = last_hidden.mean(dim=1)
            else:
                # Weight each position by its mask value (1 = token, 0 = padding).
                mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
                # clamp avoids a 0/0 for a row whose mask is entirely zero.
                token_counts = mask.sum(dim=1).clamp(min=1)
                pooled = (last_hidden * mask).sum(dim=1) / token_counts

            # .float() is a no-op for float32 and keeps reduced-precision
            # tensors convertible to NumPy.
            all_embs.append(pooled.float().cpu().numpy())
    return np.vstack(all_embs)

In [None]:
# Training embeddings timing
# The two perf_counter() readings bracket the whole compute_embeddings() call,
# so the figure covers everything that function does, not just the forward pass.
t0 = time.perf_counter()
train_emb = compute_embeddings(train_summaries)
t1 = time.perf_counter()
embed_time_train = (t1 - t0) / len(train_summaries) * 1000  # seconds -> mean ms per summary

# Test embeddings timing
# Same measurement for the test split; t0/t1 are reused.
t0 = time.perf_counter()
test_emb  = compute_embeddings(test_summaries)
t1 = time.perf_counter()
embed_time_test = (t1 - t0) / len(test_summaries) * 1000  # seconds -> mean ms per summary

# Report both per-sample averages with two decimals.
print(f"\nEmbedding time per sample (train): {embed_time_train:.2f} ms")
print(f"Embedding time per sample (test): {embed_time_test:.2f} ms")

In [None]:
# ─────────────────────────────────────────────────────────────
# 5. Generate Embeddings
# ─────────────────────────────────────────────────────────────
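# NOTE: train_emb and test_emb are already produced by the embedding-timing
# cell above. The untimed calls below are kept disabled so the embeddings are
# not computed a second time.
# print("Generating Gemma embeddings for training set...")
# train_emb = compute_embeddings(train_summaries)

# print("Generating Gemma embeddings for test set...")
# test_emb  = compute_embeddings(test_summaries)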

In [None]:
# ─────────────────────────────────────────────────────────────
# 6. Combine Embeddings with Numeric Features
# ─────────────────────────────────────────────────────────────
# Place the embedding columns to the right of the numeric feature columns;
# rows stay aligned because both were derived from the same frame in order.
X_train = np.concatenate((X_train_num, train_emb), axis=1)
X_test = np.concatenate((X_test_num, test_emb), axis=1)

In [None]:
# ─────────────────────────────────────────────────────────────
# 7. Train & Evaluate Classifier
# ─────────────────────────────────────────────────────────────
# Fit the forest on the combined features; random_state is fixed at 42.
clf = RandomForestClassifier(
    n_estimators=200, max_depth=None, min_samples_leaf=5, n_jobs=-1, random_state=42,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Headline metrics, one per line, followed by the per-class report and the
# confusion matrix.
print("\n Performance Report (Gemma + Random Forest):")
print(
    f"Accuracy : {accuracy_score(y_test, y_pred):.3f}",
    f"Precision: {precision_score(y_test, y_pred):.3f}",
    f"Recall   : {recall_score(y_test, y_pred):.3f}",
    f"F1 Score : {f1_score(y_test, y_pred):.3f}",
    sep="\n",
)
per_class_report = classification_report(y_test, y_pred, target_names=["normal", "attack"])
print("\nClassification Report:\n", per_class_report)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Args:
        clf: Fitted estimator exposing ``predict`` and, optionally, ``predict_proba``.
        X_test: Feature matrix handed to the estimator.
        y_test: True labels; 0 and 1 are the two classes counted in the confusion matrix.

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall", "F1", "ROC-AUC"
        (None when the estimator has no ``predict_proba``), "FPR",
        "ConfusionMatrix" (nested list) and "Latency_cls_ms" (mean
        ``predict`` wall time per sample, in milliseconds).
    """
    import time

    # Only the hard-label predict() call is timed.
    tic = time.perf_counter()
    y_pred = clf.predict(X_test)
    toc = time.perf_counter()
    cls_latency = (toc - tic) / len(y_test) * 1000

    # Column 1 of predict_proba is used as the score for ROC-AUC.
    if hasattr(clf, "predict_proba"):
        y_proba = clf.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    # Confusion matrix + FPR
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    if (fp + tn) > 0:
        fpr = fp / (fp + tn)
    else:
        # No true negatives or false positives at all: report 0.0 instead of dividing by zero.
        fpr = 0.0

    # Filled in a fixed order so the serialized output keeps a stable key order.
    results = {}
    results["Accuracy"] = accuracy_score(y_test, y_pred)
    results["Precision"] = precision_score(y_test, y_pred)
    results["Recall"] = recall_score(y_test, y_pred)
    results["F1"] = f1_score(y_test, y_pred)
    results["ROC-AUC"] = None if y_proba is None else roc_auc_score(y_test, y_proba)
    results["FPR"] = fpr
    results["ConfusionMatrix"] = cm.tolist()
    results["Latency_cls_ms"] = cls_latency
    return results

In [None]:
import json

results = evaluate_model(clf, X_test, y_test)

results_path = "/content/drive/MyDrive/Results/Random Forest/Gemma_Results.txt"
# open(..., "w") does not create missing parent folders, and this cell runs
# before save_confusion_matrix() (the only other place that creates the
# folder), so make sure it exists here.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w") as f:
    json.dump(results, f, indent=2)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

def save_confusion_matrix(cm, run_name, out_dir="/content/drive/MyDrive/Results/Random Forest"):
    """Write a confusion matrix to ``out_dir`` as a CSV file and a PNG heatmap.

    Args:
        cm: 2-D array of counts; rows are drawn on the "True" axis and columns
            on the "Predicted" axis.
        run_name: Prefix of the output files (``<run_name>_cm.csv`` and
            ``<run_name>_cm.png``); also shown in the figure title.
        out_dir: Destination folder, created when it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f"{run_name}_cm.csv")
    png_path = os.path.join(out_dir, f"{run_name}_cm.png")

    # Raw counts as integer CSV.
    np.savetxt(csv_path, cm, fmt="%d", delimiter=",")

    fig, ax = plt.subplots(figsize=(4, 4))
    heatmap = ax.imshow(cm, cmap="Blues")
    ax.set_title(f"Confusion Matrix - {run_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")

    # Tick labels are fixed for the two-class case.
    ax.set_xticks([0, 1])
    ax.set_xticklabels(["Normal", "Attack"])
    ax.set_yticks([0, 1])
    ax.set_yticklabels(["Normal", "Attack"])

    # Write each count into its cell; text x is the column index, y the row index.
    for row_idx, col_idx in np.ndindex(cm.shape[0], cm.shape[1]):
        ax.text(col_idx, row_idx, cm[row_idx, col_idx], ha="center", va="center", color="red")

    fig.colorbar(heatmap, ax=ax)
    fig.tight_layout()
    fig.savefig(png_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [None]:
# Score the classifier, then persist the confusion matrix (CSV + PNG) and the
# full metrics dictionary (JSON text).
results = evaluate_model(clf, X_test, y_test)

cm = np.asarray(results["ConfusionMatrix"])
save_confusion_matrix(cm, run_name="Gemma_Results")

with open("/content/drive/MyDrive/Results/Random Forest/Gemma_Results.txt", mode="w") as f:
    json.dump(results, fp=f, indent=2)
