In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
import torch
import time
from transformers import LlamaTokenizer, LlamaModel
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Mount Google Drive at /content/drive; the dataset and results paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 1. Configuration
# ──────────────────────────────────────────────────────────────────────────────
# Read the Hugging Face access token from the environment only. A missing or
# empty variable becomes None instead of a placeholder string, so a bogus
# credential is never sent to the Hub (None is expected to make the loaders
# fall back to locally cached credentials — see the from_pretrained docs).
HF_TOKEN   = os.getenv("HF_TOKEN") or None
if HF_TOKEN is None:
    print("HF_TOKEN is not set; relying on cached Hugging Face credentials, if any.")
SEED       = 42

# Apply the seed to the global NumPy and PyTorch generators so that any
# stochastic step relying on them is repeatable across runs.
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [None]:
# Read the training split first, then the testing split, from Google Drive.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Research Project/UNSW_NB15_training-set.csv",
        "/content/drive/MyDrive/Research Project/UNSW_NB15_testing-set.csv",
    )
)

In [None]:
print("UNSW-NB15 columns:", list(train_df.columns)[:10], "... (total:", len(train_df.columns), ")")

# ──────────────────────────────────────────────────────────────────────────────
# B. Ensure binary label
# ──────────────────────────────────────────────────────────────────────────────
# The presence check is made on the training frame only; the testing frame is
# then handled the same way.
if "label" in train_df.columns:
    # A label column already exists: collapse every non-zero value to 1.
    train_df["label"] = train_df["label"].ne(0).astype(int)
    test_df["label"]  = test_df["label"].ne(0).astype(int)
else:
    def _binlab(df):
        """Derive a 0/1 label from 'attack_cat': 0 for "normal" (case-insensitive), 1 otherwise."""
        if "attack_cat" not in df.columns:
            raise ValueError("UNSW-NB15 needs either 'label' or 'attack_cat' to build a binary label.")
        categories = df["attack_cat"].astype(str).str.lower()
        return categories.ne("normal").astype(int)

    train_df["label"] = _binlab(train_df)
    test_df["label"]  = _binlab(test_df)

In [None]:
# 4) Select numeric features only
categorical_cols_unsw = [c for c in ["proto","service","state","attack_cat","srcip","dstip"]
                         if c in train_df.columns]
# "id" is numeric, but it is assumed to be a per-row record counter rather than
# a traffic measurement (NOTE(review): confirm against the CSV header). Keeping
# it would let the classifier key on row order, so it is excluded when present.
identifier_cols_unsw = [c for c in ["id"] if c in train_df.columns]
drop_cols_unsw = categorical_cols_unsw + identifier_cols_unsw + ["label"]

# Keep every remaining column whose dtype in the training frame is numeric.
num_cols_unsw = [c for c in train_df.columns
                 if c not in drop_cols_unsw and pd.api.types.is_numeric_dtype(train_df[c])]

# The same column list is applied to both splits so the feature order matches.
X_train_num = train_df[num_cols_unsw].values
X_test_num  = test_df[num_cols_unsw].values
y_train     = train_df["label"].values
y_test      = test_df["label"].values


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# D. Flow summaries for UNSW
# ──────────────────────────────────────────────────────────────────────────────
def make_summary(row):
    """Render one flow record as a single-line text summary.

    Args:
        row: any mapping-like object exposing ``get(key, default)`` (for
            example a pandas Series or a dict).

    Returns:
        A string of the form ``"Flow: src_bytes=..., dst_bytes=..., ..."``.
        Missing fields fall back to 0 (byte counts), 0.0 (duration), "NA"
        (proto/service/state) and -1 (ports).
    """
    g = row.get
    # Byte counters accept two spellings; the second is consulted as a fallback.
    srcbytes = g("sbytes", g("srcbytes", 0))
    dstbytes = g("dbytes", g("dstbytes", 0))
    dur      = g("dur", 0.0)
    proto    = g("proto", "NA")
    service  = g("service", "NA")
    state    = g("state", "NA")
    # Single lookup per port: the previous nested fallback queried the same key twice.
    sport    = g("sport", -1)
    dsport   = g("dsport", -1)

    return (f"Flow: src_bytes={srcbytes}, dst_bytes={dstbytes}, "
            f"duration={float(dur):.2f}s, proto={proto}, service={service}, "
            f"state={state}, sport={sport}, dsport={dsport}")

# Build one summary string per row, training split first and testing split second.
train_summaries, test_summaries = (
    frame.apply(make_summary, axis=1).tolist()
    for frame in (train_df, test_df)
)

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 3. Load LLaMA-2 & Set Pad Token
# ──────────────────────────────────────────────────────────────────────────────
# Load the tokenizer, authenticating with HF_TOKEN.
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", use_auth_token=HF_TOKEN)
# Reuse EOS as the padding token so batched calls with padding=True have a pad token.
tokenizer.pad_token = tokenizer.eos_token

# Load the base model in half precision and request hidden states in the output.
model = LlamaModel.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    output_hidden_states=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    use_auth_token=HF_TOKEN,
)
model = model.to(device)
# Switch to inference mode; as the last expression, this also displays the model.
model.eval()

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 4. Embedding Function
# ──────────────────────────────────────────────────────────────────────────────
class SummaryDataset(Dataset):
    """Map-style dataset that serves summary items by position."""

    def __init__(self, summaries):
        # Keep a reference to the caller's sequence; nothing is copied or tokenized here.
        self.summaries = summaries

    def __len__(self):
        """Return how many summaries are available."""
        return len(self.summaries)

    def __getitem__(self, idx):
        """Return the item stored at position ``idx``, unmodified."""
        return self.summaries[idx]

def collate_fn(batch):
    """Tokenize a batch of summaries with the module-level ``tokenizer``.

    Sequences are padded to a common length and truncated to at most 128
    tokens; the encoding is returned as PyTorch tensors.
    """
    encode_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch, **encode_options)

def compute_embeddings(summaries, batch_size=32):
    """Embed each summary as the mean of its final-layer token states.

    Padding positions are excluded from the mean via the tokenizer's attention
    mask, so an embedding no longer depends on how much padding the rest of
    its batch required.

    Args:
        summaries: sequence of summary strings to embed.
        batch_size: number of summaries per forward pass (defaults to 32, the
            value that was previously hard-coded).

    Returns:
        A 2-D numpy array with one row per summary, in input order, in the
        same dtype as the model's hidden states.
    """
    ds     = SummaryDataset(summaries)
    loader = DataLoader(ds, batch_size=batch_size, collate_fn=collate_fn)
    all_embs = []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            last_hidden = outputs.hidden_states[-1]

            # 1 for real tokens, 0 for padding; treat every position as real
            # if the tokenizer did not return a mask.
            attention = batch.get("attention_mask")
            if attention is None:
                attention = torch.ones(last_hidden.shape[:2], device=last_hidden.device)
            mask = attention.unsqueeze(-1).to(torch.float32)

            # Reduce in float32 so summing many half-precision values does not
            # lose precision, then divide by the real-token count per row.
            summed = (last_hidden.to(torch.float32) * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1.0)
            embs = (summed / counts).to(last_hidden.dtype).cpu().numpy()
            all_embs.append(embs)
    return np.vstack(all_embs)

# Embed the training split and record the average wall-clock cost per summary (ms).
print("################ Training Embedding ################")
t0 = time.perf_counter()
train_emb = compute_embeddings(train_summaries)
t1 = time.perf_counter()
train_elapsed_s = t1 - t0
embed_time_train = train_elapsed_s / len(train_summaries) * 1000

# Same measurement for the testing split.
print("################ Testing Embedding ################")
t0 = time.perf_counter()
test_emb  = compute_embeddings(test_summaries)
t1 = time.perf_counter()
test_elapsed_s = t1 - t0
embed_time_test = test_elapsed_s / len(test_summaries) * 1000


In [None]:
# Report how many rows each split contains.
print("Train size:", len(train_df))
print("Test size :", len(test_df))

In [None]:
# Average embedding cost per summary, in milliseconds, as measured by the timing cell.
print(f"Embedding time per sample (train): {embed_time_train:.2f} ms")
print(f"Embedding time per sample (test): {embed_time_test:.2f} ms")

In [None]:
# 9) Augment numeric features with embeddings
# Column-wise join: numeric columns first, embedding dimensions after them.
X_train = np.concatenate((X_train_num, train_emb), axis=1)
X_test  = np.concatenate((X_test_num, test_emb), axis=1)

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 5. Train & Evaluate Random Forest
# ──────────────────────────────────────────────────────────────────────────────
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=5,
    n_jobs=-1,
    # Use the notebook-wide SEED constant instead of a second hard-coded 42,
    # so changing the seed in the configuration cell takes effect here too.
    random_state=SEED
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Headline metrics on the held-out split; the positive class is label 1.
print("\nPerformance on Test Set:")
print(f"Accuracy : {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall   : {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score : {f1_score(y_test, y_pred):.3f}")

In [None]:
# Per-class precision/recall/F1; target_names are given in label order (0 -> "normal", 1 -> "attack").
print("Classification Report:\n",
      classification_report(y_test, y_pred, target_names=["normal","attack"]))

In [None]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Args:
        clf: fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_test: feature matrix handed to the estimator.
        y_test: ground-truth labels; the confusion matrix is built for the
            label values 0 and 1.

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall", "F1", "ROC-AUC"
        (None when the estimator has no ``predict_proba``), "FPR",
        "ConfusionMatrix" (nested list) and "Latency_cls_ms" (average
        ``predict`` time per sample, in milliseconds).
    """
    import time

    # Only the hard-label prediction pass is timed.
    tic = time.perf_counter()
    y_pred = clf.predict(X_test)
    toc = time.perf_counter()
    cls_latency = (toc - tic) / len(y_test) * 1000

    # Second column of predict_proba is used as the score for ROC-AUC.
    if hasattr(clf, "predict_proba"):
        y_proba = clf.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    actual_negatives = fp + tn
    # Report 0.0 rather than dividing by zero when there are no negatives.
    fpr = fp / actual_negatives if actual_negatives > 0 else 0.0

    results = {}
    results["Accuracy"] = accuracy_score(y_test, y_pred)
    results["Precision"] = precision_score(y_test, y_pred)
    results["Recall"] = recall_score(y_test, y_pred)
    results["F1"] = f1_score(y_test, y_pred)
    results["ROC-AUC"] = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    results["FPR"] = fpr
    results["ConfusionMatrix"] = cm.tolist()
    results["Latency_cls_ms"] = cls_latency
    return results


In [None]:
import json

results = evaluate_model(clf, X_test, y_test)

results_path = "/content/drive/MyDrive/Results/UNSW/Random Forest/LLama_Results.txt"
# Create the results directory first: nothing earlier in the notebook does,
# and open() would otherwise fail with FileNotFoundError.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w") as f:
    json.dump(results, f, indent=2)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os

def save_confusion_matrix(cm, run_name, out_dir="/content/drive/MyDrive/Results/UNSW/Random Forest"):
    """Persist a confusion matrix as ``<run_name>_cm.csv`` and ``<run_name>_cm.png``.

    Args:
        cm: 2-D numpy array of counts (rows: true class, columns: predicted class).
        run_name: prefix for both file names; also shown in the plot title.
        out_dir: destination directory, created if it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f"{run_name}_cm.csv")
    png_path = os.path.join(out_dir, f"{run_name}_cm.png")

    # Raw counts as integer CSV.
    np.savetxt(csv_path, cm, fmt="%d", delimiter=",")

    # Heat-map rendering with the two classes labelled on both axes.
    fig, ax = plt.subplots(figsize=(4, 4))
    heatmap = ax.imshow(cm, cmap="Blues")
    ax.set_title(f"Confusion Matrix - {run_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    class_names = ["Normal", "Attack"]
    ax.set_xticks([0, 1])
    ax.set_xticklabels(class_names)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(class_names)

    # Write each count at the centre of its cell (x is the column, y is the row).
    for (row, col), count in np.ndenumerate(cm):
        ax.text(col, row, count, ha="center", va="center", color="red")

    fig.colorbar(heatmap, ax=ax)
    fig.tight_layout()
    fig.savefig(png_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [None]:
# Re-evaluate so this cell also works when run on its own, then persist the
# confusion matrix and the metrics.
results = evaluate_model(clf, X_test, y_test)

cm = np.array(results["ConfusionMatrix"])
# The classifier evaluated here is the RandomForest, and the artifacts go to
# the "Random Forest" results directory, so the run is labelled "RF" rather
# than the previous "DT".
save_confusion_matrix(cm, run_name="LLama_RF")

# save_confusion_matrix has already created the results directory at this point.
with open("/content/drive/MyDrive/Results/UNSW/Random Forest/LLama_Results.txt", "w") as f:
    json.dump(results, f, indent=2)
