In [1]:
# ============================================================
# TON-IoT Network Intrusion Detection
#     Model: Random Forest Baseline (Numeric + Encoded Features)
# ============================================================

# ─────────────────────────────────────────────────────────────
# 0. Imports
# ─────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix,
    accuracy_score, precision_score, recall_score, f1_score
)
from sklearn.model_selection import train_test_split
from google.colab import drive
from tqdm import tqdm

# ─────────────────────────────────────────────────────────────
# 1. Mount Google Drive
# ─────────────────────────────────────────────────────────────
# Colab only: make Google Drive available under /content/drive so the CSV
# below can be read like a local file.
drive.mount('/content/drive')

# ─────────────────────────────────────────────────────────────
# 2. Load TON-IoT dataset
# ─────────────────────────────────────────────────────────────
data_path = "/content/drive/MyDrive/Research Project/train_test_network.csv"
df = pd.read_csv(data_path)

# Normalise the header row (trim, lower-case, spaces -> underscores) so the
# column lookups further down do not depend on the CSV's exact header spelling.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)

print(f"Dataset shape: {df.shape}")
print("Columns:", df.columns[:10].tolist(), "...")

# ─────────────────────────────────────────────────────────────
# 3. Label conversion (binary: normal=0, attack=1)
# ─────────────────────────────────────────────────────────────
# Collapse the target to binary: 0 = normal traffic, 1 = attack.
available_columns = set(df.columns)
if "label" not in available_columns and "type" not in available_columns:
    raise ValueError("Expected 'label' or 'type' column in dataset")

if "label" in available_columns:
    # An existing label column wins: every non-zero value counts as an attack.
    is_attack = df["label"].ne(0)
else:
    # Otherwise derive it from the attack-type name (case-insensitive):
    # anything other than "normal" is an attack.
    is_attack = df["type"].astype(str).str.lower().ne("normal")

df["label"] = is_attack.astype(int)

print("Label distribution:\n", df["label"].value_counts())

# ─────────────────────────────────────────────────────────────
# 4. Stratified 80/20 train-test split
# ─────────────────────────────────────────────────────────────
# Stratify on the binary label so both partitions keep the dataset's
# normal/attack ratio; the fixed random_state makes the partition repeatable.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)
# The partitions are row selections taken from `df`, and the sections that
# follow add and overwrite columns on them. Take explicit copies so each
# partition is an independent frame and those assignments do not trigger
# pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()
print(f"Train: {len(train_df)} | Test: {len(test_df)}")

# ─────────────────────────────────────────────────────────────
# 5. Basic feature engineering
# ─────────────────────────────────────────────────────────────
def _add_flow_features(frame):
    """Add ratio and log-scaled columns to ``frame`` in place.

    Each group of features is only created when the columns it is computed
    from are present. Columns are added in a fixed order (bytes_ratio,
    log_src_bytes, log_dst_bytes, log_dur, pkt_ratio) because the feature
    matrix built later follows the frame's column order.
    """
    present = set(frame.columns)

    if {"src_bytes", "dst_bytes"} <= present:
        src_bytes, dst_bytes = frame["src_bytes"], frame["dst_bytes"]
        # +1 in the denominator keeps the ratio finite when dst_bytes is 0.
        frame["bytes_ratio"] = src_bytes / (dst_bytes + 1)
        frame["log_src_bytes"] = np.log1p(src_bytes)
        frame["log_dst_bytes"] = np.log1p(dst_bytes)

    if "duration" in present:
        frame["log_dur"] = np.log1p(frame["duration"])

    if {"src_pkts", "dst_pkts"} <= present:
        # Same +1 guard as for the byte ratio.
        frame["pkt_ratio"] = frame["src_pkts"] / (frame["dst_pkts"] + 1)


# Apply identical, row-wise feature construction to both partitions.
for partition in (train_df, test_df):
    _add_flow_features(partition)

# ─────────────────────────────────────────────────────────────
# 6. Encode categorical features
# ─────────────────────────────────────────────────────────────
# Integer-encode the string-valued columns that are present in the data.
categorical_cols = [c for c in ["proto", "service"] if c in train_df.columns]
for col in categorical_cols:
    # Learn the category -> code mapping from the training partition only, so
    # nothing about the held-out rows influences preprocessing.
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # Translate the test partition through the training vocabulary. A category
    # that never occurred in training gets the sentinel code -1 instead of
    # making LabelEncoder.transform raise on an unseen label.
    code_of = {category: code for code, category in enumerate(le.classes_)}
    test_df[col] = test_df[col].astype(str).map(code_of).fillna(-1).astype(int)

# ─────────────────────────────────────────────────────────────
# 7. Select numeric features
# ─────────────────────────────────────────────────────────────
# Columns kept out of the feature matrix: the target itself, the attack-type
# column the target can be derived from, and the address / date-time columns.
drop_cols = ["label", "type", "src_ip", "dst_ip", "date_time"]

# Keep every remaining column whose dtype pandas reports as numeric,
# preserving the frame's column order.
excluded = set(drop_cols)
num_cols = []
for name in train_df.columns:
    if name in excluded:
        continue
    if pd.api.types.is_numeric_dtype(train_df[name]):
        num_cols.append(name)

print(f"Using {len(num_cols)} numeric features")

# Plain NumPy arrays for scikit-learn: features (X) and binary target (y).
X_train, y_train = train_df[num_cols].to_numpy(), train_df["label"].to_numpy()
X_test, y_test = test_df[num_cols].to_numpy(), test_df["label"].to_numpy()

# ─────────────────────────────────────────────────────────────
# 8. Normalize features
# ─────────────────────────────────────────────────────────────
# Fit the scaler on the training matrix only, then apply that same fitted
# transform to both partitions so the test set is scaled with training statistics.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# ─────────────────────────────────────────────────────────────
# 9. Train Random Forest baseline
# ─────────────────────────────────────────────────────────────
print("\nTraining Random Forestes baseline model...")

# Hyper-parameters of the baseline forest, gathered in one place.
rf_params = dict(
    n_estimators=200,      # size of the ensemble
    max_depth=None,        # no depth cap on individual trees
    min_samples_leaf=5,    # minimum leaf size, to limit overfitting
    n_jobs=-1,             # use all available cores
    random_state=42,       # fixed seed for repeatable results
)
clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# ─────────────────────────────────────────────────────────────
# 10. Evaluate
# ─────────────────────────────────────────────────────────────
# Predict on the held-out partition and report the headline metrics.
y_pred = clf.predict(X_test)

print("\n Baseline Random Forest Performance (TON-IoT):")
# Captions are padded to a common width so the printed colons line up.
headline_metrics = (
    ("Accuracy ", accuracy_score),
    ("Precision", precision_score),
    ("Recall   ", recall_score),
    ("F1 Score ", f1_score),
)
for caption, metric in headline_metrics:
    print(f"{caption}: {metric(y_test, y_pred):.3f}")

# Per-class breakdown followed by the raw confusion matrix.
report = classification_report(y_test, y_pred, target_names=["normal", "attack"])
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Mounted at /content/drive
Dataset shape: (211043, 44)
Columns: ['src_ip', 'src_port', 'dst_ip', 'dst_port', 'proto', 'service', 'duration', 'src_bytes', 'dst_bytes', 'conn_state'] ...
Label distribution:
 label
1    161043
0     50000
Name: count, dtype: int64
Train: 168834 | Test: 42209
Using 23 numeric features

Training Random Forestes baseline model...

 Baseline Random Forest Performance (TON-IoT):
Accuracy : 0.998
Precision: 0.999
Recall   : 0.999
F1 Score : 0.999

Classification Report:
               precision    recall  f1-score   support

      normal       1.00      1.00      1.00     10000
      attack       1.00      1.00      1.00     32209

    accuracy                           1.00     42209
   macro avg       1.00      1.00      1.00     42209
weighted avg       1.00      1.00      1.00     42209

Confusion Matrix:
 [[ 9958    42]
 [   37 32172]]


In [2]:
from sklearn.metrics import roc_auc_score

def evaluate_model(clf, X_test, y_test):
    """Score a fitted binary classifier on a held-out set.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``predict``; ``predict_proba`` and
        ``classes_`` are used when available.
    X_test
        Feature matrix, passed unchanged to ``clf.predict`` /
        ``clf.predict_proba``.
    y_test
        True labels with 0 = negative and 1 = positive, one per row of
        ``X_test``.

    Returns
    -------
    dict
        ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1"``: the
        corresponding scikit-learn scores.
        ``"ROC-AUC"``: area under the ROC curve, or ``None`` when it cannot be
        computed (no ``predict_proba``, no positive class known to the model,
        or ``y_test`` contains a single class).
        ``"FPR"``: ``fp / (fp + tn)``, or 0.0 when there are no negatives.
        ``"ConfusionMatrix"``: nested list ``[[tn, fp], [fn, tp]]``.
        ``"Latency_cls_ms"``: wall-clock time of the ``predict`` call divided
        by the number of samples, in milliseconds.
    """
    import time

    # Time only the predict call; everything else is bookkeeping.
    start = time.perf_counter()
    y_pred = clf.predict(X_test)
    end = time.perf_counter()
    cls_latency = (end - start) / len(y_test) * 1000  # ms per sample

    # Positive-class scores for ROC-AUC. Locate the column through classes_
    # rather than assuming it is column 1; estimators without classes_ fall
    # back to the conventional [0, 1] ordering.
    y_proba = None
    if hasattr(clf, "predict_proba"):
        proba = clf.predict_proba(X_test)
        classes = list(getattr(clf, "classes_", (0, 1)))
        if 1 in classes:
            y_proba = proba[:, classes.index(1)]

    # Confusion matrix + FPR. labels=[0, 1] pins the layout to
    # [[tn, fp], [fn, tp]] even if one class is missing from the data.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    fpr = float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0

    # ROC-AUC needs both classes in y_test; report None instead of letting a
    # degenerate test set abort the whole evaluation.
    roc_auc = None
    if y_proba is not None and len(np.unique(y_test)) > 1:
        roc_auc = roc_auc_score(y_test, y_proba)

    results = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc,
        "FPR": fpr,
        "ConfusionMatrix": cm.tolist(),
        "Latency_cls_ms": cls_latency
    }
    return results

In [3]:
# ──────────────────────────────────────────────────────────────────────────────
# Save results to file as JSON (consistent with the NSL-KDD structure)
# ──────────────────────────────────────────────────────────────────────────────
import json
from pathlib import Path

results = evaluate_model(clf, X_test, y_test)

# NOTE(review): the file name says "ExtraTree" although `clf` is a
# RandomForestClassifier; the path is kept unchanged so existing result
# locations stay valid — confirm the intended name.
results_path = Path("/content/drive/MyDrive/Results/TON-IoT/Random Forest/Baseline_ExtraTree_Results.txt")

# Create the results folder on first use; without this, open() raises
# FileNotFoundError and the computed metrics are not written.
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, "w") as f:
    json.dump(results, f, indent=2)