
# ==========================================
# 1. Imports & Configuration
# ==========================================

In [20]:
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory,
# i.e. this assumes the kernel was started one level below the root
# (NOTE(review): confirm the notebook is always launched from such a folder).
PROJECT_ROOT = Path.cwd().resolve().parent

# Put the root at the front of the module search path exactly once, so that
# re-running this cell does not keep adding duplicate entries.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

import json
import joblib
import time

import numpy as np
import pandas as pd

from utils_transformers import safe_log1p



# ==========================================
# 2. Paths & Artifacts
# ==========================================

In [21]:


# All locations are derived from PROJECT_ROOT so nothing is hard-coded.
RESULTS_DIR   = PROJECT_ROOT.joinpath("results")
MODELS_DIR    = RESULTS_DIR.joinpath("models")
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Echo the resolved locations so a wrong working directory is obvious at a glance.
print("[✓] Project Root:", PROJECT_ROOT)
print("[✓] Processed:", PROCESSED_DIR)
print("[✓] Models:", MODELS_DIR)




[✓] Project Root: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection
[✓] Processed: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\data\processed
[✓] Models: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\models



# ==========================================
# 3. Load Models & Metadata
# ==========================================

In [22]:
print("\n[1] Loading trained models and metadata...")

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only load model artifacts that were produced by a trusted training run.
binary_model = joblib.load(MODELS_DIR / "binary_model.pkl")

# Decode the metadata as UTF-8 explicitly: without `encoding`, open() uses the
# platform's locale encoding, which is not UTF-8 on every system.
with open(MODELS_DIR / "model_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# The threshold is compared against probabilities further down, so fail fast
# if the stored value is not a number inside [0, 1] (this also rejects NaN).
BINARY_THRESHOLD = float(metadata["binary_threshold"])
if not 0.0 <= BINARY_THRESHOLD <= 1.0:
    raise ValueError(
        f"binary_threshold must be a probability in [0, 1], got {BINARY_THRESHOLD!r}"
    )

print(f"[✓] Binary threshold loaded: {BINARY_THRESHOLD:.4f}")



[1] Loading trained models and metadata...
[✓] Binary threshold loaded: 0.0010


# ==========================================
# 4. Load Data for Inference (Test Set)
# ==========================================

In [23]:
print("\n[2] Loading processed test data...")

# Columns removed before the frame is handed to the model
# (presumably targets / bookkeeping fields rather than features — verify
# against the training notebook).
DROP_COLS = [
    "label",
    "attack_class",
    "binary_target",
    "attack_category",
    "level",
    "id",
    "difficulty",
]

test_df = pd.read_csv(PROCESSED_DIR / "test_cleaned.csv")

# errors="ignore": names from DROP_COLS that are absent in the CSV are skipped
# instead of raising.
X_test = test_df.drop(DROP_COLS, axis="columns", errors="ignore")

print(f"[✓] Test samples loaded: {len(X_test)}")



[2] Loading processed test data...
[✓] Test samples loaded: 22544


# ==========================================
# 5. Probability Inference
# ==========================================

In [24]:
def predict_proba(pipe, X_raw):
    """Return the second-column (index 1) class probability for each row.

    Parameters
    ----------
    pipe
        Object exposing ``named_steps`` with a ``"prep"`` entry (must provide
        ``transform``) and a ``"model"`` entry (must provide ``predict_proba``).
    X_raw
        Raw feature rows, passed unchanged to the ``"prep"`` step.

    Returns
    -------
    Column 1 of the model's ``predict_proba`` output.

    Notes
    -----
    When the model has a non-``None`` ``best_iteration`` attribute, the
    prediction is restricted with ``iteration_range=(0, best_iteration + 1)``;
    otherwise ``predict_proba`` is called without extra arguments.
    """
    steps = pipe.named_steps
    preprocessor, estimator = steps["prep"], steps["model"]

    features = preprocessor.transform(X_raw)

    # Only pass iteration_range when the model actually recorded a best iteration.
    predict_kwargs = {}
    stop_at = getattr(estimator, "best_iteration", None)
    if stop_at is not None:
        predict_kwargs["iteration_range"] = (0, stop_at + 1)

    return estimator.predict_proba(features, **predict_kwargs)[:, 1]


print("\n[3] Running inference...")

# One probability per test row, produced by the loaded binary pipeline.
attack_probs = predict_proba(pipe=binary_model, X_raw=X_test)



[3] Running inference...


# ==========================================
# 6. Alert Decision Logic
# ==========================================

In [25]:
# Add simulated timestamps (for SOC realism): one second per sample,
# counting up from the moment this cell runs.
base_time = int(time.time())

alerts_df = (
    pd.DataFrame({
        "sample_id": np.arange(len(X_test)),
        "attack_probability": attack_probs,
    })
    .assign(
        # A sample is an ALERT when its probability reaches the threshold.
        decision_label=lambda df: (
            df["attack_probability"] >= BINARY_THRESHOLD
        ).map({False: "NORMAL", True: "ALERT"}),
        # Explicit int64 so the column dtype does not depend on the platform.
        timestamp=lambda df: base_time + np.arange(len(df), dtype=np.int64),
    )
)[["timestamp", "sample_id", "attack_probability", "decision_label"]]

print(alerts_df.head())


    timestamp  sample_id  attack_probability decision_label
0  1766758368          0            0.999995          ALERT
1  1766758369          1            0.999995          ALERT
2  1766758370          2            0.001147          ALERT
3  1766758371          3            0.998581          ALERT
4  1766758372          4            0.127244          ALERT


# ==========================================
# 7. Alert Statistics
# ==========================================

In [26]:
print("\n[4] Alert statistics:")

# Share of rows whose label is "ALERT" (mean of a boolean mask).
alert_rate = alerts_df["decision_label"].eq("ALERT").mean()

print(f"    Total samples: {len(alerts_df)}")
print(f"    ALERT rate:    {alert_rate:.2%}")
print(f"    Threshold:     {BINARY_THRESHOLD:.4f}")



[4] Alert statistics:
    Total samples: 22544
    ALERT rate:    61.09%
    Threshold:     0.0010


# ==========================================
# 8. Export Alerts
# ==========================================

In [27]:
# Write the alert table next to the other results, without the row index.
alerts_path = RESULTS_DIR.joinpath("alerts_simulation.csv")
alerts_df.to_csv(alerts_path, index=False)

print(f"\n[✓] Alerts exported to: {alerts_path}")



[✓] Alerts exported to: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\alerts_simulation.csv
