# ==========================================
# 1. Imports & Configuration
# ==========================================

In [1]:

from __future__ import annotations

import os
import pickle
import time
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import shutil


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Attempt to import XGBoost (Optional high-performance model)
try:
    import xgboost as xgb
    from xgboost import XGBClassifier
except ImportError:
    print("[!] XGBoost not installed. Using Random Forest only.")
    XGBClassifier = None

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc
)


# --- Configuration ---


In [2]:
def get_project_root() -> Path:
    """
    Robustly find project root whether notebook is run from /notebooks or repo root.
    Looks for a folder reminding a repo structure: 'data' and 'src'.
    """
    cwd = Path.cwd().resolve()

    # candidate roots: cwd and cwd.parent (covers running from repo root or from notebooks/)
    candidates = [cwd, cwd.parent]

    for root in candidates:
        if (root / "data").exists() and (root / "src").exists():
            return root

    # fallback: assume notebooks/ and go one up
    return cwd.parent

PROJECT_ROOT = get_project_root()
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
MODELS_DIR = PROJECT_ROOT / "results" / "models"
FIGURES_DIR = PROJECT_ROOT / "results" / "figures"
METRICS_DIR = PROJECT_ROOT / "results" / "metrics"


# Create directories if they don't exist
MODELS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# Plotting Style (Academic Paper Standard)
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.2)

def save_plot(filename: str) -> None:
    path = FIGURES_DIR / f"{filename}.png"
    plt.tight_layout()
    plt.savefig(path, dpi=300, bbox_inches="tight")
    plt.close()  # ×—×©×•×‘ ×œ×”×¨×¦×•×ª ×—×•×–×¨×•×ª
    print(f"[+] Plot saved: {path}")



# ==========================================
# 2. Load Preprocessed Data
# ==========================================

In [3]:

print("[1] Loading preprocessed data artifacts...")

try:
    # Load Feature Matrices
    X_train = np.load(PROCESSED_DIR / "X_train.npy")
    X_test = np.load(PROCESSED_DIR / "X_test.npy")
    
    # Load Labels
    y_train_binary = np.load(PROCESSED_DIR / "y_train_binary.npy")
    y_test_binary = np.load(PROCESSED_DIR / "y_test_binary.npy")
    y_train_multi = np.load(PROCESSED_DIR / "y_train_multi.npy")
    y_test_multi = np.load(PROCESSED_DIR / "y_test_multi.npy")
    
    # Load Metadata (Feature names & Label Encoders for the Dashboard)
    feat_path = PROCESSED_DIR / "feature_names.csv"
    if feat_path.exists():
        feature_names = pd.read_csv(feat_path)["feature"].tolist()
    else:
        feature_names = [f"feat_{i}" for i in range(X_train.shape[1])]
        
    with open(PROCESSED_DIR / "label_encoder.pkl", "rb") as f:
        le = pickle.load(f)
        
    print(f"[+] Data loaded successfully.")
    print(f"    X_train shape: {X_train.shape}")
    print(f"    Target Classes (Multi): {le.classes_}")
    
except FileNotFoundError:
    raise FileNotFoundError(f"Missing data files in {PROCESSED_DIR}. Run 02_preprocessing.ipynb first.")

[1] Loading preprocessed data artifacts...
[+] Data loaded successfully.
    X_train shape: (125973, 122)
    Target Classes (Multi): ['DoS' 'Normal' 'Probe' 'R2L' 'U2R']



# ==========================================
# 3. Evaluation Helper Function
# ==========================================

In [4]:
def evaluate_model(model, X_test, y_test, model_name, is_binary=True):
    """
    Evaluates a model and generates standard academic plots.
    """
    print(f"\n--- Evaluating: {model_name} ---")
    start_time = time.time()
    
    # 1. Predict
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if (is_binary and hasattr(model, "predict_proba")) else None
    
    inference_time = time.time() - start_time
    
    # 2. Metrics
    if is_binary:
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        print(f"Accuracy:  {acc:.4f}")
        print(f"Precision: {prec:.4f}")
        print(f"Recall:    {rec:.4f}")
        print(f"F1 Score:  {f1:.4f}")
    else:
        # Multi-class metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        print(f"Accuracy:  {acc:.4f}")
        print(f"Weighted F1: {f1:.4f}")
        print(classification_report(y_test, y_pred, target_names=le.classes_))

    # 3. Confusion Matrix Plot
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_test, y_pred)
    
    labels = ["Normal", "Attack"] if is_binary else le.classes_
    
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=labels, yticklabels=labels)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    save_plot(f"cm_{model_name.replace(' ', '_').lower()}")
    plt.show()
    
    # 4. ROC Curve (Binary Only)
    if is_binary and y_prob is not None:
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        roc_auc = auc(fpr, tpr)
        
        plt.figure(figsize=(6, 5))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.3f}')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend(loc="lower right")
        save_plot(f"roc_{model_name.replace(' ', '_').lower()}")
        plt.show()

    return {"Model": model_name, "F1": f1, "Time": inference_time}

# Store results
results_log = []


# ==========================================
# 4. Training Binary Models (Detection)
# ==========================================

In [5]:

# Objective: Fast & Accurate detection (Is it an attack?)

# --- Model A: Logistic Regression (Baseline) ---
print("\n[2] Training Logistic Regression (Baseline)...")
lr_model = LogisticRegression(
    max_iter=2000,
    random_state=42,
    solver="lbfgs",
    class_weight="balanced"
)
lr_model.fit(X_train, y_train_binary)
results_log.append(evaluate_model(lr_model, X_test, y_test_binary, "Logistic Regression"))

# --- Model B: Random Forest (The Workhorse) ---
print("\n[3] Training Random Forest (Binary)...")
rf_binary = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced_subsample"
)
rf_binary.fit(X_train, y_train_binary)
results_log.append(evaluate_model(rf_binary, X_test, y_test_binary, "Random Forest Binary"))

# --- Model C: XGBoost (If available) ---
if XGBClassifier:
    print("\n[4] Training XGBoost (Binary)...")
    xgb_model = XGBClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=6,
        use_label_encoder=False, eval_metric='logloss',
        random_state=42, n_jobs=-1
    )
    xgb_model.fit(X_train, y_train_binary)
    results_log.append(evaluate_model(xgb_model, X_test, y_test_binary, "XGBoost Binary"))
else:
    xgb_model = None


[2] Training Logistic Regression (Baseline)...

--- Evaluating: Logistic Regression ---
Accuracy:  0.4308
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[+] Plot saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\figures\cm_logistic_regression.png
[+] Plot saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\figures\roc_logistic_regression.png

[3] Training Random Forest (Binary)...

--- Evaluating: Random Forest Binary ---
Accuracy:  0.4312
Precision: 1.0000
Recall:    0.0008
F1 Score:  0.0016
[+] Plot saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\figures\cm_random_forest_binary.png
[+] Plot saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\figures\roc_random_forest_binary.png

[4] Training XGBoost (Binary)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Evaluating: XGBoost Binary ---
Accuracy:  0.4308
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[+] Plot saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\figures\cm_xgboost_binary.png
[+] Plot saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\figures\roc_xgboost_binary.png



# ==========================================
# 5. Training Multi-class Model (Classification)
# ==========================================

In [6]:
# Objective: Provide description for the Dashboard (e.g., "DoS Attack")
print("\n[5] Training Multi-class Classifier (For Dashboard Details)...")

rf_multi = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_multi.fit(X_train, y_train_multi)

# Evaluate Multi-class
evaluate_model(rf_multi, X_test, y_test_multi, "Random Forest Multi-class", is_binary=False)


[5] Training Multi-class Classifier (For Dashboard Details)...

--- Evaluating: Random Forest Multi-class ---
Accuracy:  0.6496
Weighted F1: 0.5943


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         DoS       1.00      0.57      0.73      7460
      Normal       0.55      1.00      0.71      9711
       Probe       0.99      0.29      0.45      2421
         R2L       0.00      0.00      0.00      2885
         U2R       0.00      0.00      0.00        67

    accuracy                           0.65     22544
   macro avg       0.51      0.37      0.38     22544
weighted avg       0.67      0.65      0.59     22544

[+] Plot saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\figures\cm_random_forest_multi-class.png


{'Model': 'Random Forest Multi-class',
 'F1': 0.5942570994341888,
 'Time': 0.16780328750610352}


# ==========================================
# 6. Feature Importance (Interpretability)
# ==========================================

In [7]:
print("\n[6] Feature Importance Analysis...")
# Used for the report to explain WHAT the model is looking at
importances = rf_binary.feature_importances_
indices = np.argsort(importances)[::-1]
top_n = 20

plt.figure(figsize=(12, 6))
plt.title(f"Top {top_n} Features Driving Detection")
plt.bar(range(top_n), importances[indices[:top_n]], align="center", color="#2c3e50")
plt.xticks(range(top_n), [feature_names[i] for i in indices[:top_n]], rotation=45, ha='right')
plt.tight_layout()
save_plot("feature_importance")
plt.show()
plt.savefig("feature_importance.png")


[6] Feature Importance Analysis...
[+] Plot saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\figures\feature_importance.png


<Figure size 640x480 with 0 Axes>

# ==========================================
# 7. Comparison & Selection
# ==========================================

In [8]:
df_res = pd.DataFrame(results_log).set_index("Model")
print("\nModel Comparison Table:")
print(df_res)

# save to disk for report
metrics_path = METRICS_DIR / "model_comparison.csv"
df_res.to_csv(metrics_path)
print(f"[+] Metrics table saved: {metrics_path}")

# Select best binary model for production (usually RF or XGB)
best_model = xgb_model if xgb_model else rf_binary
print(f"\n[!] Selected Best Model for Production: {type(best_model).__name__}")


Model Comparison Table:
                            F1      Time
Model                                   
Logistic Regression   0.000000  0.009052
Random Forest Binary  0.001557  0.479590
XGBoost Binary        0.000000  0.075537
[+] Metrics table saved: C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\metrics\model_comparison.csv

[!] Selected Best Model for Production: XGBClassifier


# ==========================================
# 8. Save Models for Dashboard
# ==========================================

In [9]:
print("\n[7] Saving Models for Real-time Dashboard...")

# We need to save:
# 1. The Best Binary Model (For "Alert / No Alert")
# 2. The Multi-class Model (For "What kind of attack is this?")

with open(MODELS_DIR / "binary_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

with open(MODELS_DIR / "multi_model.pkl", "wb") as f:
    pickle.dump(rf_multi, f)

# Copy preprocessing artifacts for dashboard convenience
src_preprocessor = PROCESSED_DIR / "preprocessor.pkl"
src_le = PROCESSED_DIR / "label_encoder.pkl"
src_feat = PROCESSED_DIR / "feature_names.csv"

if src_preprocessor.exists():
    shutil.copy2(src_preprocessor, MODELS_DIR / "preprocessor.pkl")
if src_le.exists():
    shutil.copy2(src_le, MODELS_DIR / "label_encoder.pkl")
if src_feat.exists():
    shutil.copy2(src_feat, MODELS_DIR / "feature_names.csv")

print("[+] Copied preprocessing artifacts to results/models/")



print(f"[+] Models saved to {MODELS_DIR}")
print("    - binary_model.pkl (Detection)")
print("    - multi_model.pkl  (Classification)")


[7] Saving Models for Real-time Dashboard...
[+] Copied preprocessing artifacts to results/models/
[+] Models saved to C:\Users\elair\Desktop\CS\cyber_bootcamp\Network-Traffic-Anomaly-Detection\results\models
    - binary_model.pkl (Detection)
    - multi_model.pkl  (Classification)


# ==========================================
# 9. SIMULATION: Real-Time Dashboard Logic
# ==========================================

In [10]:
print("\n[8] Simulating Real-Time Dashboard Logic...")
print("This code snippet demonstrates how the dashboard will process a new packet.")

# Simulate a single incoming packet (taken from Test set)
random_idx = np.random.randint(0, len(X_test))
sample_packet = X_test[random_idx].reshape(1, -1)
true_label_code = y_test_multi[random_idx]
true_label_str = le.inverse_transform([true_label_code])[0]

print(f"\n--- Incoming Packet (ID: {random_idx}) ---")
print(f"True Label: {true_label_str}")

# --- DASHBOARD LOGIC START ---
# 1. Load Preprocessor & Models (Already done above, but conceptually:)
# preprocessor = pickle.load(...)
# binary_model = pickle.load(...)
# multi_model = pickle.load(...)

# 2. Detect Anomaly
is_attack = best_model.predict(sample_packet)[0] # 0 or 1
attack_prob = best_model.predict_proba(sample_packet)[0][1] # 0.0 to 1.0

if is_attack == 1:
    # 3. If Attack -> Classify Type
    attack_code = rf_multi.predict(sample_packet)[0]
    attack_type = le.inverse_transform([attack_code])[0]
    
    print(f"ðŸš¨ ALERT TRIGGERED!")
    print(f"   Confidence: {attack_prob:.2%}")
    print(f"   Attack Type: {attack_type}")
    print(f"   Action: Block IP & Log Incident.")
else:
    print(f"âœ… Traffic Normal (Confidence: {1-attack_prob:.2%})")
# --- DASHBOARD LOGIC END ---


[8] Simulating Real-Time Dashboard Logic...
This code snippet demonstrates how the dashboard will process a new packet.

--- Incoming Packet (ID: 16661) ---
True Label: Normal
âœ… Traffic Normal (Confidence: 100.00%)
