**1– Paths and Configuration**

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import (
    roc_auc_score, roc_curve, precision_recall_curve, auc,
    confusion_matrix, classification_report, accuracy_score,
    precision_score, recall_score, f1_score
)
from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

# Directory layout: every location below is derived from BASE_DIR.
BASE_DIR = "Anomaly_Detection_Pipeline"
DATA_DIR = os.path.join(BASE_DIR, "data", "processed")
MODEL_DIR = os.path.join(BASE_DIR, "models")

# CSV inputs read from DATA_DIR.
PUBLIC_BENIGN_FILE, PRIVATE_BENIGN_FILE, ATTACK_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("public_benign_set.csv", "private_benign_set.csv", "attack_set.csv")
)

# Artifacts read from MODEL_DIR: model weights, feature schema, scaling bounds.
BEST_MODEL_PATH, SCHEMA_PATH, BOUNDS_PATH = (
    os.path.join(MODEL_DIR, artifact_name)
    for artifact_name in ("best_autoencoder.pt", "schema.pkl", "bounds.pkl")
)

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)


**2– Load Model, Schema, and Bounds**

In [None]:
def _read_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; only point these paths at trusted artifacts.
schema = _read_pickle(SCHEMA_PATH)
bounds = _read_pickle(BOUNDS_PATH)

# Feature groups recorded in the schema, unpacked into module-level names
# because the preprocessing and inference cells refer to them directly.
cat_feature, cont_cols, minmax_features, no_scale_features = (
    schema[key]
    for key in ("cat_feature", "cont_cols", "minmax_features", "no_scale_features")
)

class DeepAutoencoder(nn.Module):
    """Autoencoder over continuous features, conditioned on an embedded categorical id.

    The categorical input is looked up in an embedding table and concatenated
    with the continuous features; the result is encoded down to ``latent_dim``
    and decoded back to ``n_cont`` values, so only the continuous part is
    reconstructed.

    Args:
        n_cont: number of continuous input features (also the output width).
        max_ports: largest categorical id; the embedding table has
            ``max_ports + 1`` rows.
        emb_dim: width of the categorical embedding.
        latent_dim: width of the bottleneck between encoder and decoder.
    """

    def __init__(self, n_cont, max_ports=65535, emb_dim=128, latent_dim=128):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=max_ports + 1, embedding_dim=emb_dim)
        hidden_widths = (1024, 512, 256)
        # Attribute names and layer positions define the state_dict keys, so
        # they must stay as they are for saved checkpoints to load.
        self.encoder = self._mlp((n_cont + emb_dim, *hidden_widths, latent_dim))
        self.decoder = self._mlp((latent_dim, *reversed(hidden_widths), n_cont))

    @staticmethod
    def _mlp(widths):
        """Chain Linear layers over consecutive ``widths`` with a ReLU between each pair."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # The final Linear is left without an activation.
        return nn.Sequential(*layers[:-1])

    def forward(self, cat, cont):
        """Reconstruct ``cont`` from the embedded ``cat`` ids joined with ``cont`` along dim 1."""
        joined = torch.cat((self.emb(cat), cont), dim=1)
        return self.decoder(self.encoder(joined))

# Build the network sized to the schema's continuous columns, place it on the
# selected device, then restore the saved weights.
model = DeepAutoencoder(
    len(cont_cols),
    max_ports=65535,
    emb_dim=128,
    latent_dim=128,
)
model = model.to(device)
saved_weights = torch.load(BEST_MODEL_PATH, map_location=device)
model.load_state_dict(saved_weights)
model.eval()  # evaluation mode for all inference below

print("Model and schema successfully loaded.")


**3– Load Processed Datasets**

In [None]:
# Read the three processed CSVs (public benign, attack, private benign).
df_public, df_attack, df_private = (
    pd.read_csv(csv_path)
    for csv_path in (PUBLIC_BENIGN_FILE, ATTACK_FILE, PRIVATE_BENIGN_FILE)
)

print(f"Public Benign: {df_public.shape}, Attack: {df_attack.shape}, Private Benign: {df_private.shape}")


**4– Helper Functions for Preprocessing**

In [None]:
def log_safe(x):
    """Return ``log(1 + x)`` element-wise after raising negative entries of ``x`` to 0."""
    non_negative = np.clip(x, a_min=0, a_max=None)
    return np.log1p(non_negative)

def apply_log_scaling(df, features):
    """Return a copy of ``df`` with ``log_safe`` applied to each listed column it contains.

    Names in ``features`` that are not columns of ``df`` are ignored, and
    ``df`` itself is left unmodified.
    """
    result = df.copy()
    present = [name for name in features if name in result.columns]
    for name in present:
        result[name] = log_safe(result[name].values)
    return result

def robust_minmax_transform(df, bounds, exclude_cols=None):
    """Rescale the columns listed in ``bounds`` to ``[0, 1]`` on a copy of ``df``.

    Args:
        df: input frame; it is not modified.
        bounds: mapping ``column -> (lower, upper)``. Entries whose column is
            not in ``df`` are skipped.
        exclude_cols: optional collection of column names that are set to
            0.0 instead of being rescaled.

    Returns:
        A new frame in which every bounded column is either 0.0 (excluded,
        or ``upper - lower`` below 1e-9) or its values clipped to
        ``[lower, upper]`` and mapped linearly onto ``[0, 1]``. Any NaN left
        anywhere in the frame is replaced by 0.0.
    """
    result = df.copy()
    for col, (lower, upper) in bounds.items():
        if col in result.columns:
            zero_out = (
                (exclude_cols and col in exclude_cols)
                or upper == lower
                or (upper - lower) < 1e-9
            )
            if zero_out:
                result[col] = 0.0
            else:
                clipped = np.clip(result[col].values, lower, upper)
                result[col] = (clipped - lower) / (upper - lower)
    return result.fillna(0.0)

# Constant columns in the private dataset.
# compute_mse_errors passes this list as `exclude_cols`, so these columns are
# set to 0.0 by robust_minmax_transform instead of being rescaled.
constant_cols = [
    "min_seg_size_forward",
    "Flow IAT Min",
    # TCP flag counters
    "Fwd URG Flags",
    "URG Flag Count",
    "ECE Flag Count",
    # Active-period statistics
    "Active Max",
    "Active Min",
    "Active Mean",
    "Active Std",
    # Idle-period statistics
    "Idle Max",
    "Idle Min",
    "Idle Mean",
    "Idle Std",
    "CWE Flag Count",
]


**5– Function to Compute MSE Errors**

In [None]:
def compute_mse_errors(df, name="dataset"):
    """Preprocess ``df`` and return the model's per-row reconstruction MSE.

    Preprocessing: ``minmax_features`` are log-scaled, ``no_scale_features``
    keep their original values (as float32), the categorical column is cast
    to int64, and the frame is passed through ``robust_minmax_transform``
    with ``constant_cols`` excluded (zeroed). Continuous columns that are
    absent from ``df`` are fed to the model as 0.0.

    Args:
        df: frame containing the categorical column ``cat_feature`` and the
            continuous feature columns.
        name: label used only in the progress message.

    Returns:
        1-D NumPy array with one mean-squared reconstruction error per row,
        in the row order of ``df``.
    """
    prepared = apply_log_scaling(df, minmax_features)
    # Undo any log-scaling on features that must stay in their original units.
    for col in (c for c in no_scale_features if c in df.columns):
        prepared[col] = df[col].astype(np.float32)
    prepared[cat_feature] = df[cat_feature].astype(np.int64)

    prepared = robust_minmax_transform(prepared, bounds, exclude_cols=constant_cols)
    # The model expects every continuous column; fill the missing ones with zeros.
    for col in cont_cols:
        if col not in prepared.columns:
            prepared[col] = 0.0
    prepared = prepared[[cat_feature] + cont_cols]

    dataset = TensorDataset(
        torch.tensor(prepared[cat_feature].values, dtype=torch.long),
        torch.tensor(prepared[cont_cols].values, dtype=torch.float32),
    )

    per_batch = []
    with torch.no_grad():
        for cat_ids, cont_vals in DataLoader(dataset, batch_size=1024, shuffle=False):
            cat_ids = cat_ids.to(device)
            cont_vals = cont_vals.to(device)
            reconstruction = model(cat_ids, cont_vals)
            # Mean over the feature axis -> one error value per row.
            squared_error = (reconstruction - cont_vals) ** 2
            per_batch.append(squared_error.mean(dim=1).cpu().numpy())

    mse_arr = np.concatenate(per_batch) if per_batch else np.array([])
    print(f"Evaluated {len(mse_arr)} samples from {name}.")
    return mse_arr


**6– Evaluate on Attack Dataset**

In [None]:
# Binary ground truth: rows of the attack file whose Label is not "BENIGN"
# become 1; every row of the public benign set becomes 0.
df_attack["Label_Binary"] = df_attack["Label"].ne("BENIGN").astype(int)
df_mixed = pd.concat([df_public.assign(Label_Binary=0), df_attack])

# NOTE(review): the feature values go to the model exactly as stored in the
# CSVs here, whereas compute_mse_errors log-scales and min-max scales its
# input first — confirm which of the two paths matches training.
cat_tensor = torch.tensor(df_mixed[cat_feature].astype(int).values, dtype=torch.long)
cont_tensor = torch.tensor(df_mixed[cont_cols].astype(np.float32).values, dtype=torch.float32)
labels_tensor = torch.tensor(df_mixed["Label_Binary"].values, dtype=torch.int)

mixed_loader = DataLoader(
    TensorDataset(cat_tensor, cont_tensor, labels_tensor),
    batch_size=1024,
    shuffle=False,
)

# Per-row reconstruction error (mean squared error over the continuous
# features), collected batch by batch together with the matching labels.
error_batches, label_batches = [], []
with torch.no_grad():
    for cat, cont, labels in mixed_loader:
        cat = cat.to(device)
        cont = cont.to(device)
        output = model(cat, cont)
        error_batches.append(((output - cont) ** 2).mean(dim=1).cpu().numpy())
        label_batches.append(labels.cpu().numpy())

recon_errors = np.concatenate(error_batches) if error_batches else np.array([])
true_labels = np.concatenate(label_batches) if label_batches else np.array([])
print(f"Inference complete on {len(recon_errors)} samples.")

# The reconstruction error is the anomaly score: higher means more attack-like.
roc_auc = roc_auc_score(true_labels, recon_errors)
precision, recall, thresholds_pr = precision_recall_curve(true_labels, recon_errors)
pr_auc = auc(recall, precision)
print(f"ROC AUC: {roc_auc:.4f} | PR AUC: {pr_auc:.4f}")


**7– Visualizations for Attack vs Benign**

In [None]:
# 1) Reconstruction-error histograms per class, overlaid, with log-scaled counts.
fig_hist, ax_hist = plt.subplots(figsize=(8,5))
sns.histplot(recon_errors[true_labels==0], bins=100, color='green', alpha=0.6, label='Benign', ax=ax_hist)
sns.histplot(recon_errors[true_labels==1], bins=100, color='red', alpha=0.5, label='Attack', ax=ax_hist)
ax_hist.set_yscale('log')
ax_hist.set_xlabel("Reconstruction Error (MSE)")
ax_hist.set_ylabel("Frequency (log scale)")
ax_hist.set_title("Reconstruction Error Distribution")
ax_hist.legend()
plt.show()

# 2) ROC curve, with the diagonal drawn as the chance-level reference.
fpr, tpr, _ = roc_curve(true_labels, recon_errors)
fig_roc, ax_roc = plt.subplots(figsize=(6,6))
ax_roc.plot(fpr, tpr, color='blue', label=f'ROC (AUC = {roc_auc:.4f})')
ax_roc.plot([0,1], [0,1], '--', color='gray')
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve")
ax_roc.legend()
plt.show()

# 3) Precision–recall curve computed in the evaluation cell.
fig_pr, ax_pr = plt.subplots(figsize=(6,6))
ax_pr.plot(recall, precision, color='purple', label=f'PR (AUC = {pr_auc:.4f})')
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision–Recall Curve")
ax_pr.legend()
plt.show()


**8– Compare Public vs Private Benign MSEs**

In [None]:
# Reconstruction errors of the two benign sets, both produced by the same
# preprocessing + inference path (compute_mse_errors).
mse_public = compute_mse_errors(df_public, name="public benign")
mse_private = compute_mse_errors(df_private, name="private benign")

# One row per dataset with descriptive statistics of its error distribution.
summary = pd.DataFrame({
    "Dataset": ["Public", "Private"],
    "Mean": [np.mean(mse_public), np.mean(mse_private)],
    "Median": [np.median(mse_public), np.median(mse_private)],
    "Std": [np.std(mse_public), np.std(mse_private)],
    "Min": [np.min(mse_public), np.min(mse_private)],
    "Max": [np.max(mse_public), np.max(mse_private)]
})
display(summary)

# Fit the KDE on log10(MSE) via log_scale=True. Fitting on the raw values and
# only switching the axis to log afterwards uses one linear-space bandwidth
# for errors spanning several orders of magnitude, which smears the
# low-error region that a log axis is meant to reveal.
KDE_FLOOR = 1e-12  # log10 is undefined at 0, so exact-zero errors are lifted to this floor
fig_kde, ax_kde = plt.subplots(figsize=(9,5))
sns.kdeplot(np.clip(mse_public, KDE_FLOOR, None), color='blue', fill=True,
            label='Public Benign', log_scale=True, ax=ax_kde)
sns.kdeplot(np.clip(mse_private, KDE_FLOOR, None), color='orange', fill=True,
            label='Private Benign', log_scale=True, ax=ax_kde)
ax_kde.set_xlabel("Reconstruction Error (MSE)")
# Only the x-axis is logarithmic; the density axis is linear.
ax_kde.set_ylabel("Density")
ax_kde.set_title("Reconstruction Error Comparison – Public vs Private Benign")
ax_kde.legend()
plt.show()


**9– Statistical Divergence Analysis**

In [None]:
# Two-sample Kolmogorov–Smirnov test on the raw reconstruction errors.
ks_stat, ks_p = ks_2samp(mse_public, mse_private)
print(f"KS Statistic: {ks_stat:.4f}, p-value: {ks_p:.4e}")

# Shared log-spaced bin edges covering both samples. The lower edge is floored
# at EPS (log10 is undefined at 0) rather than shifted by EPS, so the smallest
# errors are not pushed below the first edge.
EPS = 1e-9
lowest = max(float(min(mse_public.min(), mse_private.min())), EPS)
highest = float(max(mse_public.max(), mse_private.max())) + EPS
bins = np.logspace(np.log10(lowest), np.log10(highest), 200)


def _bin_mass(values):
    """Return the fraction of ``values`` that falls in each bin of ``bins`` (sums to 1)."""
    # Clip into the edge range so that neither the floor nor rounding of the
    # outermost edges can drop samples from the histogram.
    counts, _ = np.histogram(np.clip(values, bins[0], bins[-1]), bins=bins)
    return counts / counts.sum()


# Per-bin probability mass. With unequal (log-spaced) bin widths,
# `density=True` returns count / (N * width); renormalising those densities to
# sum to 1 over-weights the narrow low-error bins, so counts are used instead.
hist_pub = _bin_mass(mse_public)
hist_priv = _bin_mass(mse_private)

# Overlap coefficient: probability mass shared by the two binned distributions
# (1.0 = identical histograms, 0.0 = disjoint).
overlap_area = np.sum(np.minimum(hist_pub, hist_priv))

print(f"Overlap Area: {overlap_area:.4f}")
if overlap_area < 0.5:
    print("Strong distributional divergence detected.")
elif overlap_area < 0.8:
    print("Moderate divergence detected.")
else:
    print("High similarity detected between distributions.")

fig_hist, ax_hist = plt.subplots(figsize=(9,5))
sns.histplot(mse_public, bins=bins, color='blue', alpha=0.5, label='Public', stat='density', ax=ax_hist)
sns.histplot(mse_private, bins=bins, color='orange', alpha=0.5, label='Private', stat='density', ax=ax_hist)
ax_hist.set_xscale('log')
ax_hist.set_yscale('log')
ax_hist.set_xlabel("Reconstruction Error (MSE)")
ax_hist.set_ylabel("Density (log scale)")
ax_hist.set_title("MSE Distribution Comparison (Log-Log Scale)")
ax_hist.legend()
plt.show()

# Binned empirical CDFs. The cumulative mass at index i is the probability of
# an error <= the RIGHT edge of bin i, hence the curve is drawn against
# bins[1:]. Its maximum approximates the KS statistic up to binning resolution.
cdf_pub = np.cumsum(hist_pub)
cdf_priv = np.cumsum(hist_priv)
fig_cdf, ax_cdf = plt.subplots(figsize=(8,4))
ax_cdf.plot(bins[1:], np.abs(cdf_pub - cdf_priv), color='red', lw=2)
ax_cdf.set_xscale('log')
ax_cdf.set_xlabel("Reconstruction Error (MSE)")
ax_cdf.set_ylabel("|ΔCDF|")
ax_cdf.set_title("Absolute CDF Difference – Public vs Private Benign Sets")
ax_cdf.grid(True, which="both", ls="--", lw=0.5)
plt.show()
