**1 – Imports and Paths**

In [None]:
# ===============================
# STEP 1 – Imports and Directory Setup
# ===============================
import os
import gc
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
import matplotlib.pyplot as plt

# Project paths — every location is derived from BASE_DIR so that moving the
# project only requires changing this one constant.
BASE_DIR = "Anomaly_Detection_Pipeline"
DATA_DIR = os.path.join(BASE_DIR, "data", "processed")

PUBLIC_BENIGN_FILE = os.path.join(DATA_DIR, "public_benign_set.csv")
PRIVATE_BENIGN_FILE = os.path.join(DATA_DIR, "private_benign_set.csv")
ATTACK_FILE = os.path.join(DATA_DIR, "attack_set.csv")

# Model artefacts (schema, bounds, checkpoints) live next to the data,
# under the same BASE_DIR root. Created eagerly so later saves cannot fail
# on a missing directory.
MODEL_DIR = os.path.join(BASE_DIR, "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# -------------------------------
# Load preprocessed data
# -------------------------------
# `df` is the public benign set used for the train/val/test splits below;
# `attack_df` is loaded here but not used by the steps visible in this section.
df = pd.read_csv(PUBLIC_BENIGN_FILE)
attack_df = pd.read_csv(ATTACK_FILE)

# Reproducibility: seed both NumPy and PyTorch RNGs with the same value.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is available; everything else falls back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


**2 – Feature Groups**

In [None]:
# ===============================
# STEP 2 – Feature Groups (auto-filtered, constant columns removed)
# ===============================
# Partitions the columns of `df` into three groups:
#   * one categorical feature (the destination port),
#   * "no-scale" features that are passed through without log scaling,
#   * everything else (except "Label"), which is treated as continuous.
cat_feature = "Destination Port"

# Original no-scale candidates — now excluding the 8 constant columns
candidate_no_scale = [
    "Fwd PSH Flags",
    "URG Flag Count",
    "ECE Flag Count",
    "Active Max",
    "Active Min",
    "Active Mean",
    "Active Std",
    "Idle Max",
    "Idle Min",
    "Idle Mean",
    "Idle Std",
    "CWE Flag Count",
]

# Candidates that are absent from this dataset are silently dropped.
available_columns = df.columns
no_scale_features = [name for name in candidate_no_scale if name in available_columns]

# The categorical feature, in contrast, is mandatory.
missing = [] if cat_feature in available_columns else [cat_feature]
if missing:
    raise ValueError(f"Missing required categorical feature(s): {missing}")

# Continuous features for scaling: every remaining column that is neither
# no-scale, categorical, nor the label.
excluded_from_scaling = set(no_scale_features) | {cat_feature, "Label"}
minmax_features = [name for name in available_columns if name not in excluded_from_scaling]

print(f" Dataset shape: {df.shape}")
print(f" Robust MinMax features: {len(minmax_features)}")
print(f" No-scaling features: {len(no_scale_features)}")
print(f" Categorical feature: {cat_feature}\n")


**3 – Port Encoding**

In [None]:
# ===============================
# STEP 3 – Universal Port Encoding
# ===============================
# The destination port is used directly as an index into an embedding table
# that covers the whole port space, so no fitted encoder is required.
cat_feature = "Destination Port"
df[cat_feature] = df[cat_feature].astype(int)

n_ports = 65536  # 0–65535
print(f"Using universal port embedding space of size {n_ports}.")
print(f"Observed unique ports: {df[cat_feature].nunique()}")
print(f"Port range: [{df[cat_feature].min()}, {df[cat_feature].max()}]")

# Fail early with a readable message: a port outside [0, n_ports) would
# otherwise only surface later as an out-of-range embedding index error.
observed_min, observed_max = df[cat_feature].min(), df[cat_feature].max()
if observed_min < 0 or observed_max >= n_ports:
    raise ValueError(
        f"'{cat_feature}' values must lie in [0, {n_ports - 1}], "
        f"got range [{observed_min}, {observed_max}]"
    )


**4 – Data Splitting**

In [None]:
# ===============================
# STEP 4 – Train / Validation / Test Split
# ===============================
# Two successive shuffled splits with the same settings: 20% of `df` is held
# out as the test set, then 20% of the remainder becomes the validation set
# (roughly 64% / 16% / 20% overall).
split_options = {"test_size": 0.2, "random_state": SEED, "shuffle": True}

train_df, test_df = train_test_split(df, **split_options)
train_df, val_df = train_test_split(train_df, **split_options)

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")


**5 – Log Scaling**

In [None]:
# ===============================
# STEP 5 – Log Scaling for Skewed Features
# ===============================
def log_safe(x):
    """Return ``log(1 + x)`` element-wise, treating negative inputs as zero.

    Negative values are raised to 0 before the logarithm, so they map to 0.0
    instead of producing NaN or -inf.
    """
    non_negative = np.maximum(x, 0)
    return np.log1p(non_negative)

def apply_log_scaling(df, features):
    """Return a copy of ``df`` with ``log_safe`` applied to the given columns.

    Args:
        df: Input frame; it is not modified.
        features: Names of the columns to transform. Columns not listed are
            copied through unchanged.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    transformed = {name: log_safe(df[name].values) for name in features}

    df_scaled = df.copy()
    for name, values in transformed.items():
        df_scaled[name] = values
    return df_scaled

# Log-transform the continuous columns of each split (each result is a copy).
train_scaled, val_scaled, test_scaled = (
    apply_log_scaling(split, minmax_features)
    for split in (train_df, val_df, test_df)
)

# (scaled copy, raw source) pairs, so the dtype fixes below are written once.
split_pairs = (
    (train_scaled, train_df),
    (val_scaled, val_df),
    (test_scaled, test_df),
)

# No-scale features keep their raw values but are stored as float32.
for col in no_scale_features:
    if col not in df.columns:
        continue
    for scaled_split, raw_split in split_pairs:
        scaled_split[col] = raw_split[col].astype(np.float32)

# The categorical port column is stored as int64.
for scaled_split, raw_split in split_pairs:
    scaled_split[cat_feature] = raw_split[cat_feature].astype(np.int64)

print("Log-scaling applied to continuous features.")


**6 – Save Schema and Bounds**

In [None]:
# ===============================
# STEP 6 – Save Schema and Bounds
# ===============================
# Persists the column layout and per-feature bounds derived from the
# training split into MODEL_DIR as two pickle files.

# Every column except the categorical one, in frame order.
# NOTE(review): only `cat_feature` is excluded here, so a "Label" column (if
# the dataset has one) stays in `cont_cols` even though it is excluded from
# `minmax_features` — confirm this is intended.
cont_cols = [name for name in train_scaled.columns if name != cat_feature]

# Bounds per scaled feature: fixed lower bound of 0.0, upper bound at the
# 99.9th percentile of the (log-scaled) training values.
bounds = {
    name: (0.0, train_scaled[name].quantile(0.999))
    for name in minmax_features
    if name in train_scaled.columns
}

schema = {
    "cat_feature": cat_feature,
    "cont_cols": cont_cols,
    "minmax_features": minmax_features,
    "no_scale_features": no_scale_features,
}

schema_path = os.path.join(MODEL_DIR, "schema.pkl")
with open(schema_path, "wb") as schema_file:
    pickle.dump(schema, schema_file)

bounds_path = os.path.join(MODEL_DIR, "bounds.pkl")
with open(bounds_path, "wb") as bounds_file:
    pickle.dump(bounds, bounds_file)

print(f"Saved schema ({len(cont_cols)} continuous cols) and bounds ({len(bounds)}).")


**7 – Convert to Torch Tensors**

In [None]:
# ===============================
# STEP 7 – Convert to Torch Tensors
# ===============================
def df_to_tensor(df_in):
    """Split a frame into a categorical tensor and a continuous tensor.

    Args:
        df_in: Frame containing the ``cat_feature`` column plus the
            remaining columns.

    Returns:
        Tuple ``(cat, cont)``: ``cat`` holds the ``cat_feature`` column as
        ``torch.long``; ``cont`` holds all other columns, in frame order, as
        ``torch.float32``.
    """
    port_values = df_in[cat_feature].values
    other_values = df_in.drop(columns=[cat_feature]).values

    cat = torch.as_tensor(port_values, dtype=torch.long)
    cont = torch.as_tensor(other_values, dtype=torch.float32)
    return cat, cont

# Tensor pairs (categorical, continuous) for each split.
(cat_train, cont_train), (cat_val, cont_val), (cat_test, cont_test) = (
    df_to_tensor(split) for split in (train_scaled, val_scaled, test_scaled)
)

# Each dataset yields (cat, cont) tuples, matching the model's forward().
train_ds, val_ds, test_ds = (
    TensorDataset(cat_part, cont_part)
    for cat_part, cont_part in (
        (cat_train, cont_train),
        (cat_val, cont_val),
        (cat_test, cont_test),
    )
)

# Larger batches on GPU, smaller ones on CPU.
BATCH_SIZE = 4096 if device.type == "cuda" else 1024
loader_options = {"batch_size": BATCH_SIZE, "num_workers": 2}

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_ds, shuffle=True, **loader_options)
val_loader = DataLoader(val_ds, shuffle=False, **loader_options)
test_loader = DataLoader(test_ds, shuffle=False, **loader_options)

print(f"Data prepared for training. Batch size: {BATCH_SIZE}")


**8 – Model Definition**

In [None]:
# ===============================
# STEP 8 – Deep Autoencoder Definition
# ===============================
class DeepAutoencoder(nn.Module):
    """Autoencoder over continuous features, conditioned on a port embedding.

    The categorical input is looked up in an embedding table with
    ``max_ports + 1`` rows, concatenated with the continuous features, and
    compressed by the encoder to ``latent_dim`` values. The decoder mirrors
    the encoder and reconstructs only the ``n_cont`` continuous features.

    Args:
        n_cont: Number of continuous input (and output) features.
        max_ports: Largest categorical index the embedding must accept.
        emb_dim: Width of the embedding vector.
        latent_dim: Width of the bottleneck representation.
    """

    _HIDDEN_SIZES = (1024, 512, 256)

    def __init__(self, n_cont, max_ports=65535, emb_dim=128, latent_dim=128):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=max_ports + 1, embedding_dim=emb_dim)

        hidden = list(self._HIDDEN_SIZES)
        # Encoder narrows from the concatenated input down to the latent code.
        self.encoder = self._build_mlp([n_cont + emb_dim, *hidden, latent_dim])
        # Decoder widens back out through the same hidden sizes in reverse.
        self.decoder = self._build_mlp([latent_dim, *reversed(hidden), n_cont])

    @staticmethod
    def _build_mlp(widths):
        """Stack Linear layers over ``widths`` with ReLU between them (none after the last)."""
        layers = []
        last_index = len(widths) - 2
        for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Linear(fan_in, fan_out))
            if index < last_index:
                layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, cat, cont):
        """Return the reconstruction of ``cont`` given indices ``cat``."""
        features = torch.cat([self.emb(cat), cont], dim=1)
        latent = self.encoder(features)
        return self.decoder(latent)

# Size the model from the tensor it will actually be fed, rather than from
# the raw frame's column count, so the two can never drift apart.
n_cont = cont_train.shape[1]

# The embedding must cover the same port space declared in Step 3
# (`n_ports` entries, i.e. indices 0 .. n_ports - 1).
model = DeepAutoencoder(n_cont, max_ports=n_ports - 1, emb_dim=128, latent_dim=128).to(device)

# Adam with a small weight decay; reconstruction quality is measured by MSE
# over the continuous features.
optimizer = optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
criterion = nn.MSELoss()
print("Model initialized successfully.")


**9 – Training and Visualization**

In [None]:
# ===============================
# STEP 9 – Training Loop with Early Stopping
# ===============================
# Trains the autoencoder to reconstruct the continuous features, tracks
# per-epoch train/validation MSE and mean gradient norm, checkpoints the
# best model by validation loss, and stops after PATIENCE epochs without
# improvement. After training, the best checkpoint is loaded back into
# `model`.
EPOCHS = 255
PATIENCE = 20
best_val_loss = float("inf")
epochs_no_improve = 0
train_losses, val_losses, grad_norms = [], [], []
best_model_path = os.path.join(MODEL_DIR, "best_autoencoder.pt")

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_train_loss, total_grad_norm = 0, 0

    for cat, cont in train_loader:
        cat, cont = cat.to(device), cont.to(device)
        optimizer.zero_grad()
        output = model(cat, cont)
        loss = criterion(output, cont)
        loss.backward()
        # Clip to a total norm of 5.0; the returned value is the norm
        # measured before clipping, which is what gets logged.
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        total_grad_norm += grad_norm.item()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean even
        # when the last batch is smaller.
        total_train_loss += loss.item() * cont.size(0)

    total_train_loss /= len(train_loader.dataset)
    avg_grad_norm = total_grad_norm / len(train_loader)
    train_losses.append(total_train_loss)
    grad_norms.append(avg_grad_norm)

    # Validation
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for cat, cont in val_loader:
            cat, cont = cat.to(device), cont.to(device)
            output = model(cat, cont)
            total_val_loss += criterion(output, cont).item() * cont.size(0)
    total_val_loss /= len(val_loader.dataset)
    val_losses.append(total_val_loss)

    print(f"Epoch {epoch:03d}/{EPOCHS} | Train MSE: {total_train_loss:.6f} | Val MSE: {total_val_loss:.6f}")

    if total_val_loss < best_val_loss:
        # New best: checkpoint the weights and reset the patience counter.
        best_val_loss = total_val_loss
        torch.save(model.state_dict(), best_model_path)
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print("Early stopping triggered.")
            break

# At this point `model` holds the weights of the last epoch run, which can be
# up to PATIENCE epochs past the best one. Reload the best checkpoint so that
# anything using `model` afterwards evaluates the model that was selected.
# Skipped when no epoch ever improved on the initial value (nothing saved).
if best_val_loss < float("inf"):
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    model.eval()

print(f"Training complete. Best Val MSE: {best_val_loss:.6f}")

# --- Plot loss curves ---
# One line per split, indexed by epoch position in the recorded histories.
fig, ax = plt.subplots(figsize=(10, 5))

for history, curve_label in ((train_losses, 'Train Loss'), (val_losses, 'Validation Loss')):
    ax.plot(history, label=curve_label)

ax.set_xlabel("Epoch")
ax.set_ylabel("MSE Loss")
ax.set_title("Training and Validation Loss over Epochs")
ax.legend()
ax.grid(True)
plt.show()
