In [None]:
################## DATASET EXTRACTION AND WINDOWING   #########################

import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from collections import Counter, defaultdict
from sklearn.preprocessing import StandardScaler
import random

# === List of CSV paths ===
# One augmented dataset per run directory (paths are relative to the working
# directory).  The loading loop reads each file with pd.read_csv, standardizes
# and windows it on its own, and pools the resulting windows per switch.
csv_paths = [
    "misreport_long_data-Copy/augmented_dataset_with_behavioral.csv",
    "misreport_long_data2/augmented_dataset_with_behavioral.csv",
    "misreport_long_data3/augmented_dataset_with_behavioral.csv",
    "misreport_long_data4/augmented_dataset_with_behavioral.csv"
]

# === Generate sliding windows per switch ===
def generate_windowed_rows(df, switch_ids, window_size=10, stride=5):
    """Cut each switch's feature columns into fixed-length sliding windows.

    For every switch id ``sw`` the 14 ``Load_{sw}*`` feature columns and the
    ``Label_{sw}`` column are read from ``df``.  Windows start at rows
    ``0, stride, 2 * stride, ...`` and only complete windows are emitted.

    Args:
        df: DataFrame holding the per-switch feature and label columns.
        switch_ids: Iterable of switch identifiers (e.g. ``"S1"``) used to
            build the column names.
        window_size: Number of consecutive rows per window (must be >= 1).
        stride: Row offset between consecutive window starts (must be >= 1).

    Returns:
        ``defaultdict(list)`` mapping switch id -> list of dicts with keys
        ``"switch"``, ``"features"`` (float32 array of shape
        ``(window_size, 14)``) and ``"label"`` (1 if any row of the window has
        a label above 0.5, otherwise 0).  Switches with missing columns are
        reported with a printed warning and are absent from the result.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is smaller than 1.
    """
    if window_size < 1 or stride < 1:
        raise ValueError(
            f"window_size and stride must be >= 1 (got window_size={window_size}, stride={stride})"
        )

    feature_suffixes = (
        "",
        "_delta",
        "_rollmean",
        "_percentile",
        "_rolling_percentile",
        "_std_recent",
        "_load_ratio",
        "_delta_mean",
        "_mad",
        "_unique_count",
        "_autocorr",
        "_skew",
        "_kurtosis",
        "_zscore",
    )

    rows_by_switch = defaultdict(list)
    available_cols = set(df.columns)
    n_rows = len(df)

    for sw in switch_ids:
        feature_cols = [f"Load_{sw}{suffix}" for suffix in feature_suffixes]
        label_col = f"Label_{sw}"

        if not available_cols.issuperset(feature_cols + [label_col]):
            print(f"⚠️ Skipping switch {sw} — missing columns")
            continue

        # Select and cast once per switch; doing this inside the window loop
        # would copy the whole column block for every single window.
        feature_matrix = df[feature_cols].values.astype(np.float32)
        label_values = df[label_col].values

        for start in range(0, n_rows - window_size + 1, stride):
            stop = start + window_size
            rows_by_switch[sw].append({
                "switch": sw,
                # Copy so overlapping windows never share memory.
                "features": feature_matrix[start:stop].copy(),
                # Binary label: the window is FAKE if any row in it is.
                "label": int(np.any(label_values[start:stop] > 0.5)),
            })
    return rows_by_switch

# === Label stats print ===
def print_label_distribution(windowed_rows, name=""):
    """Print how many windows carry label 1 (FAKE) and label 0 (REAL).

    Args:
        windowed_rows: Iterable of dicts that each have a ``"label"`` key.
        name: Text inserted into the heading line.
    """
    tally = Counter(row["label"] for row in windowed_rows)
    fake_total = tally.get(1, 0)
    real_total = tally.get(0, 0)
    print(f"\n📦 Label Distribution for {name}:")
    print(f"FAKE = {fake_total}, REAL = {real_total}")

# === Accumulate and shuffle windows ===
# A dedicated, seeded generator makes the shuffle order (and therefore every
# downstream split) reproducible after a kernel restart, without touching the
# global `random` state.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)

combined_rows_by_switch = defaultdict(list)

for path in tqdm(csv_paths, desc="Processing CSVs"):
    df = pd.read_csv(path)
    print(f"\n✅ Loaded dataset: {path} — shape: {df.shape}")

    # Identify switch IDs from the bare "Load_S<n>" columns
    load_cols = [col for col in df.columns if re.fullmatch(r"Load_S\d+", col)]
    switch_ids = [col.split("Load_")[1] for col in load_cols]
    print(f"✅ Found switch IDs: {switch_ids}")

    # Normalize features (excluding labels).  The column list keeps the
    # DataFrame's own order so it is identical on every run.
    label_cols = [col for col in df.columns if re.fullmatch(r"Label_S\d+", col)]
    label_col_set = set(label_cols)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    feature_cols_to_scale = [col for col in numeric_cols if col not in label_col_set]
    # NOTE(review): the scaler is fitted on the whole file before any
    # train/val/test split, and windows overlap (stride 5 < window 10) yet are
    # split at random later, so held-out rows influence the scaling and
    # near-duplicate windows can land on both sides of the split.  Confirm
    # this is acceptable for the reported metrics.
    scaler = StandardScaler()
    df[feature_cols_to_scale] = scaler.fit_transform(df[feature_cols_to_scale])
    print("✅ Standardized all numeric features")

    # Generate windowed data and pool it with the other files, per switch
    file_windows = generate_windowed_rows(df, switch_ids, window_size=10, stride=5)
    for sw, windows in file_windows.items():
        combined_rows_by_switch[sw].extend(windows)

# === Shuffle windows per switch ===
for sw in combined_rows_by_switch:
    shuffle_rng.shuffle(combined_rows_by_switch[sw])
print("✅ Shuffled windows within each switch")

# === Combine into single list ===
windowed_rows_10 = []
for sw_windows in combined_rows_by_switch.values():
    windowed_rows_10.extend(sw_windows)
print("✅ Combined all switches' windows into one list")

# === Final global shuffle ===
shuffle_rng.shuffle(windowed_rows_10)
print("✅ Globally shuffled the final windowed dataset")

print_label_distribution(windowed_rows_10, "Final Window10")


In [None]:
################## Hybrid Anomaly Detection with SHAP + MLP/LightGBM  ###################################





!pip install shap lightgbm
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.covariance import EmpiricalCovariance
from sklearn.manifold import TSNE
from sklearn.calibration import CalibratedClassifierCV
import matplotlib.pyplot as plt
import shap
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Configure PyTorch's CUDA allocator through its environment variable, then
# run on the GPU when one is available and on the CPU otherwise.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

################## 1. Prepare Data ##################
# Stack the per-window records into arrays:
#   X -> (n_windows, window_len, n_features), y -> (n_windows,)
X = np.array([window["features"] for window in windowed_rows_10])
y = np.array([window["label"] for window in windowed_rows_10])

# Stratified 60 / 20 / 20 split: hold out 40 %, then halve it into val / test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Keep only the REAL (label 0) training windows.
real_train_mask = y_train == 0
X_train_real = X_train[real_train_mask]


################## Classification report printing 5 decimal places  ############################

from sklearn.metrics import classification_report

def print_full_classification_report(y_true, y_pred, digits=5):
    """Print a fixed-width metrics table with a configurable number of decimals.

    One row is printed for every dict-valued entry of sklearn's
    ``classification_report(..., output_dict=True)`` (precision, recall,
    F1-score, support), followed by a separate accuracy row.

    Args:
        y_true: Ground-truth labels, forwarded to ``classification_report``.
        y_pred: Predicted labels, forwarded to ``classification_report``.
        digits: Decimal places for every metric; metric columns are
            ``digits + 7`` characters wide.
    """
    width = digits + 7
    report = classification_report(y_true, y_pred, output_dict=True)

    header = " ".join([
        f"{'Label':<15}",
        f"{'Precision':>{width}}",
        f"{'Recall':>{width}}",
        f"{'F1-Score':>{width}}",
        f"{'Support':>10}",
    ])
    rule = "=" * len(header)
    print(header)
    print(rule)

    for label, metrics in report.items():
        # The 'accuracy' entry is a bare float and gets its own row below.
        if not isinstance(metrics, dict):
            continue
        precision, recall, f1 = (
            f"{metrics[key]:.{digits}f}" for key in ("precision", "recall", "f1-score")
        )
        support = f"{int(metrics['support']):>10}"
        print(f"{label:<15} {precision:>{width}} {recall:>{width}} {f1:>{width}} {support}")

    acc_str = f"{report['accuracy']:.{digits}f}"
    print(rule)
    print(f"{'Accuracy':<15} {'':>{width}} {'':>{width}} {acc_str:>{width}}")

####################################################################################################

################## 2. Transformer Autoencoder ##################
class TransformerAutoencoder(nn.Module):
    """Transformer-encoder autoencoder over fixed-length feature windows.

    Each time step is projected to ``d_model`` dimensions, a learned
    positional embedding is added, the sequence passes through a pre-norm
    transformer encoder, and a two-layer MLP maps every encoded step back to
    ``input_dim`` features.
    """

    def __init__(self, input_dim, seq_len, d_model=64, nhead=4, num_layers=2):
        """Build the projection, positional embedding, encoder and decoder.

        Args:
            input_dim: Number of features per time step.
            seq_len: Window length the positional embedding is sized for.
            d_model: Width of the transformer representation.
            nhead: Number of attention heads per encoder layer.
            num_layers: Number of stacked encoder layers.
        """
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        # Learned positions, initialised with small Gaussian noise.
        self.pos_embedding = nn.Parameter(0.02 * torch.randn(1, seq_len, d_model))
        layer = nn.TransformerEncoderLayer(
            d_model,
            nhead,
            dropout=0.1,
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers)
        self.decoder = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, input_dim),
        )

    def forward(self, x):
        """Return ``(reconstruction, encoding)`` for a batch of windows.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tuple of the reconstruction ``(batch, seq_len, input_dim)`` and the
            encoder output ``(batch, seq_len, d_model)``.
        """
        tokens = self.input_proj(x) + self.pos_embedding
        latent = self.encoder(tokens)
        return self.decoder(latent), latent

def train_autoencoder(model, X_train, device, epochs=100, batch_size=256, lr=1e-4):
    """Train ``model`` to reconstruct ``X_train`` with Adam and an MSE loss.

    Every epoch draws a fresh random permutation of the samples (via
    ``np.random.permutation``) and iterates over it in mini-batches.

    Args:
        model: Module whose forward pass returns ``(decoded, encoded)``; only
            ``decoded`` is used for the loss.
        X_train: Indexable array of training windows (indexed with an integer
            index array, then converted to a float32 tensor per batch).
        device: Torch device the batches are moved to.
        epochs: Number of passes over ``X_train``.
        batch_size: Samples per optimisation step (must be >= 1).
        lr: Adam learning rate.

    Returns:
        List with the mean batch loss of every epoch, in order.

    Raises:
        ValueError: If ``X_train`` is empty or ``batch_size`` is smaller
            than 1.
    """
    n_samples = len(X_train)
    if n_samples == 0:
        # Without this guard every epoch would report a NaN loss and the model
        # would silently stay untrained.
        raise ValueError("X_train is empty; nothing to train on")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1 (got {batch_size})")

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    epoch_losses = []
    for epoch in range(epochs):
        perm = np.random.permutation(n_samples)
        losses = []
        for i in range(0, n_samples, batch_size):
            idx = perm[i:i+batch_size]
            xb = torch.tensor(X_train[idx], dtype=torch.float32).to(device)
            decoded, _ = model(xb)
            loss = criterion(decoded, xb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        # Unweighted mean over batches (the last batch may be smaller).
        mean_loss = float(np.mean(losses))
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")
    return epoch_losses

def score_model(model, X, device, batch_size=512):
    """Compute per-sample reconstruction error and pooled latent vectors.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Module whose forward pass returns ``(decoded, encoded)``.
        X: Indexable array of windows, sliced batch by batch and converted to
            float32 tensors.
        device: Torch device the batches are moved to.
        batch_size: Number of windows scored per forward pass.

    Returns:
        Tuple ``(recon_errors, latents)`` of NumPy arrays: the squared
        reconstruction error averaged over dims 1 and 2 of each sample, and
        the encoder output averaged over dim 1.  Both are empty 1-D arrays
        when ``X`` is empty.
    """
    model.eval()
    error_chunks, latent_chunks = [], []
    with torch.no_grad():
        for start in range(0, len(X), batch_size):
            batch = torch.tensor(X[start:start + batch_size], dtype=torch.float32).to(device)
            reconstruction, hidden = model(batch)
            squared_error = (reconstruction - batch) ** 2
            error_chunks.append(squared_error.mean(dim=(1, 2)).cpu().numpy())
            latent_chunks.append(hidden.mean(dim=1).cpu().numpy())

    if not error_chunks:
        # No batches were scored: return empty arrays.
        return np.array([]), np.array([])
    return np.concatenate(error_chunks), np.concatenate(latent_chunks)

################## 3. Statistical Baseline ##################
class StatisticalAutoencoder:
    """Per-feature z-score baseline with the fit/score shape of an anomaly scorer."""

    def fit(self, X):
        """Store the per-feature mean and standard deviation of ``X``.

        Statistics are taken over axes 0 and 1 of the 3-D input, leaving one
        value per entry of the last axis.
        """
        pooled_axes = (0, 1)
        self.mean = X.mean(axis=pooled_axes)
        self.std = X.std(axis=pooled_axes)

    def score(self, X):
        """Return the mean squared z-score of every sample in ``X``.

        A small constant is added to the standard deviation so constant
        features do not divide by zero.
        """
        z = (X - self.mean) / (self.std + 1e-6)
        return (z ** 2).mean(axis=(1, 2))

################## 4. Train Autoencoder ##################
# Seed the RNGs that drive training so a fresh kernel starts from the same
# weights and batch order: torch covers parameter initialisation and dropout,
# NumPy covers the batch permutation drawn inside train_autoencoder.
torch.manual_seed(42)
np.random.seed(42)

seq_len, input_dim = X.shape[1], X.shape[2]
model = TransformerAutoencoder(input_dim, seq_len).to(device)
# Trained on REAL windows only.
train_autoencoder(model, X_train_real, device)

################## 5. Score ##################
# Autoencoder view: reconstruction error and pooled latent for each split.
recon_val, latent_val = score_model(model, X_val, device)
recon_test, latent_test = score_model(model, X_test, device)
latent_train = score_model(model, X_train_real, device)[1]

# Statistical view: z-score baseline fitted on the REAL training windows.
stat_model = StatisticalAutoencoder()
stat_model.fit(X_train_real)
stat_val, stat_test = (stat_model.score(split) for split in (X_val, X_test))

# Latent view: Mahalanobis distance to the REAL training latents.
emp_cov = EmpiricalCovariance()
emp_cov.fit(latent_train)
mahal_val, mahal_test = (emp_cov.mahalanobis(latents) for latents in (latent_val, latent_test))

################## 6. Fusion Features ##################
# One row per window, one column per detector score.
val_feats = np.stack((recon_val, stat_val, mahal_val), axis=1)
test_feats = np.stack((recon_test, stat_test, mahal_test), axis=1)

# The validation split is the training set of the fusion models, so the
# scaler is fitted on it and only applied to the test split.
scaler_main = StandardScaler().fit(val_feats)
val_feats_scaled = scaler_main.transform(val_feats)
test_feats_scaled = scaler_main.transform(test_feats)

################## 7. Shallow MLP Fusion ##################
def _report_on_test(title, auc_label, preds, probs):
    """Print sklearn's report, the ROC AUC and the high-precision table for one model."""
    print(title)
    print(classification_report(y_test, preds, target_names=["REAL", "FAKE"]))
    print(auc_label, roc_auc_score(y_test, probs))
    print_full_classification_report(y_test, preds)


# Fusion models are trained on the validation features and judged on test.
mlp_model = MLPClassifier(hidden_layer_sizes=(16,), max_iter=500, random_state=42)
mlp_model.fit(val_feats_scaled, y_val)
mlp_preds = mlp_model.predict(test_feats_scaled)
mlp_probs = mlp_model.predict_proba(test_feats_scaled)[:, 1]

_report_on_test("\n📊 MLP Final Test Report:", "MLP Test ROC AUC:", mlp_preds, mlp_probs)


################## 8. LightGBM + Calibration ##################
lgb_model = lgb.LGBMClassifier(n_estimators=100, max_depth=2)
calibrated_lgb = CalibratedClassifierCV(estimator=lgb_model, method='sigmoid', cv=5)
calibrated_lgb.fit(val_feats_scaled, y_val)
lgb_preds = calibrated_lgb.predict(test_feats_scaled)
lgb_probs = calibrated_lgb.predict_proba(test_feats_scaled)[:, 1]

_report_on_test(
    "\n📊 Calibrated LightGBM Final Test Report:",
    "LightGBM Test ROC AUC:",
    lgb_preds,
    lgb_probs,
)


################## 9. SHAP Analysis (corrected) ##################
# Use the fitted base estimator from the calibrated model
# NOTE(review): only the first calibrated classifier (index 0) is explained;
# with cv=5 the calibrated model presumably holds one per fold — confirm that
# a single fold is representative enough.
fitted_lgb = calibrated_lgb.calibrated_classifiers_[0].estimator  # Use .estimators_[0][0] if this fails

explainer = shap.TreeExplainer(fitted_lgb)
shap_values = explainer.shap_values(test_feats_scaled)
# NOTE(review): the container type of `shap_values` (single array vs. list of
# per-class arrays) may differ between shap versions — verify the plot shows
# the intended class.
shap.summary_plot(shap_values, test_feats_scaled, feature_names=["recon", "stat", "mahal"], show=False)
#plt.savefig("plots/hybrid-shap_summary_plot.png", dpi=600, bbox_inches='tight')
# Display the figure: with show=False and saving disabled, closing it here
# would discard the plot without anyone ever seeing it.
plt.show()

################## 10. t-SNE Latent Visualization ##################
# Project the pooled test latents to 2-D for a visual check of class separation.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
latent_2d = tsne.fit_transform(latent_test)

fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Latent space (TSNE projection)")
for class_id, colour, class_name in ((0, 'blue', "REAL"), (1, 'red', "FAKE")):
    members = y_test == class_id
    ax.scatter(latent_2d[members, 0], latent_2d[members, 1], c=colour, label=class_name, alpha=0.5)
ax.legend()
ax.set_xlabel("TSNE-1")
ax.set_ylabel("TSNE-2")
ax.grid(True)
fig.tight_layout()
#plt.savefig("plots/hybrid-tsne_latent_space.png", dpi=600, bbox_inches='tight')
plt.show()


############################ CLEAN CODE  #################################################
