In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Anomaly Detection Product (single script) ‚Äî Extended:
- Residual creation (Demand - Measured -> Residual)
- Feature engineering (reuse if already present)
- Scaling once -> shared across models
- Models: IsolationForest, LOF, Dense AE, LSTM AE
- Dynamic thresholds (MAD) for all scores
- Voting (3+) + episodes (merged runs)
- NEW: Plain voting bar (vote_any = >=1 model)
- NEW: Hybrid scoring (weighted + robust-normalized score + MAD threshold)
- Episode explanations (primary signal, suspected sensor)
- Hardware mapping & root-cause scoring (lag/saturation/drift/vibe)
- Sensor ranking, clustering & heatmap
- Multi-page PDF Ops Report (now includes Hybrid + Plain voting bars)
- Config-driven (JSON) OR safe defaults (no args)
"""

import os
import json
import argparse
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Torch
import torch
import torch.nn as nn
import torch.optim as optim

# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


# =========================================================
# Utils
# =========================================================
def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def safe_name(name: str) -> str:
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in str(name))


# =========================================================
# Residual creation (optional)
# =========================================================
def create_residuals_for_folder(
    in_folder: str,
    out_folder: str,
    demand_token: str = "Demand",
    measured_token: str = "Measured",
    residual_token: str = "Residual",
    skip_if_exists: bool = True,
    suffix: str = "_residual",
    logger=print,
) -> None:
    ensure_dir(out_folder)
    for file in os.listdir(in_folder):
        if not file.endswith(".csv"):
            continue

        in_path = os.path.join(in_folder, file)
        out_name = file.replace(".csv", f"{suffix}.csv")  # fixed
        out_path = os.path.join(out_folder, out_name)

        if skip_if_exists and os.path.exists(out_path):
            logger(f"‚Ü©Ô∏è  Skip residual (exists): {out_name}")
            continue

        try:
            df = pd.read_csv(in_path)
        except Exception as e:
            logger(f"‚ùå Failed to read {file}: {e}")
            continue

        cols = df.columns.tolist()
        made_any = False
        for col in cols:
            if demand_token in col:
                measured_col = col.replace(demand_token, measured_token)
                if measured_col in df.columns:
                    residual_col = col.replace(demand_token, residual_token)
                    df[residual_col] = df[col] - df[measured_col]
                    made_any = True

        if not made_any:
            logger(f"‚ö†Ô∏è  No Demand/Measured pairs found in {file}.")
        df.to_csv(out_path, index=False)
        logger(f"‚úÖ Residual CSV saved: {os.path.basename(out_path)}")


# =========================================================
# Scaling + robust threshold (MAD)
# =========================================================
def scale_features(X: pd.DataFrame, use_float32: bool = True):
    """
    Standardize features once and share across models.
    Returns (scaler, X_scaled np.array, X_tensor torch.tensor)
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    if use_float32:
        X_scaled = X_scaled.astype("float32")
    X_tensor = torch.from_numpy(X_scaled)
    return scaler, X_scaled, X_tensor


def robust_threshold(
    values: np.ndarray,
    k: float = 3.5,
    tail: str = "high",
    min_anoms: int = 5,
) -> Tuple[float, np.ndarray]:
    """
    MAD-based threshold: median ¬± k * 1.4826 * MAD
    tail = 'high' (right tail) or 'low' (left tail)
    Returns: (threshold, labels) labels aligned to 'values' (1=anomaly)
    """
    v = np.asarray(values)
    mask = ~np.isnan(v)
    v = v[mask]
    if v.size == 0:
        return (np.inf if tail == "high" else -np.inf), np.zeros_like(values, dtype=int)

    med = np.median(v)
    mad = np.median(np.abs(v - med)) + 1e-12
    if tail == "high":
        thr = med + k * 1.4826 * mad
        labels = (values > thr).astype(int)
    else:
        thr = med - k * 1.4826 * mad
        labels = (values < thr).astype(int)

    # relax if too strict on large arrays
    if labels.sum() < min_anoms and v.size >= 100:
        for k_relax in (3.0, 2.5, 2.0):
            if tail == "high":
                thr = med + k_relax * 1.4826 * mad
                labels = (values > thr).astype(int)
            else:
                thr = med - k_relax * 1.4826 * mad
                labels = (values < thr).astype(int)
            if labels.sum() >= min_anoms:
                break

    return thr, labels


# =========================================================
# Feature Engineering
# =========================================================
def prepare_features(
    df: pd.DataFrame,
    residual_cols: List[str],
    window: int = 5,
    max_features: int = 500,
    logger=print,
) -> Tuple[pd.DataFrame, List[str], Dict[str, int]]:
    """
    Create or reuse features: residual, delta, rolling mean/std
    Returns: X, feature_cols, stats (reused vs generated)
    """
    already_done = any(f"{residual_cols[0]}_delta" in df.columns for _ in residual_cols)
    stats = {"reused": 0, "generated": 0}

    if already_done:
        feature_cols = [
            c for c in df.columns
            if any(k in c for k in ["Residual", "_delta", "_rolling_mean", "_rolling_std"])
        ]
        X = df[feature_cols].dropna()
        stats["reused"] = len(feature_cols)
        logger(f"üîÅ Reusing {len(feature_cols)} engineered features.")
        return X, feature_cols, stats

    # Generate
    for col in residual_cols:
        df[f"{col}_delta"] = df[col].diff()
        df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
        df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()

    feature_cols = []
    for col in residual_cols:
        feature_cols += [
            col,
            f"{col}_delta",
            f"{col}_rolling_mean_{window}",
            f"{col}_rolling_std_{window}",
        ]

    X = df[feature_cols].dropna()
    stats["generated"] = len(feature_cols)
    logger(f"üõ†Ô∏è  Generated {len(feature_cols)} features (window={window}).")

    if X.shape[1] > max_features:
        logger(f"‚ùå Too many features ({X.shape[1]} > {max_features}). Skipping file.")
        return pd.DataFrame(), [], stats

    return X, feature_cols, stats


# =========================================================
# Models
# =========================================================
class Autoencoder(nn.Module):
    def __init__(self, input_dim: int):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, 32), nn.ReLU(), nn.Linear(32, 8))
        self.decoder = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, input_dim))

    def forward(self, x):
        return self.decoder(self.encoder(x))


def dense_autoencoder_detect(
    X_tensor: torch.Tensor, k: float, ae_epochs: int, ae_lr: float
) -> Tuple[np.ndarray, np.ndarray, float]:
    model = Autoencoder(X_tensor.shape[1])
    opt = optim.Adam(model.parameters(), lr=ae_lr)
    crit = nn.MSELoss()

    for _ in range(ae_epochs):
        opt.zero_grad()
        out = model(X_tensor)
        loss = crit(out, X_tensor)
        loss.backward()
        opt.step()

    with torch.no_grad():
        rec = model(X_tensor)
        errors = torch.mean((X_tensor - rec) ** 2, dim=1).cpu().numpy()

    thr, labels = robust_threshold(errors, k=k, tail="high")
    return labels.astype(int), errors, thr


class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True)

    def forward(self, x):
        _, (h, _) = self.encoder(x)  # [1, B, H]
        repeated = h.repeat(x.size(1), 1, 1).transpose(0, 1)  # [B, T, H]
        decoded, _ = self.decoder(repeated)
        return decoded


def make_sequences(X: np.ndarray, seq_len: int) -> Tuple[np.ndarray, List[int]]:
    seqs, idxs = [], []
    for i in range(len(X) - seq_len):
        seqs.append(X[i:i+seq_len])
        idxs.append(i + seq_len - 1)
    return np.array(sesqs := seqs), idxs  # keep identical shape to previous code; sesqs to avoid linter


def lstm_autoencoder_detect(
    X_scaled: np.ndarray,
    k: float,
    seq_len: int,
    hidden_dim: int,
    patience: int,
    max_sequences: int,
    downsample: int,
) -> Tuple[np.ndarray, np.ndarray, List[int], float]:
    try:
        Xds = X_scaled[::downsample]
        if len(Xds) < seq_len:
            return np.array([]), np.array([]), [], np.nan

        Xseq, idxs = make_sequences(Xds, seq_len)
        if len(Xseq) > max_sequences:
            Xseq, idxs = Xseq[:max_sequences], idxs[:max_sequences]

        Xt = torch.tensor(Xseq, dtype=torch.float32)
        model = LSTMAutoencoder(Xt.shape[2], hidden_dim)
        opt = optim.Adam(model.parameters(), lr=1e-3)
        crit = nn.MSELoss()

        best, wait = float("inf"), 0
        for _ in range(100):
            model.train()
            opt.zero_grad()
            out = model(Xt)
            loss = crit(out, Xt)
            loss.backward()
            opt.step()
            if loss.item() < best:
                best, wait = loss.item(), 0
            else:
                wait += 1
                if wait >= patience:
                    break

        with torch.no_grad():
            model.eval()
            out = model(Xt)
            errors = torch.mean((Xt - out) ** 2, dim=(1, 2)).cpu().numpy()

        thr, labels = robust_threshold(errors, k=k, tail="high")
        return labels.astype(int), errors, idxs, thr
    except RuntimeError as e:
        print(f"‚ö†Ô∏è LSTM memory error: {e}")
        return np.array([]), np.array([]), [], np.nan


def isolation_forest_detect(X_scaled: np.ndarray, k: float) -> Tuple[np.ndarray, np.ndarray, float]:
    iso = IsolationForest(contamination="auto", n_estimators=300, random_state=42)
    iso.fit(X_scaled)
    scores = -iso.decision_function(X_scaled)  # higher = more anomalous
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


def lof_detect(X_scaled: np.ndarray, k: float, n_neighbors: int) -> Tuple[np.ndarray, np.ndarray, float]:
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination="auto")
    _ = lof.fit_predict(X_scaled)  # populates negative_outlier_factor_
    scores = -lof.negative_outlier_factor_
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


# =========================================================
# NEW: Hybrid scoring utilities
# =========================================================
def _robust_z_pos(x: np.ndarray) -> np.ndarray:
    """Right-tail robust z-score (>=0 when above median)."""
    x = np.asarray(x, dtype=float)
    med = np.nanmedian(x)
    mad = np.nanmedian(np.abs(x - med)) + 1e-12
    z = (x - med) / (1.4826 * mad)
    z = np.where(np.isnan(z), np.nan, z)
    return np.maximum(z, 0.0)  # only right tail counts as anomalous


def _percentile01(x: np.ndarray) -> np.ndarray:
    """Map to [0,1] by robust percentiles (2‚Äì98). Values outside clamp."""
    x = np.asarray(x, dtype=float)
    lo = np.nanpercentile(x, 2)
    hi = np.nanpercentile(x, 98)
    rng = max(hi - lo, 1e-12)
    y = (x - lo) / rng
    return np.clip(y, 0.0, 1.0)


def compute_hybrid_score(df: pd.DataFrame, cfg: dict) -> np.ndarray:
    """
    Combine model scores into a single 'hybrid_score' using robust normalization
    and weights. Higher = more anomalous.
    """
    if not cfg.get("hybrid", {}).get("enabled", False):
        return np.full(len(df), np.nan)

    wmap = cfg["hybrid"]["weights"]
    method = cfg["hybrid"]["method"]

    # Which score columns to use (must be "higher = more anomalous" already)
    components = [c for c in ["iso_score", "lof_score", "ae_error", "lstm_error"] if c in df.columns and c in wmap]
    if not components:
        return np.full(len(df), np.nan)

    normers = []
    for c in components:
        arr = df[c].to_numpy()
        if method == "robust_z":
            norm = _robust_z_pos(arr)
            norm = np.clip(norm, 0, 10.0) / 10.0  # compress tails, map ~[0,1]
        else:  # "percentile"
            norm = _percentile01(arr)
        normers.append((c, norm, float(wmap.get(c, 0.0))))

    # Weighted average ignoring NaNs
    num = np.zeros(len(df), dtype=float)
    den = np.zeros(len(df), dtype=float)
    for _, norm, w in normers:
        m = ~np.isnan(norm)
        num[m] += w * norm[m]
        den[m] += w
    out = np.where(den > 0, num / den, np.nan)
    return out


# =========================================================
# Voting, episodes, explanations
# =========================================================
def generate_votes(df: pd.DataFrame) -> pd.DataFrame:
    df["agreement_all_4"] = (
        (df.get("ae_is_anomaly", 0) == 1)
        & (df.get("is_anomaly", 0) == 1)
        & (df.get("lof_is_anomaly", 0) == 1)
        & (df.get("lstm_is_anomaly", 0) == 1)
    ).astype(int)
    df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
    df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
    df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # NEW: plain voting (>=1)
    return df


def extract_voted_rows(df: pd.DataFrame, rule: str = "vote_3plus") -> pd.DataFrame:
    if rule == "vote_3plus":
        mask = df["vote_3plus"] == 1
    elif rule == "agreement_all_4":
        mask = df["agreement_all_4"] == 1
    elif rule == "any":
        mask = (
            (df["ae_is_anomaly"] == 1)
            | (df["is_anomaly"] == 1)
            | (df["lof_is_anomaly"] == 1)
            | (df["lstm_is_anomaly"] == 1)
        )
    else:
        raise ValueError(f"Unknown rule: {rule}")
    return df.loc[mask].copy()


def _group_runs(idxs: np.ndarray, min_gap: int = 1) -> List[Tuple[int, int]]:
    if len(idxs) == 0:
        return []
    runs, start, prev = [], int(idxs[0]), int(idxs[0])
    for i in idxs[1:]:
        if int(i) - prev <= min_gap:
            prev = int(i)
            continue
        runs.append((start, prev))
        start = int(i); prev = int(i)
    runs.append((start, prev))
    return runs


def summarize_episodes(voted_df: pd.DataFrame, min_gap: int = 1) -> pd.DataFrame:
    if voted_df.empty:
        return pd.DataFrame(columns=["source_file", "start_idx", "end_idx", "length", "n_models_mean"])

    idxs = voted_df.index.to_numpy()
    runs = _group_runs(idxs, min_gap=min_gap)

    rows = []
    for start, end in runs:
        chunk = voted_df.loc[start:end]
        row = {
            "source_file": chunk["source_file"].iloc[0] if "source_file" in chunk else "",
            "start_idx": start,
            "end_idx": end,
            "length": int(end - start + 1),
            "n_models_mean": float(
                chunk[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1).mean()
            ),
        }
        for c in ["iso_score", "ae_error", "lof_score", "lstm_error", "hybrid_score"]:
            if c in chunk.columns:
                row[f"{c}_max"] = float(chunk[c].max())
                row[f"{c}_mean"] = float(chunk[c].mean())
        rows.append(row)
    return pd.DataFrame(rows)


def _base_residual_columns(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def _models_string(chunk: pd.DataFrame) -> str:
    model_cols = [c for c in ["is_anomaly", "ae_is_anomaly", "lof_is_anomaly", "lstm_is_anomaly", "hybrid_is_anomaly"] if c in chunk.columns]
    if not model_cols:
        return "no-model-flags"
    means = chunk[model_cols].mean()
    active = [m.replace("_is_anomaly", "").upper() for m, v in means.items() if v >= 0.5]
    return ", ".join(active) if active else "weak/isolated flags"


def attach_episode_reasons(
    combined_df: pd.DataFrame, episodes_df: pd.DataFrame, top_k: int = 1
) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df

    base_res = _base_residual_columns(combined_df)
    if not base_res:
        episodes_df["primary_signal"] = ""
        episodes_df["reason"] = "no residual columns present"
        episodes_df["suspected_sensor"] = ""
        return episodes_df

    out = []
    for _, epi in episodes_df.iterrows():
        start, end = int(epi["start_idx"]), int(epi["end_idx"])
        mask = combined_df["source_file"] == epi["source_file"] if "source_file" in combined_df.columns else slice(None)
        chunk = combined_df.loc[mask].loc[start:end]

        if chunk.empty:
            epi["primary_signal"] = ""
            epi["reason"] = "empty slice"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats = []
        for col in base_res:
            if col in chunk.columns:
                stats.append((col, float(chunk[col].abs().max())))
        if not stats:
            epi["primary_signal"] = ""
            epi["reason"] = "no residual stats"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats.sort(key=lambda x: x[1], reverse=True)
        primary_signal, primary_val = stats[:top_k][0]
        models_str = _models_string(chunk)
        measured_col = primary_signal.replace("Residual", "Measured")
        suspected = measured_col if (measured_col in combined_df.columns) else "unknown-measured-sensor"

        epi["primary_signal"] = primary_signal
        epi["reason"] = f"max |{primary_signal}| = {primary_val:.3f}; models: {models_str}"
        epi["suspected_sensor"] = suspected
        out.append(epi)

    return pd.DataFrame(out)


# =========================================================
# Hardware mapping + root cause scoring
# =========================================================
HARDWARE_MAP = [
    ("Force_",         "Actuator/LoadCell",  "Force didn‚Äôt follow demand ‚Üí friction/lag/saturation/load-cell drift likely"),
    ("Encoder_",       "Encoders/Alignment", "Pose/velocity mismatch ‚Üí quantization/missing counts/misalignment"),
    ("Accelerometer_", "IMU/Accelerometer",  "Vibration bursts ‚Üí mounting/looseness/thermal drift"),
    ("State_",         "Control/Timing",     "Requested vs achieved state diverged ‚Üí scheduler limits/controller windup"),
]

def map_signal_to_hardware(primary_signal: str):
    for needle, hw, why in HARDWARE_MAP:
        if needle in primary_signal:
            return hw, why
    return "Unknown", "No mapping rule matched"


def enrich_hardware_mapping(episodes_df: pd.DataFrame) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df
    episodes_df = episodes_df.copy()
    episodes_df["hardware_class"] = ""
    episodes_df["hardware_why"] = ""
    for i, r in episodes_df.iterrows():
        hw, why = map_signal_to_hardware(r.get("primary_signal", ""))
        episodes_df.at[i, "hardware_class"] = hw
        episodes_df.at[i, "hardware_why"]   = why
    return episodes_df


def _paired_columns(primary_signal: str, cfg: dict) -> Tuple[Optional[str], Optional[str]]:
    resid_tok  = cfg["signals"]["residual_token"]
    demand_tok = cfg["signals"]["demand_token"]
    measured_tok = cfg["signals"]["measured_token"]
    if resid_tok not in primary_signal:
        return None, None
    demand_col   = primary_signal.replace(resid_tok, demand_tok)
    measured_col = primary_signal.replace(resid_tok, measured_tok)
    return demand_col, measured_col


def _nan_ok(arr: np.ndarray) -> np.ndarray:
    return np.asarray(arr, dtype=float)


def _cross_correlation_lag(x: np.ndarray, y: np.ndarray, sample_rate_hz: Optional[float]) -> Tuple[float, int]:
    x = _nan_ok(x); y = _nan_ok(y)
    if len(x) != len(y) or len(x) == 0:
        return (np.nan, 0)
    x = x - np.nanmean(x); y = y - np.nanmean(y)
    x = np.nan_to_num(x);  y = np.nan_to_num(y)
    corr = np.correlate(x, y, mode="full")
    lags = np.arange(-len(x)+1, len(x))
    k = int(np.argmax(corr))
    lag_samples = int(lags[k])
    lag_seconds = lag_samples / sample_rate_hz if sample_rate_hz and sample_rate_hz > 0 else np.nan
    return (lag_seconds, lag_samples)


def _saturation_score(demand: np.ndarray, residual: np.ndarray, cfg: dict) -> float:
    if len(demand) == 0 or len(residual) == 0:
        return 0.0
    p_dem = np.nanpercentile(demand, cfg["scores"]["saturation_pct"])
    p_res = np.nanpercentile(np.abs(residual), cfg["scores"]["resid_prominence_pct"])
    near_limit = demand >= p_dem
    large_res  = np.abs(residual) >= p_res
    both = np.logical_and(near_limit, large_res)
    return float(np.nansum(both)) / max(1, len(demand))


def _drift_score(residual: np.ndarray) -> float:
    residual = _nan_ok(residual)
    mu = float(np.nanmean(residual))
    sd = float(np.nanstd(residual)) + 1e-9
    return abs(mu) / sd


def _vibration_score(signal: np.ndarray, sample_rate_hz: Optional[float]) -> float:
    if not sample_rate_hz or sample_rate_hz <= 0 or len(signal) < 8:
        return np.nan
    sig = np.nan_to_num(signal - np.nanmean(signal))
    fft = np.fft.rfft(sig)
    power = np.abs(fft) ** 2
    freqs = np.fft.rfftfreq(len(sig), d=1.0 / sample_rate_hz)
    if len(freqs) == 0:
        return np.nan
    cutoff = 0.25 * (sample_rate_hz / 2.0)  # > Nyquist/4
    mask_hi = freqs >= cutoff
    num = float(np.nansum(power[mask_hi]))
    den = float(np.nansum(power) + 1e-12)
    return num / den


def score_episodes(combined_df: pd.DataFrame, episodes_df: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    """
    Adds: lag_seconds, lag_samples, saturation_score, drift_score, vibe_score
    """
    if episodes_df.empty:
        return episodes_df
    out = episodes_df.copy()
    sr = cfg["signals"]["sample_rate_hz"]
    min_len = cfg["scores"]["min_window_len"]

    if "primary_signal" not in out.columns:
        out["primary_signal"] = ""

    for i, r in out.iterrows():
        start, end = int(r["start_idx"]), int(r["end_idx"])
        if end - start + 1 < min_len:
            out.at[i, "lag_seconds"] = np.nan
            out.at[i, "lag_samples"] = 0
            out.at[i, "saturation_score"] = 0.0
            out.at[i, "drift_score"] = 0.0
            out.at[i, "vibe_score"] = np.nan
            continue

        if "source_file" in combined_df.columns and "source_file" in out.columns and "source_file" in r:
            chunk = combined_df.loc[(combined_df["source_file"] == r["source_file"])].loc[start:end]
        else:
            chunk = combined_df.loc[start:end]

        primary = r.get("primary_signal", "")
        demand_col, measured_col = _paired_columns(primary, cfg)

        resid = chunk[primary].values if (primary in chunk.columns) else np.array([])
        dem   = chunk[demand_col].values if (demand_col and demand_col in chunk.columns) else np.array([])
        meas  = chunk[measured_col].values if (measured_col and measured_col in chunk.columns) else np.array([])

        lag_s, lag_k = _cross_correlation_lag(dem, meas, sr) if (len(dem) and len(meas)) else (np.nan, 0)
        sat_sc = _saturation_score(dem, resid, cfg) if (len(dem) and len(resid)) else 0.0
        dr_sc  = _drift_score(resid) if len(resid) else 0.0
        if "Accelerometer_" in primary and primary in chunk.columns:
            vibe_sc = _vibration_score(chunk[primary].values, sr)
        else:
            vibe_sc = _vibration_score(resid, sr)

        out.at[i, "lag_seconds"]       = lag_s
        out.at[i, "lag_samples"]       = int(lag_k)
        out.at[i, "saturation_score"]  = float(sat_sc)
        out.at[i, "drift_score"]       = float(dr_sc)
        out.at[i, "vibe_score"]        = float(vibe_sc) if vibe_sc == vibe_sc else np.nan
    return out


# =========================================================
# Plotting helpers (per-file voted overlays)
# =========================================================
def _pick_residual(df: pd.DataFrame) -> Optional[str]:
    cand = [c for c in df.columns if "Residual" in c and not any(t in c for t in ["_delta", "_rolling_"])]
    return cand[0] if cand else None


def plot_voted_for_file(
    df_file: pd.DataFrame,
    out_dir: str,
    rule: str,
    min_gap: int,
    figsize: Tuple[int, int] = (12, 5),
) -> Optional[str]:
    ensure_dir(out_dir)
    residual_col = _pick_residual(df_file)
    if residual_col is None:
        print("‚ö†Ô∏è No residual column to plot.")
        return None

    voted_rows = extract_voted_rows(df_file, rule=rule)
    episodes = summarize_episodes(voted_rows, min_gap=min_gap)

    plt.figure(figsize=figsize)
    plt.plot(df_file.index, df_file[residual_col], label=residual_col, alpha=0.85)

    if not voted_rows.empty:
        plt.scatter(voted_rows.index, voted_rows[residual_col], s=12, label=f"Voted anomalies ({rule})")

    if not episodes.empty:
        for _, r in episodes.iterrows():
            plt.axvspan(r["start_idx"], r["end_idx"], alpha=0.15, label="Episode")
        handles, labels = plt.gca().get_legend_handles_labels()
        uniq, seen = [], set()
        for h, l in zip(handles, labels):
            if l not in seen:
                uniq.append((h, l)); seen.add(l)
        handles, labels = zip(*uniq)
        plt.legend(handles, labels)
    else:
        plt.legend()

    sf = df_file["source_file"].iloc[0] if "source_file" in df_file.columns else "file"
    plt.title(f"{sf} ‚Äî Residual with voted anomalies & episodes")
    plt.xlabel("Index"); plt.ylabel(residual_col)
    plt.tight_layout()
    out_path = os.path.join(out_dir, f"voted_plot_{safe_name(sf)}.png")
    plt.savefig(out_path, dpi=160); plt.close()
    return out_path


def plot_all_files(combined_df: pd.DataFrame, out_dir: str, rule: str, min_gap: int, max_files: Optional[int] = None):
    paths = []
    if "source_file" not in combined_df.columns:
        print("‚ö†Ô∏è combined_df missing 'source_file'.")
        return paths
    groups = list(combined_df.groupby("source_file"))
    if max_files is not None:
        groups = groups[:max_files]
    for fname, df_file in groups:
        p = plot_voted_for_file(df_file, out_dir=out_dir, rule=rule, min_gap=min_gap)
        if p:
            paths.append(p); print(f"üñºÔ∏è Saved: {p}")
    if not paths:
        print("‚ö†Ô∏è No plots produced.")
    return paths


# =========================================================
# Sensor attribution, clustering & heatmap
# =========================================================
def _residual_cols_base(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def build_sensor_table(
    combined: pd.DataFrame,
    voted_rows: pd.DataFrame,
    episodes_with_reasons: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    base_res = _residual_cols_base(combined)
    if not base_res:
        return pd.DataFrame()

    total_rows = len(combined)
    voted_mask = pd.Series(False, index=combined.index)
    if not voted_rows.empty:
        voted_mask.loc[voted_rows.index] = True

    rows = []
    expected_keys = [
        "anomaly_rate_is",
        "anomaly_rate_ae",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "anomaly_rate_hybrid",    # NEW
        "anomaly_rate_vote3p",
        "anomaly_rate_vote_any",  # NEW
        "mean_abs_resid_voted",
        "max_abs_resid_voted",
        "episodes_as_primary",
    ]

    for col in base_res:
        stats = {"sensor": col}

        # Model rates (per total rows)
        stats["anomaly_rate_is"]   = float(combined["is_anomaly"].sum())   / max(total_rows, 1) if "is_anomaly"   in combined.columns else 0.0
        stats["anomaly_rate_ae"]   = float(combined["ae_is_anomaly"].sum())/ max(total_rows, 1) if "ae_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lof"]  = float(combined["lof_is_anomaly"].sum())/max(total_rows, 1) if "lof_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lstm"] = float(combined["lstm_is_anomaly"].fillna(0).sum())/max(total_rows, 1) if "lstm_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_hybrid"] = float(combined["hybrid_is_anomaly"].sum())/max(total_rows, 1) if "hybrid_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_vote3p"] = float(combined["vote_3plus"].sum())/max(total_rows, 1) if "vote_3plus" in combined.columns else 0.0
        stats["anomaly_rate_vote_any"] = float(combined["vote_any"].sum())/max(total_rows, 1) if "vote_any" in combined.columns else 0.0

        # Mean/Max |residual| during voted anomalies
        if col in combined.columns and voted_mask.any():
            vals = combined.loc[voted_mask, col].abs()
            stats["mean_abs_resid_voted"] = float(vals.mean()) if not vals.empty else 0.0
            stats["max_abs_resid_voted"]  = float(vals.max())  if not vals.empty else 0.0
        else:
            stats["mean_abs_resid_voted"] = 0.0
            stats["max_abs_resid_voted"]  = 0.0

        # Episodes where this residual was primary
        if episodes_with_reasons is not None and not episodes_with_reasons.empty and "primary_signal" in episodes_with_reasons.columns:
            stats["episodes_as_primary"] = int((episodes_with_reasons["primary_signal"] == col).sum())
        else:
            stats["episodes_as_primary"] = 0

        for k in expected_keys:
            stats.setdefault(k, 0.0)

        rows.append(stats)

    sensor_df = pd.DataFrame(rows)
    for c in sensor_df.columns:
        if c != "sensor":
            sensor_df[c] = sensor_df[c].fillna(0.0)
    return sensor_df


def cluster_sensors(
    sensor_df: pd.DataFrame,
    n_clusters: int = 3,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    if sensor_df.empty or "sensor" not in sensor_df.columns:
        return sensor_df, np.empty((0, 2)), np.empty((0, 2))

    features = sensor_df.drop(columns=["sensor"]).to_numpy(dtype=np.float32)
    if features.shape[0] < n_clusters:
        n_clusters = max(1, features.shape[0])

    scaler = StandardScaler()
    Z = scaler.fit_transform(features)

    km = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
    labels = km.fit_predict(Z)

    pca = PCA(n_components=2, random_state=random_state)
    Z2 = pca.fit_transform(Z)
    centers2 = pca.transform(km.cluster_centers_)

    out = sensor_df.copy()
    out["cluster"] = labels

    return out, Z2, centers2


def plot_sensor_bar_top(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metric: str = "episodes_as_primary",
    top_n: int = 15,
    title: Optional[str] = None,
) -> Optional[str]:
    if sensor_df.empty or metric not in sensor_df.columns:
        return None

    ensure_dir(out_dir)
    df = sensor_df.sort_values(metric, ascending=False).head(top_n)

    plt.figure(figsize=(12, 6))
    plt.bar(range(len(df)), df[metric])
    plt.xticks(range(len(df)), [s.replace("Force_", "F_") for s in df["sensor"]], rotation=60, ha="right")
    plt.ylabel(metric)
    plt.title(title or f"Top {top_n} sensors by {metric}")
    plt.tight_layout()

    path = os.path.join(out_dir, f"top_sensors_{metric}.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_clusters_scatter(
    sensor_df_with_cluster: pd.DataFrame,
    Z2: np.ndarray,
    centers2: np.ndarray,
    out_dir: str,
    title: str = "Sensor clusters (PCA of features)",
) -> Optional[str]:
    if sensor_df_with_cluster.empty or Z2.size == 0:
        return None

    ensure_dir(out_dir)
    plt.figure(figsize=(9, 7))

    clusters = sorted(sensor_df_with_cluster["cluster"].unique().tolist())
    for cl in clusters:
        mask = sensor_df_with_cluster["cluster"] == cl
        pts = Z2[mask.values]
        plt.scatter(pts[:, 0], pts[:, 1], label=f"cluster {cl}", alpha=0.8, s=36)

    if centers2.size:
        plt.scatter(centers2[:, 0], centers2[:, 1], marker="X", s=120, label="centers")

    try:
        top_lab = sensor_df_with_cluster.sort_values("episodes_as_primary", ascending=False).head(10).index
        for idx in top_lab:
            plt.text(Z2[idx, 0], Z2[idx, 1], sensor_df_with_cluster.loc[idx, "sensor"], fontsize=8)
    except Exception:
        pass

    plt.title(title)
    plt.xlabel("PCA-1"); plt.ylabel("PCA-2")
    plt.legend()
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_clusters_pca.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_heatmap(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metrics: Optional[List[str]] = None,
    title: str = "Sensor anomaly fingerprint (rates & magnitudes)",
) -> Optional[str]:
    if sensor_df.empty:
        return None
    ensure_dir(out_dir)

    desired = [
        "anomaly_rate_vote3p",
        "anomaly_rate_vote_any",  # NEW
        "anomaly_rate_hybrid",    # NEW
        "anomaly_rate_ae",
        "anomaly_rate_is",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "mean_abs_resid_voted",
        "max_abs_resid_voted",
    ]
    if metrics is None:
        metrics = desired

    available = [m for m in metrics if m in sensor_df.columns]
    if not available:
        print("‚ö†Ô∏è No requested heatmap metrics are present in sensor_df. Skipping heatmap.")
        return None
    if len(available) < len(metrics):
        missing = [m for m in metrics if m not in sensor_df.columns]
        print(f"‚ÑπÔ∏è Skipping missing metrics in heatmap: {missing}")
    metrics = available

    key_rank = "episodes_as_primary" if "episodes_as_primary" in sensor_df.columns else metrics[0]
    keep = sensor_df.sort_values(key_rank, ascending=False).head(25)

    M = keep[metrics].to_numpy(dtype=np.float32)
    plt.figure(figsize=(12, 8))
    plt.imshow(M, aspect="auto")
    plt.colorbar()
    plt.yticks(range(len(keep)), keep["sensor"])
    plt.xticks(range(len(metrics)), metrics, rotation=45, ha="right")
    plt.title(title)
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_fingerprint_heatmap.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


# =========================================================
# Report
# =========================================================
def _overlay_episode_plot(df: pd.DataFrame, episode_row: pd.Series, cfg: dict, ax=None):
    start, end = int(episode_row["start_idx"]), int(episode_row["end_idx"])
    primary = episode_row.get("primary_signal", "")
    demand_col, measured_col = _paired_columns(primary, cfg)
    if ax is None:
        ax = plt.gca()

    t = np.arange(start, end + 1)
    if primary in df.columns:
        ax.plot(t, df.loc[start:end, primary].values, label=f"{primary}", alpha=0.85)
    if demand_col and demand_col in df.columns:
        ax.plot(t, df.loc[start:end, demand_col].values, label=f"{demand_col}", alpha=0.8)
    if measured_col and measured_col in df.columns:
        ax.plot(t, df.loc[start:end, measured_col].values, label=f"{measured_col}", alpha=0.8)

    ax.set_xlabel("Index")
    ax.set_title(f"Episode {start}‚Äì{end}\nprimary={primary}")
    ax.legend(loc="best")


def build_ops_report(
    combined: pd.DataFrame,
    summary: pd.DataFrame,
    sensor_df: pd.DataFrame,
    episodes_scored: pd.DataFrame,
    cfg: dict,
    out_pdf_path: str
):
    ensure_dir(os.path.dirname(out_pdf_path))
    with PdfPages(out_pdf_path) as pdf:

        # Page 1 ‚Äî Anomalies counts by model (dynamic columns incl. Hybrid + Plain voting)
        plt.figure(figsize=(11, 6))
        plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus","vote_any"] if c in summary.columns]
        summary[plot_cols].plot(kind="bar")
        plt.title("Anomalies per Model per File")
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        pdf.savefig(); plt.close()

        # Page 2 ‚Äî Top sensors by episodes_as_primary
        p1 = plot_sensor_bar_top(sensor_df, out_dir=cfg["io"]["output_folder"], metric="episodes_as_primary", top_n=15,
                                 title="Top sensors by episodes_as_primary")
        if p1 and os.path.exists(p1):
            img = plt.imread(p1)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Page 3 ‚Äî Sensor heatmap (if created)
        p2 = plot_sensor_heatmap(sensor_df, out_dir=cfg["io"]["output_folder"])
        if p2 and os.path.exists(p2):
            img = plt.imread(p2)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Pages 4+ ‚Äî Example episode overlays
        if not episodes_scored.empty:
            candidates = episodes_scored.copy()
            if "n_models_mean" in candidates.columns:
                candidates = candidates.sort_values(["n_models_mean"], ascending=False)
            n_show = min(cfg["report"]["top_n_episodes"], len(candidates))
            for _, epi in candidates.head(n_show).iterrows():
                plt.figure(figsize=(11, 5))
                if "source_file" in combined.columns and "source_file" in epi:
                    sub = combined.loc[combined["source_file"] == epi["source_file"]]
                else:
                    sub = combined
                _overlay_episode_plot(sub, epi, cfg, ax=plt.gca())
                hw = epi.get("hardware_class", "Unknown")
                why = epi.get("hardware_why", "")
                lag_s = epi.get("lag_seconds", np.nan)
                sat   = epi.get("saturation_score", np.nan)
                drift = epi.get("drift_score", np.nan)
                vibe  = epi.get("vibe_score", np.nan)
                txt = (
                    f"hardware: {hw}\n"
                    f"why: {why}\n"
                    f"lag_seconds: {lag_s:.4f}  |  saturation: {sat:.3f}  |  drift: {drift:.3f}  |  vibe: {vibe:.3f}"
                )
                plt.gcf().text(0.02, 0.02, txt, ha="left", va="bottom", fontsize=9)
                plt.tight_layout()
                pdf.savefig(); plt.close()


# =========================================================
# Config (defaults or JSON)
# =========================================================
def default_config() -> dict:
    return {
        "io": {
            "input_folder": "./Datasets/Datasets",
            "residual_folder": "./Anomaly_detection/residual_created/",
            "output_folder": "./Anomaly_detection/code/outputs/"
        },
        "residuals": {
            "enabled": True,
            "demand_token": "Demand",
            "measured_token": "Measured",
            "residual_token": "Residual",
            "suffix": "_residual"
        },
        "features": {
            "window": 5,
            "max_features": 500
        },
        "threshold": {
            "k": 3.5
        },
        "ae": {
            "epochs": 50,
            "lr": 0.001
        },
        "lstm": {
            "seq_len": 5,
            "hidden_dim": 64,
            "patience": 5,
            "max_sequences": 3000,
            "downsample": 5
        },
        "lof": {
            "n_neighbors": 20
        },
        "hybrid": {  # NEW
            "enabled": True,
            "method": "robust_z",   # "robust_z" | "percentile"
            "weights": {
                "iso_score": 0.25,
                "lof_score": 0.25,
                "ae_error": 0.25,
                "lstm_error": 0.25
            }
        },
        "voting": {
            "rule": "vote_3plus",    # "vote_3plus" | "agreement_all_4" | "any"
            "min_gap": 1
        },
        "plots": {
            "enabled": True,
            "max_files": None
        },
        "runtime": {
            "use_float32": True
        },
        "signals": {
            "sample_rate_hz": 100.0,     # set None if unknown
            "residual_token": "Residual",
            "demand_token": "Demand",
            "measured_token": "Measured"
        },
        "scores": {
            "saturation_pct": 95.0,
            "resid_prominence_pct": 95.0,
            "min_window_len": 5
        },
        "report": {
            "enabled": True,
            "top_n_episodes": 3
        }
    }


def load_config_from_path_or_default(path: Optional[str]) -> dict:
    if path and os.path.exists(path):
        with open(path, "r") as f:
            return json.load(f)
    print("‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.")
    return default_config()


# =========================================================
# Per-file processing & Pipeline
# =========================================================
def process_file(file_path: str, cfg: Dict, logger=print) -> Optional[pd.DataFrame]:
    df = pd.read_csv(file_path)
    file_name = os.path.basename(file_path).replace(".csv", "")

    residual_cols = [c for c in df.columns if "Residual" in c]
    if not residual_cols:
        logger(f"‚ùå Skipped {file_name}: No residuals found.")
        return None

    X, feature_cols, fe_stats = prepare_features(
        df, residual_cols,
        window=cfg["features"]["window"],
        max_features=cfg["features"]["max_features"],
        logger=logger,
    )
    if X is None or len(feature_cols) == 0 or X.empty:
        logger(f"‚ùå Skipped {file_name}: invalid or empty features")
        return None

    scaler, X_scaled, X_tensor = scale_features(X, use_float32=cfg["runtime"]["use_float32"])

    iso_labels, iso_scores, iso_thr = isolation_forest_detect(X_scaled, k=cfg["threshold"]["k"])
    df.loc[X.index, "is_anomaly"] = iso_labels
    df.loc[X.index, "iso_score"] = iso_scores
    df.loc[X.index, "iso_thr"] = iso_thr

    ae_labels, ae_errors, ae_thr = dense_autoencoder_detect(
        X_tensor, k=cfg["threshold"]["k"], ae_epochs=cfg["ae"]["epochs"], ae_lr=cfg["ae"]["lr"]
    )
    df.loc[X.index, "ae_is_anomaly"] = ae_labels
    df.loc[X.index, "ae_error"] = ae_errors
    df.loc[X.index, "ae_thr"] = ae_thr

    lof_labels, lof_scores, lof_thr = lof_detect(
        X_scaled, k=cfg["threshold"]["k"], n_neighbors=cfg["lof"]["n_neighbors"]
    )
    df.loc[X.index, "lof_is_anomaly"] = lof_labels
    df.loc[X.index, "lof_score"] = lof_scores
    df.loc[X.index, "lof_thr"] = lof_thr

    lstm_labels, lstm_errors, lstm_idx, lstm_thr = lstm_autoencoder_detect(
        X_scaled,
        k=cfg["threshold"]["k"],
        seq_len=cfg["lstm"]["seq_len"],
        hidden_dim=cfg["lstm"]["hidden_dim"],
        patience=cfg["lstm"]["patience"],
        max_sequences=cfg["lstm"]["max_sequences"],
        downsample=cfg["lstm"]["downsample"],
    )
    if len(lstm_idx) > 0:
        df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
        df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
        df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
    else:
        df["lstm_is_anomaly"] = 0
        df["lstm_error"] = np.nan
        df["lstm_thr"] = np.nan

    # --- NEW: Hybrid score (weighted fusion)
    hybrid = compute_hybrid_score(df, cfg)
    df["hybrid_score"] = hybrid
    hs = df.loc[X.index, "hybrid_score"].to_numpy()
    if np.isnan(hs).all():
        df.loc[X.index, "hybrid_is_anomaly"] = 0
        df.loc[X.index, "hybrid_thr"] = np.nan
    else:
        hthr, hlabels = robust_threshold(hs, k=cfg["threshold"]["k"], tail="high")
        df.loc[X.index, "hybrid_is_anomaly"] = hlabels.astype(int)
        df.loc[X.index, "hybrid_thr"] = hthr

    df = generate_votes(df)  # includes vote_3plus + vote_any
    df["source_file"] = file_name
    df["fe_reused"] = fe_stats.get("reused", 0)
    df["fe_generated"] = fe_stats.get("generated", 0)

    logger(
        f"[{file_name}] iso={int(df['is_anomaly'].sum())} | "
        f"ae={int(df['ae_is_anomaly'].sum())} | "
        f"lof={int(df['lof_is_anomaly'].sum())} | "
        f"lstm={int(df['lstm_is_anomaly'].fillna(0).sum())} | "
        f"hyb={int(df['hybrid_is_anomaly'].sum())} | "
        f"vote3+={int(df['vote_3plus'].sum())} | any={int(df['vote_any'].sum())}"
    )
    return df


def run_pipeline(cfg: Dict):
    logger = print

    # A) residuals (optional)
    if cfg["residuals"]["enabled"]:
        logger("üîß Creating residuals...")
        create_residuals_for_folder(
            in_folder=cfg["io"]["input_folder"],
            out_folder=cfg["io"]["residual_folder"],
            demand_token=cfg["residuals"]["demand_token"],
            measured_token=cfg["residuals"]["measured_token"],
            residual_token=cfg["residuals"]["residual_token"],
            skip_if_exists=True,
            suffix=cfg["residuals"]["suffix"],
            logger=logger,
        )
        data_folder = cfg["io"]["residual_folder"]
    else:
        data_folder = cfg["io"]["input_folder"]

    # B) per-file
    all_dfs = []
    for file in os.listdir(data_folder):
        if file.endswith(".csv"):
            out = process_file(os.path.join(data_folder, file), cfg, logger=logger)
            if out is not None:
                all_dfs.append(out)

    if not all_dfs:
        logger("‚ùå No files processed.")
        return

    combined = pd.concat(all_dfs, ignore_index=True)
    ensure_dir(cfg["io"]["output_folder"])

    combined_path = os.path.join(cfg["io"]["output_folder"], "combined_anomaly_results.csv")
    combined.to_csv(combined_path, index=False)

    cols = ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus","vote_any"]
    cols = [c for c in cols if c in combined.columns]
    summary = combined.groupby("source_file")[cols].sum()
    summary["total_anomalies"] = summary.sum(axis=1)
    summary_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_summary.csv")
    summary.to_csv(summary_path)

    logger(f"‚úÖ Saved row-level: {combined_path}")
    logger(f"‚úÖ Saved summary:   {summary_path}")

    # C) Counts plot (dynamic cols incl. Hybrid + Plain voting)
    plt.figure(figsize=(12, 6))
    plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus","vote_any"] if c in summary.columns]
    summary[plot_cols].plot(kind="bar", figsize=(12, 6))
    plt.title("Anomalies per Model per File")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    bar_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_plot.png")
    plt.savefig(bar_path); plt.close()
    logger(f"üñºÔ∏è Saved: {bar_path}")

    # D) Voted rows + episodes + reasons
    voted_rows = extract_voted_rows(combined, rule=cfg["voting"]["rule"])
    voted_dir = os.path.join(cfg["io"]["output_folder"], "voted_outputs")
    ensure_dir(voted_dir)
    voted_rows_path = os.path.join(voted_dir, "voted_anomalies_rows.csv")
    voted_rows.to_csv(voted_rows_path, index=False)

    episodes = summarize_episodes(voted_rows, min_gap=cfg["voting"]["min_gap"])
    episodes_path = os.path.join(voted_dir, "voted_anomaly_episodes.csv")
    episodes.to_csv(episodes_path, index=False)

    episodes_with_reasons = attach_episode_reasons(combined, episodes, top_k=1)
    episodes_with_reasons = enrich_hardware_mapping(episodes_with_reasons)
    episodes_scored = score_episodes(combined, episodes_with_reasons, cfg)

    episodes_reason_path = os.path.join(voted_dir, "voted_anomaly_episodes_with_reasons.csv")
    episodes_scored_path = os.path.join(voted_dir, "voted_anomaly_episodes_with_reasons_and_scores.csv")
    episodes_with_reasons.to_csv(episodes_reason_path, index=False)
    episodes_scored.to_csv(episodes_scored_path, index=False)
    logger(f"‚úÖ Saved episodes+reason: {episodes_reason_path}")
    logger(f"‚úÖ Saved episodes+scores: {episodes_scored_path}")

    # E) Per-file plots with voted overlays (optional)
    if cfg["plots"]["enabled"]:
        _ = plot_all_files(
            combined_df=combined,
            out_dir=voted_dir,
            rule=cfg["voting"]["rule"],
            min_gap=cfg["voting"]["min_gap"],
            max_files=cfg["plots"]["max_files"],
        )

    # F) Sensor table + clustering visuals
    sensor_df = build_sensor_table(combined, voted_rows, episodes_with_reasons=episodes_with_reasons)
    sensor_df_path = os.path.join(voted_dir, "sensor_table.csv")
    sensor_df.to_csv(sensor_df_path, index=False)
    logger(f"‚úÖ Saved sensor table: {sensor_df_path}")

    clustered, Z2, centers2 = cluster_sensors(sensor_df, n_clusters=3, random_state=42)
    _ = plot_sensor_clusters_scatter(clustered, Z2, centers2, out_dir=voted_dir)
    _ = plot_sensor_heatmap(sensor_df, out_dir=voted_dir)
    _ = plot_sensor_bar_top(sensor_df, out_dir=voted_dir, metric="episodes_as_primary", top_n=15)

    # G) PDF report
    if cfg.get("report", {}).get("enabled", True):
        pdf_path = os.path.join(cfg["io"]["output_folder"], "ops_report.pdf")
        build_ops_report(
            combined=combined,
            summary=summary,
            sensor_df=sensor_df,
            episodes_scored=episodes_scored,
            cfg=cfg,
            out_pdf_path=pdf_path
        )
        logger(f"üìÑ Ops report saved: {pdf_path}")


# =========================================================
# Entrypoint
# =========================================================
def main():
    parser = argparse.ArgumentParser(description="Anomaly Detection Product")
    parser.add_argument("--config", type=str, default=None, help="Path to config JSON")
    args, _ = parser.parse_known_args()  # allows notebook execution

    cfg = load_config_from_path_or_default(args.config)
    run_pipeline(cfg)


if __name__ == "__main__":
    main()


‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.
üîß Creating residuals...
‚Ü©Ô∏è  Skip residual (exists): Dataset01_Ski_CrossbeamYawNotPerforming_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset02_Matrix_Rocker4EncoderNotWorking_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset03_Wushu_YawTrapezoidNormal_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset04_Wushu_YawWaveletSqueak_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset05_Wushu_LaneChanges_ModelBump_residual.csv
‚ùå Failed to read Dataset07_Demo_Spa_GT.csv: No columns to parse from file
‚Ü©Ô∏è  Skip residual (exists): Dataset08_Demo_Jiggler_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset09_Demo_VerticalChirp_residual.csv
‚ùå Failed to read Dataset10_Demo_MillbrookHills.csv: No columns to parse from file


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  out = np.where(den > 0, num / den, np.nan)
  df["hybrid_score"] = hybrid
  df.loc[X.index, "hybrid_is_anomaly"] = hlabels.astype(int)
  df.loc[X.index, "hybrid_thr"] = hthr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # NEW: plain voting (>=1

[Dataset01_Ski_CrossbeamYawNotPerforming_residual] iso=6081 | ae=12194 | lof=1724 | lstm=213 | hyb=18005 | vote3+=217 | any=14514
‚ùå Skipped Dataset02_Matrix_Rocker4EncoderNotWorking_residual: No residuals found.


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  out = np.where(den > 0, num / den, np.nan)
  df["hybrid_score"] = hybrid
  df.loc[X.index, "hybrid_is_anomaly"] = hlabels.astype(int)
  df.loc[X.index, "hybrid_thr"] = hthr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # NEW: plain voting (>=1

[Dataset03_Wushu_YawTrapezoidNormal_residual] iso=4353 | ae=16691 | lof=2735 | lstm=141 | hyb=32491 | vote3+=363 | any=19849


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  out = np.where(den > 0, num / den, np.nan)
  df["hybrid_score"] = hybrid
  df.loc[X.index, "hybrid_is_anomaly"] = hlabels.astype(int)
  df.loc[X.index, "hybrid_thr"] = hthr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # NEW: plain voting (>=1

[Dataset04_Wushu_YawWaveletSqueak_residual] iso=5912 | ae=5593 | lof=1179 | lstm=722 | hyb=7239 | vote3+=655 | any=7004


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  out = np.where(den > 0, num / den, np.nan)
  df["hybrid_score"] = hybrid
  df.loc[X.index, "hybrid_is_anomaly"] = hlabels.astype(int)
  df.loc[X.index, "hybrid_thr"] = hthr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # NEW: plain voting (>=1

[Dataset05_Wushu_LaneChanges_ModelBump_residual] iso=7951 | ae=7923 | lof=1177 | lstm=335 | hyb=11666 | vote3+=118 | any=11004
‚ùå Skipped Dataset08_Demo_Jiggler_residual: No residuals found.
‚ùå Skipped Dataset09_Demo_VerticalChirp_residual: No residuals found.
‚úÖ Saved row-level: ./Anomaly_detection/code/outputs/combined_anomaly_results.csv
‚úÖ Saved summary:   ./Anomaly_detection/code/outputs/model_comparison_summary.csv
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/model_comparison_plot.png
‚úÖ Saved episodes+reason: ./Anomaly_detection/code/outputs/voted_outputs\voted_anomaly_episodes_with_reasons.csv
‚úÖ Saved episodes+scores: ./Anomaly_detection/code/outputs/voted_outputs\voted_anomaly_episodes_with_reasons_and_scores.csv
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/voted_outputs\voted_plot_Dataset01_Ski_CrossbeamYawNotPerforming_residual.png
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/voted_outputs\voted_plot_Dataset03_Wushu_YawTrapezoidNormal_residual.png
üñºÔ∏è Sav



üìÑ Ops report saved: ./Anomaly_detection/code/outputs/ops_report.pdf


<Figure size 1200x600 with 0 Axes>

<Figure size 1100x600 with 0 Axes>

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Anomaly Detection Product (single script) ‚Äî Extended & Calibrated
- Residual creation (Demand - Measured -> Residual)
- Feature engineering (reuse if already present)
- Scaling once -> shared across models
- Models: IsolationForest, LOF, Dense AE, LSTM AE
- Dynamic thresholds (MAD) for all scores
- Voting (3+) + episodes (merged runs) + vote_any (>=1 model)
- Hybrid scoring (weighted fusion) with MAD or quantile threshold + fallback
- Episode explanations (primary signal, suspected sensor)
- Hardware mapping & root-cause scoring (lag/saturation/drift/vibe)
- Sensor ranking, clustering & heatmap
- Multi-page PDF Ops Report (dynamic first page bars)
- Adds anomaly RATE plot (percent of rows)
- Config-driven (JSON) OR safe defaults (no args)
"""

import os
import json
import argparse
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Torch
import torch
import torch.nn as nn
import torch.optim as optim

# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


# =========================================================
# Utils
# =========================================================
def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def safe_name(name: str) -> str:
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in str(name))


# =========================================================
# Residual creation (optional)
# =========================================================
def create_residuals_for_folder(
    in_folder: str,
    out_folder: str,
    demand_token: str = "Demand",
    measured_token: str = "Measured",
    residual_token: str = "Residual",
    skip_if_exists: bool = True,
    suffix: str = "_residual",
    logger=print,
) -> None:
    ensure_dir(out_folder)
    for file in os.listdir(in_folder):
        if not file.endswith(".csv"):
            continue

        in_path = os.path.join(in_folder, file)
        out_name = file.replace(".csv", f"{suffix}.csv")
        out_path = os.path.join(out_folder, out_name)

        if skip_if_exists and os.path.exists(out_path):
            logger(f"‚Ü©Ô∏è  Skip residual (exists): {out_name}")
            continue

        try:
            df = pd.read_csv(in_path)
        except Exception as e:
            logger(f"‚ùå Failed to read {file}: {e}")
            continue

        cols = df.columns.tolist()
        made_any = False
        for col in cols:
            if demand_token in col:
                measured_col = col.replace(demand_token, measured_token)
                if measured_col in df.columns:
                    residual_col = col.replace(demand_token, residual_token)
                    df[residual_col] = df[col] - df[measured_col]
                    made_any = True

        if not made_any:
            logger(f"‚ö†Ô∏è  No Demand/Measured pairs found in {file}.")
        df.to_csv(out_path, index=False)
        logger(f"‚úÖ Residual CSV saved: {os.path.basename(out_path)}")


# =========================================================
# Scaling + robust threshold (MAD)
# =========================================================
def scale_features(X: pd.DataFrame, use_float32: bool = True):
    """
    Standardize features once and share across models.
    Returns (scaler, X_scaled np.array, X_tensor torch.tensor)
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    if use_float32:
        X_scaled = X_scaled.astype("float32")
    X_tensor = torch.from_numpy(X_scaled)
    return scaler, X_scaled, X_tensor


def robust_threshold(
    values: np.ndarray,
    k: float = 3.5,
    tail: str = "high",
    min_anoms: int = 5,
) -> Tuple[float, np.ndarray]:
    """
    MAD-based threshold: median ¬± k * 1.4826 * MAD
    tail = 'high' (right tail) or 'low' (left tail)
    Returns: (threshold, labels) labels aligned to 'values' (1=anomaly)
    """
    v = np.asarray(values)
    mask = ~np.isnan(v)
    v = v[mask]
    if v.size == 0:
        return (np.inf if tail == "high" else -np.inf), np.zeros_like(values, dtype=int)

    med = np.median(v)
    mad = np.median(np.abs(v - med)) + 1e-12
    if tail == "high":
        thr = med + k * 1.4826 * mad
        labels = (values > thr).astype(int)
    else:
        thr = med - k * 1.4826 * mad
        labels = (values < thr).astype(int)

    # relax if too strict on large arrays
    if labels.sum() < min_anoms and v.size >= 100:
        for k_relax in (3.0, 2.5, 2.0):
            if tail == "high":
                thr = med + k_relax * 1.4826 * mad
                labels = (values > thr).astype(int)
            else:
                thr = med - k_relax * 1.4826 * mad
                labels = (values < thr).astype(int)
            if labels.sum() >= min_anoms:
                break

    return thr, labels


# =========================================================
# Feature Engineering
# =========================================================
def prepare_features(
    df: pd.DataFrame,
    residual_cols: List[str],
    window: int = 5,
    max_features: int = 500,
    logger=print,
) -> Tuple[pd.DataFrame, List[str], Dict[str, int]]:
    """
    Create or reuse features: residual, delta, rolling mean/std
    Returns: X, feature_cols, stats (reused vs generated)
    """
    already_done = any(f"{residual_cols[0]}_delta" in df.columns for _ in residual_cols)
    stats = {"reused": 0, "generated": 0}

    if already_done:
        feature_cols = [
            c for c in df.columns
            if any(k in c for k in ["Residual", "_delta", "_rolling_mean", "_rolling_std"])
        ]
        X = df[feature_cols].dropna()
        stats["reused"] = len(feature_cols)
        logger(f"üîÅ Reusing {len(feature_cols)} engineered features.")
        return X, feature_cols, stats

    # Generate
    for col in residual_cols:
        df[f"{col}_delta"] = df[col].diff()
        df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
        df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()

    feature_cols = []
    for col in residual_cols:
        feature_cols += [
            col,
            f"{col}_delta",
            f"{col}_rolling_mean_{window}",
            f"{col}_rolling_std_{window}",
        ]

    X = df[feature_cols].dropna()
    stats["generated"] = len(feature_cols)
    logger(f"üõ†Ô∏è  Generated {len(feature_cols)} features (window={window}).")

    if X.shape[1] > max_features:
        logger(f"‚ùå Too many features ({X.shape[1]} > {max_features}). Skipping file.")
        return pd.DataFrame(), [], stats

    return X, feature_cols, stats


# =========================================================
# Models
# =========================================================
class Autoencoder(nn.Module):
    def __init__(self, input_dim: int):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, 32), nn.ReLU(), nn.Linear(32, 8))
        self.decoder = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, input_dim))

    def forward(self, x):
        return self.decoder(self.encoder(x))


def dense_autoencoder_detect(
    X_tensor: torch.Tensor, k: float, ae_epochs: int, ae_lr: float
) -> Tuple[np.ndarray, np.ndarray, float]:
    model = Autoencoder(X_tensor.shape[1])
    opt = optim.Adam(model.parameters(), lr=ae_lr)
    crit = nn.MSELoss()

    for _ in range(ae_epochs):
        opt.zero_grad()
        out = model(X_tensor)
        loss = crit(out, X_tensor)
        loss.backward()
        opt.step()

    with torch.no_grad():
        rec = model(X_tensor)
        errors = torch.mean((X_tensor - rec) ** 2, dim=1).cpu().numpy()

    thr, labels = robust_threshold(errors, k=k, tail="high")
    return labels.astype(int), errors, thr


class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True)

    def forward(self, x):
        _, (h, _) = self.encoder(x)  # [1, B, H]
        repeated = h.repeat(x.size(1), 1, 1).transpose(0, 1)  # [B, T, H]
        decoded, _ = self.decoder(repeated)
        return decoded


def make_sequences(X: np.ndarray, seq_len: int) -> Tuple[np.ndarray, List[int]]:
    seqs, idxs = [], []
    for i in range(len(X) - seq_len):
        seqs.append(X[i:i+seq_len])
        idxs.append(i + seq_len - 1)
    return np.array(seqs), idxs


def lstm_autoencoder_detect(
    X_scaled: np.ndarray,
    k: float,
    seq_len: int,
    hidden_dim: int,
    patience: int,
    max_sequences: int,
    downsample: int,
) -> Tuple[np.ndarray, np.ndarray, List[int], float]:
    try:
        Xds = X_scaled[::downsample]
        if len(Xds) < seq_len:
            return np.array([]), np.array([]), [], np.nan

        Xseq, idxs = make_sequences(Xds, seq_len)
        if len(Xseq) > max_sequences:
            Xseq, idxs = Xseq[:max_sequences], idxs[:max_sequences]

        Xt = torch.tensor(Xseq, dtype=torch.float32)
        model = LSTMAutoencoder(Xt.shape[2], hidden_dim)
        opt = optim.Adam(model.parameters(), lr=1e-3)
        crit = nn.MSELoss()

        best, wait = float("inf"), 0
        for _ in range(100):
            model.train()
            opt.zero_grad()
            out = model(Xt)
            loss = crit(out, Xt)
            loss.backward()
            opt.step()
            if loss.item() < best:
                best, wait = loss.item(), 0
            else:
                wait += 1
                if wait >= patience:
                    break

        with torch.no_grad():
            model.eval()
            out = model(Xt)
            errors = torch.mean((Xt - out) ** 2, dim=(1, 2)).cpu().numpy()

        thr, labels = robust_threshold(errors, k=k, tail="high")
        return labels.astype(int), errors, idxs, thr
    except RuntimeError as e:
        print(f"‚ö†Ô∏è LSTM memory error: {e}")
        return np.array([]), np.array([]), [], np.nan


def isolation_forest_detect(X_scaled: np.ndarray, k: float) -> Tuple[np.ndarray, np.ndarray, float]:
    iso = IsolationForest(contamination="auto", n_estimators=300, random_state=42)
    iso.fit(X_scaled)
    scores = -iso.decision_function(X_scaled)  # higher = more anomalous
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


def lof_detect(X_scaled: np.ndarray, k: float, n_neighbors: int) -> Tuple[np.ndarray, np.ndarray, float]:
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination="auto")
    _ = lof.fit_predict(X_scaled)  # populates negative_outlier_factor_
    scores = -lof.negative_outlier_factor_
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


# =========================================================
# Hybrid scoring utilities
# =========================================================
def _robust_z_pos(x: np.ndarray) -> np.ndarray:
    """Right-tail robust z-score (>=0 when above median)."""
    x = np.asarray(x, dtype=float)
    med = np.nanmedian(x)
    mad = np.nanmedian(np.abs(x - med)) + 1e-12
    z = (x - med) / (1.4826 * mad)
    z = np.where(np.isnan(z), np.nan, z)
    return np.maximum(z, 0.0)  # only right tail counts as anomalous


def _percentile01(x: np.ndarray) -> np.ndarray:
    """Map to [0,1] by robust percentiles (2‚Äì98). Values outside clamp."""
    x = np.asarray(x, dtype=float)
    lo = np.nanpercentile(x, 2)
    hi = np.nanpercentile(x, 98)
    rng = max(hi - lo, 1e-12)
    y = (x - lo) / rng
    return np.clip(y, 0.0, 1.0)


def compute_hybrid_score_on_mask(df: pd.DataFrame, cfg: dict, mask_idx) -> np.ndarray:
    """
    Compute hybrid only on valid rows (mask_idx). Returns array the size of df,
    NaN elsewhere. Requires >= min_components present.
    """
    out = np.full(len(df), np.nan)
    if not cfg.get("hybrid", {}).get("enabled", False):
        return out

    # Weights guard
    wmap = cfg.get("hybrid", {}).get("weights")
    if not isinstance(wmap, dict) or not wmap:
        wmap = {"iso_score": 0.25, "lof_score": 0.25, "ae_error": 0.25, "lstm_error": 0.25}

    method = cfg["hybrid"].get("method", "robust_z")
    min_components = int(cfg["hybrid"].get("min_components", 2))

    use = df.loc[mask_idx]  # restrict to valid feature rows

    comps = [c for c in ["iso_score", "lof_score", "ae_error", "lstm_error"] if c in use.columns and c in wmap]
    if not comps:
        return out

    parts = []
    for c in comps:
        arr = use[c].to_numpy(dtype=float)
        if method == "robust_z":
            norm = _robust_z_pos(arr)
            norm = np.clip(norm, 0, 10.0) / 10.0  # compress extreme tails to ~[0,1]
        else:
            norm = _percentile01(arr)
        parts.append((norm, float(wmap[c])))

    num = np.zeros(len(use), dtype=float)
    den = np.zeros(len(use), dtype=float)
    present = np.zeros(len(use), dtype=int)

    for norm, w in parts:
        m = ~np.isnan(norm)
        num[m] += w * norm[m]
        den[m] += w
        present[m] += 1

    hybrid_local = np.where((den > 0) & (present >= min_components), num / den, np.nan)
    out[np.asarray(mask_idx)] = hybrid_local
    return out


# =========================================================
# Voting, episodes, explanations
# =========================================================
def generate_votes(df: pd.DataFrame) -> pd.DataFrame:
    df["agreement_all_4"] = (
        (df.get("ae_is_anomaly", 0) == 1)
        & (df.get("is_anomaly", 0) == 1)
        & (df.get("lof_is_anomaly", 0) == 1)
        & (df.get("lstm_is_anomaly", 0) == 1)
    ).astype(int)
    df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
    df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
    df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # plain voting (>=1)
    return df


def extract_voted_rows(df: pd.DataFrame, rule: str = "vote_3plus") -> pd.DataFrame:
    if rule == "vote_3plus":
        mask = df["vote_3plus"] == 1
    elif rule == "agreement_all_4":
        mask = df["agreement_all_4"] == 1
    elif rule == "any":
        mask = (
            (df["ae_is_anomaly"] == 1)
            | (df["is_anomaly"] == 1)
            | (df["lof_is_anomaly"] == 1)
            | (df["lstm_is_anomaly"] == 1)
        )
    else:
        raise ValueError(f"Unknown rule: {rule}")
    return df.loc[mask].copy()


def _group_runs(idxs: np.ndarray, min_gap: int = 1) -> List[Tuple[int, int]]:
    if len(idxs) == 0:
        return []
    runs, start, prev = [], int(idxs[0]), int(idxs[0])
    for i in idxs[1:]:
        if int(i) - prev <= min_gap:
            prev = int(i)
            continue
        runs.append((start, prev))
        start = int(i); prev = int(i)
    runs.append((start, prev))
    return runs


def summarize_episodes(voted_df: pd.DataFrame, min_gap: int = 1) -> pd.DataFrame:
    if voted_df.empty:
        return pd.DataFrame(columns=["source_file", "start_idx", "end_idx", "length", "n_models_mean"])

    idxs = voted_df.index.to_numpy()
    runs = _group_runs(idxs, min_gap=min_gap)

    rows = []
    for start, end in runs:
        chunk = voted_df.loc[start:end]
        row = {
            "source_file": chunk["source_file"].iloc[0] if "source_file" in chunk else "",
            "start_idx": start,
            "end_idx": end,
            "length": int(end - start + 1),
            "n_models_mean": float(
                chunk[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1).mean()
            ),
        }
        for c in ["iso_score", "ae_error", "lof_score", "lstm_error", "hybrid_score"]:
            if c in chunk.columns:
                row[f"{c}_max"] = float(chunk[c].max())
                row[f"{c}_mean"] = float(chunk[c].mean())
        rows.append(row)
    return pd.DataFrame(rows)


def _base_residual_columns(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def _models_string(chunk: pd.DataFrame) -> str:
    model_cols = [c for c in ["is_anomaly", "ae_is_anomaly", "lof_is_anomaly", "lstm_is_anomaly", "hybrid_is_anomaly"] if c in chunk.columns]
    if not model_cols:
        return "no-model-flags"
    means = chunk[model_cols].mean()
    active = [m.replace("_is_anomaly", "").upper() for m, v in means.items() if v >= 0.5]
    return ", ".join(active) if active else "weak/isolated flags"


def attach_episode_reasons(
    combined_df: pd.DataFrame, episodes_df: pd.DataFrame, top_k: int = 1
) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df

    base_res = _base_residual_columns(combined_df)
    if not base_res:
        episodes_df["primary_signal"] = ""
        episodes_df["reason"] = "no residual columns present"
        episodes_df["suspected_sensor"] = ""
        return episodes_df

    out = []
    for _, epi in episodes_df.iterrows():
        start, end = int(epi["start_idx"]), int(epi["end_idx"])
        mask = combined_df["source_file"] == epi["source_file"] if "source_file" in combined_df.columns else slice(None)
        chunk = combined_df.loc[mask].loc[start:end]

        if chunk.empty:
            epi["primary_signal"] = ""
            epi["reason"] = "empty slice"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats = []
        for col in base_res:
            if col in chunk.columns:
                stats.append((col, float(chunk[col].abs().max())))
        if not stats:
            epi["primary_signal"] = ""
            epi["reason"] = "no residual stats"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats.sort(key=lambda x: x[1], reverse=True)
        primary_signal, primary_val = stats[:top_k][0]
        models_str = _models_string(chunk)
        measured_col = primary_signal.replace("Residual", "Measured")
        suspected = measured_col if (measured_col in combined_df.columns) else "unknown-measured-sensor"

        epi["primary_signal"] = primary_signal
        epi["reason"] = f"max |{primary_signal}| = {primary_val:.3f}; models: {models_str}"
        epi["suspected_sensor"] = suspected
        out.append(epi)

    return pd.DataFrame(out)


# =========================================================
# Hardware mapping + root cause scoring
# =========================================================
HARDWARE_MAP = [
    ("Force_",         "Actuator/LoadCell",  "Force didn‚Äôt follow demand ‚Üí friction/lag/saturation/load-cell drift likely"),
    ("Encoder_",       "Encoders/Alignment", "Pose/velocity mismatch ‚Üí quantization/missing counts/misalignment"),
    ("Accelerometer_", "IMU/Accelerometer",  "Vibration bursts ‚Üí mounting/looseness/thermal drift"),
    ("State_",         "Control/Timing",     "Requested vs achieved state diverged ‚Üí scheduler limits/controller windup"),
]

def map_signal_to_hardware(primary_signal: str):
    for needle, hw, why in HARDWARE_MAP:
        if needle in primary_signal:
            return hw, why
    return "Unknown", "No mapping rule matched"


def enrich_hardware_mapping(episodes_df: pd.DataFrame) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df
    episodes_df = episodes_df.copy()
    episodes_df["hardware_class"] = ""
    episodes_df["hardware_why"] = ""
    for i, r in episodes_df.iterrows():
        hw, why = map_signal_to_hardware(r.get("primary_signal", ""))
        episodes_df.at[i, "hardware_class"] = hw
        episodes_df.at[i, "hardware_why"]   = why
    return episodes_df


def _paired_columns(primary_signal: str, cfg: dict) -> Tuple[Optional[str], Optional[str]]:
    resid_tok  = cfg["signals"]["residual_token"]
    demand_tok = cfg["signals"]["demand_token"]
    measured_tok = cfg["signals"]["measured_token"]
    if resid_tok not in primary_signal:
        return None, None
    demand_col   = primary_signal.replace(resid_tok, demand_tok)
    measured_col = primary_signal.replace(resid_tok, measured_tok)
    return demand_col, measured_col


def _nan_ok(arr: np.ndarray) -> np.ndarray:
    return np.asarray(arr, dtype=float)


def _cross_correlation_lag(x: np.ndarray, y: np.ndarray, sample_rate_hz: Optional[float]) -> Tuple[float, int]:
    x = _nan_ok(x); y = _nan_ok(y)
    if len(x) != len(y) or len(x) == 0:
        return (np.nan, 0)
    x = x - np.nanmean(x); y = y - np.nanmean(y)
    x = np.nan_to_num(x);  y = np.nan_to_num(y)
    corr = np.correlate(x, y, mode="full")
    lags = np.arange(-len(x)+1, len(x))
    k = int(np.argmax(corr))
    lag_samples = int(lags[k])
    lag_seconds = lag_samples / sample_rate_hz if sample_rate_hz and sample_rate_hz > 0 else np.nan
    return (lag_seconds, lag_samples)


def _saturation_score(demand: np.ndarray, residual: np.ndarray, cfg: dict) -> float:
    if len(demand) == 0 or len(residual) == 0:
        return 0.0
    p_dem = np.nanpercentile(demand, cfg["scores"]["saturation_pct"])
    p_res = np.nanpercentile(np.abs(residual), cfg["scores"]["resid_prominence_pct"])
    near_limit = demand >= p_dem
    large_res  = np.abs(residual) >= p_res
    both = np.logical_and(near_limit, large_res)
    return float(np.nansum(both)) / max(1, len(demand))


def _drift_score(residual: np.ndarray) -> float:
    residual = _nan_ok(residual)
    mu = float(np.nanmean(residual))
    sd = float(np.nanstd(residual)) + 1e-9
    return abs(mu) / sd


def _vibration_score(signal: np.ndarray, sample_rate_hz: Optional[float]) -> float:
    if not sample_rate_hz or sample_rate_hz <= 0 or len(signal) < 8:
        return np.nan
    sig = np.nan_to_num(signal - np.nanmean(signal))
    fft = np.fft.rfft(sig)
    power = np.abs(fft) ** 2
    freqs = np.fft.rfftfreq(len(sig), d=1.0 / sample_rate_hz)
    if len(freqs) == 0:
        return np.nan
    cutoff = 0.25 * (sample_rate_hz / 2.0)  # > Nyquist/4
    mask_hi = freqs >= cutoff
    num = float(np.nansum(power[mask_hi]))
    den = float(np.nansum(power) + 1e-12)
    return num / den


def score_episodes(combined_df: pd.DataFrame, episodes_df: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    """
    Adds: lag_seconds, lag_samples, saturation_score, drift_score, vibe_score
    """
    if episodes_df.empty:
        return episodes_df
    out = episodes_df.copy()
    sr = cfg["signals"]["sample_rate_hz"]
    min_len = cfg["scores"]["min_window_len"]

    if "primary_signal" not in out.columns:
        out["primary_signal"] = ""

    for i, r in out.iterrows():
        start, end = int(r["start_idx"]), int(r["end_idx"])
        if end - start + 1 < min_len:
            out.at[i, "lag_seconds"] = np.nan
            out.at[i, "lag_samples"] = 0
            out.at[i, "saturation_score"] = 0.0
            out.at[i, "drift_score"] = 0.0
            out.at[i, "vibe_score"] = np.nan
            continue

        if "source_file" in combined_df.columns and "source_file" in out.columns and "source_file" in r:
            chunk = combined_df.loc[(combined_df["source_file"] == r["source_file"])].loc[start:end]
        else:
            chunk = combined_df.loc[start:end]

        primary = r.get("primary_signal", "")
        demand_col, measured_col = _paired_columns(primary, cfg)

        resid = chunk[primary].values if (primary in chunk.columns) else np.array([])
        dem   = chunk[demand_col].values if (demand_col and demand_col in chunk.columns) else np.array([])
        meas  = chunk[measured_col].values if (measured_col and measured_col in chunk.columns) else np.array([])

        lag_s, lag_k = _cross_correlation_lag(dem, meas, sr) if (len(dem) and len(meas)) else (np.nan, 0)
        sat_sc = _saturation_score(dem, resid, cfg) if (len(dem) and len(resid)) else 0.0
        dr_sc  = _drift_score(resid) if len(resid) else 0.0
        if "Accelerometer_" in primary and primary in chunk.columns:
            vibe_sc = _vibration_score(chunk[primary].values, sr)
        else:
            vibe_sc = _vibration_score(resid, sr)

        out.at[i, "lag_seconds"]       = lag_s
        out.at[i, "lag_samples"]       = int(lag_k)
        out.at[i, "saturation_score"]  = float(sat_sc)
        out.at[i, "drift_score"]       = float(dr_sc)
        out.at[i, "vibe_score"]        = float(vibe_sc) if vibe_sc == vibe_sc else np.nan
    return out


# =========================================================
# Plotting helpers (per-file voted overlays)
# =========================================================
def _pick_residual(df: pd.DataFrame) -> Optional[str]:
    cand = [c for c in df.columns if "Residual" in c and not any(t in c for t in ["_delta", "_rolling_"])]
    return cand[0] if cand else None


def plot_voted_for_file(
    df_file: pd.DataFrame,
    out_dir: str,
    rule: str,
    min_gap: int,
    figsize: Tuple[int, int] = (12, 5),
) -> Optional[str]:
    ensure_dir(out_dir)
    residual_col = _pick_residual(df_file)
    if residual_col is None:
        print("‚ö†Ô∏è No residual column to plot.")
        return None

    voted_rows = extract_voted_rows(df_file, rule=rule)
    episodes = summarize_episodes(voted_rows, min_gap=min_gap)

    plt.figure(figsize=figsize)
    plt.plot(df_file.index, df_file[residual_col], label=residual_col, alpha=0.85)

    if not voted_rows.empty:
        plt.scatter(voted_rows.index, voted_rows[residual_col], s=12, label=f"Voted anomalies ({rule})")

    if not episodes.empty:
        for _, r in episodes.iterrows():
            plt.axvspan(r["start_idx"], r["end_idx"], alpha=0.15, label="Episode")
        handles, labels = plt.gca().get_legend_handles_labels()
        uniq, seen = [], set()
        for h, l in zip(handles, labels):
            if l not in seen:
                uniq.append((h, l)); seen.add(l)
        handles, labels = zip(*uniq)
        plt.legend(handles, labels)
    else:
        plt.legend()

    sf = df_file["source_file"].iloc[0] if "source_file" in df_file.columns else "file"
    plt.title(f"{sf} ‚Äî Residual with voted anomalies & episodes")
    plt.xlabel("Index"); plt.ylabel(residual_col)
    plt.tight_layout()
    out_path = os.path.join(out_dir, f"voted_plot_{safe_name(sf)}.png")
    plt.savefig(out_path, dpi=160); plt.close()
    return out_path


def plot_all_files(combined_df: pd.DataFrame, out_dir: str, rule: str, min_gap: int, max_files: Optional[int] = None):
    paths = []
    if "source_file" not in combined_df.columns:
        print("‚ö†Ô∏è combined_df missing 'source_file'.")
        return paths
    groups = list(combined_df.groupby("source_file"))
    if max_files is not None:
        groups = groups[:max_files]
    for fname, df_file in groups:
        p = plot_voted_for_file(df_file, out_dir=out_dir, rule=rule, min_gap=min_gap)
        if p:
            paths.append(p); print(f"üñºÔ∏è Saved: {p}")
    if not paths:
        print("‚ö†Ô∏è No plots produced.")
    return paths


# =========================================================
# Sensor attribution, clustering & heatmap
# =========================================================
def _residual_cols_base(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def build_sensor_table(
    combined: pd.DataFrame,
    voted_rows: pd.DataFrame,
    episodes_with_reasons: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    base_res = _residual_cols_base(combined)
    if not base_res:
        return pd.DataFrame()

    total_rows = len(combined)
    voted_mask = pd.Series(False, index=combined.index)
    if not voted_rows.empty:
        voted_mask.loc[voted_rows.index] = True

    rows = []
    expected_keys = [
        "anomaly_rate_is",
        "anomaly_rate_ae",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "anomaly_rate_hybrid",
        "anomaly_rate_vote3p",
        "anomaly_rate_vote_any",
        "mean_abs_resid_voted",
        "max_abs_resid_voted",
        "episodes_as_primary",
    ]

    for col in base_res:
        stats = {"sensor": col}

        # Model rates (per total rows)
        stats["anomaly_rate_is"]   = float(combined["is_anomaly"].sum())   / max(total_rows, 1) if "is_anomaly"   in combined.columns else 0.0
        stats["anomaly_rate_ae"]   = float(combined["ae_is_anomaly"].sum())/ max(total_rows, 1) if "ae_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lof"]  = float(combined["lof_is_anomaly"].sum())/max(total_rows, 1) if "lof_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lstm"] = float(combined["lstm_is_anomaly"].fillna(0).sum())/max(total_rows, 1) if "lstm_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_hybrid"] = float(combined["hybrid_is_anomaly"].sum())/max(total_rows, 1) if "hybrid_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_vote3p"] = float(combined["vote_3plus"].sum())/max(total_rows, 1) if "vote_3plus" in combined.columns else 0.0
        stats["anomaly_rate_vote_any"] = float(combined["vote_any"].sum())/max(total_rows, 1) if "vote_any" in combined.columns else 0.0

        # Mean/Max |residual| during voted anomalies
        if col in combined.columns and voted_mask.any():
            vals = combined.loc[voted_mask, col].abs()
            stats["mean_abs_resid_voted"] = float(vals.mean()) if not vals.empty else 0.0
            stats["max_abs_resid_voted"]  = float(vals.max())  if not vals.empty else 0.0
        else:
            stats["mean_abs_resid_voted"] = 0.0
            stats["max_abs_resid_voted"]  = 0.0

        # Episodes where this residual was primary
        if episodes_with_reasons is not None and not episodes_with_reasons.empty and "primary_signal" in episodes_with_reasons.columns:
            stats["episodes_as_primary"] = int((episodes_with_reasons["primary_signal"] == col).sum())
        else:
            stats["episodes_as_primary"] = 0

        for k in expected_keys:
            stats.setdefault(k, 0.0)

        rows.append(stats)

    sensor_df = pd.DataFrame(rows)
    for c in sensor_df.columns:
        if c != "sensor":
            sensor_df[c] = sensor_df[c].fillna(0.0)
    return sensor_df


def cluster_sensors(
    sensor_df: pd.DataFrame,
    n_clusters: int = 3,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    if sensor_df.empty or "sensor" not in sensor_df.columns:
        return sensor_df, np.empty((0, 2)), np.empty((0, 2))

    features = sensor_df.drop(columns=["sensor"]).to_numpy(dtype=np.float32)
    if features.shape[0] < n_clusters:
        n_clusters = max(1, features.shape[0])

    scaler = StandardScaler()
    Z = scaler.fit_transform(features)

    km = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
    labels = km.fit_predict(Z)

    pca = PCA(n_components=2, random_state=random_state)
    Z2 = pca.fit_transform(Z)
    centers2 = pca.transform(km.cluster_centers_)

    out = sensor_df.copy()
    out["cluster"] = labels

    return out, Z2, centers2


def plot_sensor_bar_top(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metric: str = "episodes_as_primary",
    top_n: int = 15,
    title: Optional[str] = None,
) -> Optional[str]:
    if sensor_df.empty or metric not in sensor_df.columns:
        return None

    ensure_dir(out_dir)
    df = sensor_df.sort_values(metric, ascending=False).head(top_n)

    plt.figure(figsize=(12, 6))
    plt.bar(range(len(df)), df[metric])
    plt.xticks(range(len(df)), [s.replace("Force_", "F_") for s in df["sensor"]], rotation=60, ha="right")
    plt.ylabel(metric)
    plt.title(title or f"Top {top_n} sensors by {metric}")
    plt.tight_layout()

    path = os.path.join(out_dir, f"top_sensors_{metric}.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_clusters_scatter(
    sensor_df_with_cluster: pd.DataFrame,
    Z2: np.ndarray,
    centers2: np.ndarray,
    out_dir: str,
    title: str = "Sensor clusters (PCA of features)",
) -> Optional[str]:
    if sensor_df_with_cluster.empty or Z2.size == 0:
        return None

    ensure_dir(out_dir)
    plt.figure(figsize=(9, 7))

    clusters = sorted(sensor_df_with_cluster["cluster"].unique().tolist())
    for cl in clusters:
        mask = sensor_df_with_cluster["cluster"] == cl
        pts = Z2[mask.values]
        plt.scatter(pts[:, 0], pts[:, 1], label=f"cluster {cl}", alpha=0.8, s=36)

    if centers2.size:
        plt.scatter(centers2[:, 0], centers2[:, 1], marker="X", s=120, label="centers")

    try:
        top_lab = sensor_df_with_cluster.sort_values("episodes_as_primary", ascending=False).head(10).index
        for idx in top_lab:
            plt.text(Z2[idx, 0], Z2[idx, 1], sensor_df_with_cluster.loc[idx, "sensor"], fontsize=8)
    except Exception:
        pass

    plt.title(title)
    plt.xlabel("PCA-1"); plt.ylabel("PCA-2")
    plt.legend()
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_clusters_pca.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_heatmap(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metrics: Optional[List[str]] = None,
    title: str = "Sensor anomaly fingerprint (rates & magnitudes)",
) -> Optional[str]:
    if sensor_df.empty:
        return None
    ensure_dir(out_dir)

    desired = [
        "anomaly_rate_vote3p",
        "anomaly_rate_vote_any",
        "anomaly_rate_hybrid",
        "anomaly_rate_ae",
        "anomaly_rate_is",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "mean_abs_resid_voted",
        "max_abs_resid_voted",
    ]
    if metrics is None:
        metrics = desired

    available = [m for m in metrics if m in sensor_df.columns]
    if not available:
        print("‚ö†Ô∏è No requested heatmap metrics are present in sensor_df. Skipping heatmap.")
        return None
    if len(available) < len(metrics):
        missing = [m for m in metrics if m not in sensor_df.columns]
        print(f"‚ÑπÔ∏è Skipping missing metrics in heatmap: {missing}")
    metrics = available

    key_rank = "episodes_as_primary" if "episodes_as_primary" in sensor_df.columns else metrics[0]
    keep = sensor_df.sort_values(key_rank, ascending=False).head(25)

    M = keep[metrics].to_numpy(dtype=np.float32)
    plt.figure(figsize=(12, 8))
    plt.imshow(M, aspect="auto")
    plt.colorbar()
    plt.yticks(range(len(keep)), keep["sensor"])
    plt.xticks(range(len(metrics)), metrics, rotation=45, ha="right")
    plt.title(title)
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_fingerprint_heatmap.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


# =========================================================
# Report
# =========================================================
def _overlay_episode_plot(df: pd.DataFrame, episode_row: pd.Series, cfg: dict, ax=None):
    start, end = int(episode_row["start_idx"]), int(episode_row["end_idx"])
    primary = episode_row.get("primary_signal", "")
    demand_col, measured_col = _paired_columns(primary, cfg)
    if ax is None:
        ax = plt.gca()

    t = np.arange(start, end + 1)
    if primary in df.columns:
        ax.plot(t, df.loc[start:end, primary].values, label=f"{primary}", alpha=0.85)
    if demand_col and demand_col in df.columns:
        ax.plot(t, df.loc[start:end, demand_col].values, label=f"{demand_col}", alpha=0.8)
    if measured_col and measured_col in df.columns:
        ax.plot(t, df.loc[start:end, measured_col].values, label=f"{measured_col}", alpha=0.8)

    ax.set_xlabel("Index")
    ax.set_title(f"Episode {start}‚Äì{end}\nprimary={primary}")
    ax.legend(loc="best")


def build_ops_report(
    combined: pd.DataFrame,
    summary: pd.DataFrame,
    sensor_df: pd.DataFrame,
    episodes_scored: pd.DataFrame,
    cfg: dict,
    out_pdf_path: str
):
    ensure_dir(os.path.dirname(out_pdf_path))
    with PdfPages(out_pdf_path) as pdf:

        # Page 1 ‚Äî Anomalies counts by model (dynamic columns)
        plt.figure(figsize=(11, 6))
        plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus","vote_any"] if c in summary.columns]
        summary[plot_cols].plot(kind="bar")
        plt.title("Anomalies per Model per File")
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        pdf.savefig(); plt.close()

        # Page 2 ‚Äî Top sensors by episodes_as_primary
        p1 = plot_sensor_bar_top(sensor_df, out_dir=cfg["io"]["output_folder"], metric="episodes_as_primary", top_n=15,
                                 title="Top sensors by episodes_as_primary")
        if p1 and os.path.exists(p1):
            img = plt.imread(p1)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Page 3 ‚Äî Sensor heatmap (if created)
        p2 = plot_sensor_heatmap(sensor_df, out_dir=cfg["io"]["output_folder"])
        if p2 and os.path.exists(p2):
            img = plt.imread(p2)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Pages 4+ ‚Äî Example episode overlays
        if not episodes_scored.empty:
            candidates = episodes_scored.copy()
            if "n_models_mean" in candidates.columns:
                candidates = candidates.sort_values(["n_models_mean"], ascending=False)
            n_show = min(cfg["report"]["top_n_episodes"], len(candidates))
            for _, epi in candidates.head(n_show).iterrows():
                plt.figure(figsize=(11, 5))
                if "source_file" in combined.columns and "source_file" in epi:
                    sub = combined.loc[combined["source_file"] == epi["source_file"]]
                else:
                    sub = combined
                _overlay_episode_plot(sub, epi, cfg, ax=plt.gca())
                hw = epi.get("hardware_class", "Unknown")
                why = epi.get("hardware_why", "")
                lag_s = epi.get("lag_seconds", np.nan)
                sat   = epi.get("saturation_score", np.nan)
                drift = epi.get("drift_score", np.nan)
                vibe  = epi.get("vibe_score", np.nan)
                txt = (
                    f"hardware: {hw}\n"
                    f"why: {why}\n"
                    f"lag_seconds: {lag_s:.4f}  |  saturation: {sat:.3f}  |  drift: {drift:.3f}  |  vibe: {vibe:.3f}"
                )
                plt.gcf().text(0.02, 0.02, txt, ha="left", va="bottom", fontsize=9)
                plt.tight_layout()
                pdf.savefig(); plt.close()


# =========================================================
# Config (defaults or JSON)
# =========================================================
def default_config() -> dict:
    return {
        "io": {
            "input_folder": "./Datasets/Datasets",
            "residual_folder": "./Anomaly_detection/residual_created/",
            "output_folder": "./Anomaly_detection/code/outputs/"
        },
        "residuals": {
            "enabled": True,
            "demand_token": "Demand",
            "measured_token": "Measured",
            "residual_token": "Residual",
            "suffix": "_residual"
        },
        "features": {
            "window": 5,
            "max_features": 500
        },
        "threshold": {
            "k": 3.5
        },
        "ae": {
            "epochs": 50,
            "lr": 0.001
        },
        "lstm": {
            "seq_len": 5,
            "hidden_dim": 64,
            "patience": 5,
            "max_sequences": 3000,
            "downsample": 5
        },
        "lof": {
            "n_neighbors": 20
        },
        "hybrid": {  # Hybrid scoring config
            "enabled": True,
            "method": "robust_z",      # "robust_z" | "percentile"
            "min_components": 2,       # require at least N model scores present
            "weights": {               # relative importance (doesn't need to sum to 1)
                "iso_score": 0.20,
                "lof_score": 0.20,
                "ae_error": 0.30,
                "lstm_error": 0.30
            }
        },
        "hybrid_threshold": {          # How to threshold hybrid_score
            "mode": "quantile",        # "mad" or "quantile"
            "k": 3.5,                  # used only if mode="mad"
            "quantile": 0.99           # top 1% as anomalies (fallback if MAD degenerates)
        },
        "voting": {
            "rule": "vote_3plus",    # "vote_3plus" | "agreement_all_4" | "any"
            "min_gap": 1
        },
        "plots": {
            "enabled": True,
            "max_files": None,
            "emit_rate_plot": True     # also write an anomaly RATE bar chart (% rows)
        },
        "runtime": {
            "use_float32": True
        },
        "signals": {
            "sample_rate_hz": 100.0,     # set None if unknown
            "residual_token": "Residual",
            "demand_token": "Demand",
            "measured_token": "Measured"
        },
        "scores": {
            "saturation_pct": 95.0,
            "resid_prominence_pct": 95.0,
            "min_window_len": 5
        },
        "report": {
            "enabled": True,
            "top_n_episodes": 3
        }
    }


def load_config_from_path_or_default(path: Optional[str]) -> dict:
    if path and os.path.exists(path):
        with open(path, "r") as f:
            return json.load(f)
    print("‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.")
    return default_config()


# =========================================================
# Per-file processing & Pipeline
# =========================================================
def process_file(file_path: str, cfg: Dict, logger=print) -> Optional[pd.DataFrame]:
    df = pd.read_csv(file_path)
    file_name = os.path.basename(file_path).replace(".csv", "")

    residual_cols = [c for c in df.columns if "Residual" in c]
    if not residual_cols:
        logger(f"‚ùå Skipped {file_name}: No residuals found.")
        return None

    X, feature_cols, fe_stats = prepare_features(
        df, residual_cols,
        window=cfg["features"]["window"],
        max_features=cfg["features"]["max_features"],
        logger=logger,
    )
    if X is None or len(feature_cols) == 0 or X.empty:
        logger(f"‚ùå Skipped {file_name}: invalid or empty features")
        return None

    _, X_scaled, X_tensor = scale_features(X, use_float32=cfg["runtime"]["use_float32"])

    iso_labels, iso_scores, iso_thr = isolation_forest_detect(X_scaled, k=cfg["threshold"]["k"])
    df.loc[X.index, "is_anomaly"] = iso_labels
    df.loc[X.index, "iso_score"] = iso_scores
    df.loc[X.index, "iso_thr"] = iso_thr

    ae_labels, ae_errors, ae_thr = dense_autoencoder_detect(
        X_tensor, k=cfg["threshold"]["k"], ae_epochs=cfg["ae"]["epochs"], ae_lr=cfg["ae"]["lr"]
    )
    df.loc[X.index, "ae_is_anomaly"] = ae_labels
    df.loc[X.index, "ae_error"] = ae_errors
    df.loc[X.index, "ae_thr"] = ae_thr

    lof_labels, lof_scores, lof_thr = lof_detect(
        X_scaled, k=cfg["threshold"]["k"], n_neighbors=cfg["lof"]["n_neighbors"]
    )
    df.loc[X.index, "lof_is_anomaly"] = lof_labels
    df.loc[X.index, "lof_score"] = lof_scores
    df.loc[X.index, "lof_thr"] = lof_thr

    lstm_labels, lstm_errors, lstm_idx, lstm_thr = lstm_autoencoder_detect(
        X_scaled,
        k=cfg["threshold"]["k"],
        seq_len=cfg["lstm"]["seq_len"],
        hidden_dim=cfg["lstm"]["hidden_dim"],
        patience=cfg["lstm"]["patience"],
        max_sequences=cfg["lstm"]["max_sequences"],
        downsample=cfg["lstm"]["downsample"],
    )
    if len(lstm_idx) > 0:
        df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
        df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
        df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
    else:
        df["lstm_is_anomaly"] = 0
        df["lstm_error"] = np.nan
        df["lstm_thr"] = np.nan

    # --- Hybrid score (weighted fusion on valid rows)
    mask_idx = X.index
    df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)

    hs = df.loc[mask_idx, "hybrid_score"].to_numpy()
    if np.isnan(hs).all():
        df.loc[mask_idx, "hybrid_is_anomaly"] = 0
        df.loc[mask_idx, "hybrid_thr"] = np.nan
    else:
        mode = cfg.get("hybrid_threshold", {}).get("mode", "mad")
        if mode == "quantile":
            q = float(cfg["hybrid_threshold"].get("quantile", 0.98))
            thr = np.nanpercentile(hs, 100 * q)
            labels = (hs > thr).astype(int)
        else:
            thr, labels = robust_threshold(hs, k=cfg["hybrid_threshold"].get("k", 3.5), tail="high")
            # Fallback if too many positives (MAD degenerate)
            if np.nanmean(labels) > 0.5:
                q = float(cfg["hybrid_threshold"].get("quantile", 0.98))
                thr = np.nanpercentile(hs, 100 * q)
                labels = (hs > thr).astype(int)
        df.loc[mask_idx, "hybrid_is_anomaly"] = labels
        df.loc[mask_idx, "hybrid_thr"] = thr

    df = generate_votes(df)  # adds vote_3plus + vote_any
    df["source_file"] = file_name
    df["fe_reused"] = fe_stats.get("reused", 0)
    df["fe_generated"] = fe_stats.get("generated", 0)

    logger(
        f"[{file_name}] iso={int(df['is_anomaly'].sum())} | "
        f"ae={int(df['ae_is_anomaly'].sum())} | "
        f"lof={int(df['lof_is_anomaly'].sum())} | "
        f"lstm={int(df['lstm_is_anomaly'].fillna(0).sum())} | "
        f"hyb={int(df['hybrid_is_anomaly'].sum())} | "
        f"vote3+={int(df['vote_3plus'].sum())} | any={int(df['vote_any'].sum())}"
    )
    return df


def run_pipeline(cfg: Dict):
    logger = print

    # A) residuals (optional)
    if cfg["residuals"]["enabled"]:
        logger("üîß Creating residuals...")
        create_residuals_for_folder(
            in_folder=cfg["io"]["input_folder"],
            out_folder=cfg["io"]["residual_folder"],
            demand_token=cfg["residuals"]["demand_token"],
            measured_token=cfg["residuals"]["measured_token"],
            residual_token=cfg["residuals"]["residual_token"],
            skip_if_exists=True,
            suffix=cfg["residuals"]["suffix"],
            logger=logger,
        )
        data_folder = cfg["io"]["residual_folder"]
    else:
        data_folder = cfg["io"]["input_folder"]

    # B) per-file
    all_dfs = []
    for file in os.listdir(data_folder):
        if file.endswith(".csv"):
            out = process_file(os.path.join(data_folder, file), cfg, logger=logger)
            if out is not None:
                all_dfs.append(out)

    if not all_dfs:
        logger("‚ùå No files processed.")
        return

    combined = pd.concat(all_dfs, ignore_index=True)
    ensure_dir(cfg["io"]["output_folder"])

    combined_path = os.path.join(cfg["io"]["output_folder"], "combined_anomaly_results.csv")
    combined.to_csv(combined_path, index=False)

    # Summary (counts)
    cols = ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus","vote_any"]
    cols = [c for c in cols if c in combined.columns]
    summary = combined.groupby("source_file")[cols].sum()
    summary["total_anomalies"] = summary.sum(axis=1)
    summary_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_summary.csv")
    summary.to_csv(summary_path)

    logger(f"‚úÖ Saved row-level: {combined_path}")
    logger(f"‚úÖ Saved summary:   {summary_path}")

    # C) Counts plot (dynamic cols incl. Hybrid + Plain voting)
    plt.figure(figsize=(12, 6))
    plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus","vote_any"] if c in summary.columns]
    summary[plot_cols].plot(kind="bar", figsize=(12, 6))
    plt.title("Anomalies per Model per File")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    bar_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_plot.png")
    plt.savefig(bar_path); plt.close()
    logger(f"üñºÔ∏è Saved: {bar_path}")

    # C2) Rate plot (percent of rows) for apples-to-apples comparison
    if cfg.get("plots", {}).get("emit_rate_plot", True):
        sizes = combined.groupby("source_file").size().rename("n_rows")
        summary_rates = summary.div(sizes, axis=0) * 100.0
        plt.figure(figsize=(12, 6))
        rate_cols = [c for c in plot_cols if c in summary_rates.columns]
        summary_rates[rate_cols].plot(kind="bar", figsize=(12, 6))
        plt.title("Anomaly RATE per Model per File (%)")
        plt.ylabel("Percent of rows (%)")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        rate_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_rate_plot.png")
        plt.savefig(rate_path); plt.close()
        logger(f"üñºÔ∏è Saved: {rate_path}")

    # D) Voted rows + episodes + reasons
    voted_rows = extract_voted_rows(combined, rule=cfg["voting"]["rule"])
    voted_dir = os.path.join(cfg["io"]["output_folder"], "voted_outputs")
    ensure_dir(voted_dir)
    voted_rows_path = os.path.join(voted_dir, "voted_anomalies_rows.csv")
    voted_rows.to_csv(voted_rows_path, index=False)

    episodes = summarize_episodes(voted_rows, min_gap=cfg["voting"]["min_gap"])
    episodes_path = os.path.join(voted_dir, "voted_anomaly_episodes.csv")
    episodes.to_csv(episodes_path, index=False)

    episodes_with_reasons = attach_episode_reasons(combined, episodes, top_k=1)
    episodes_with_reasons = enrich_hardware_mapping(episodes_with_reasons)
    episodes_scored = score_episodes(combined, episodes_with_reasons, cfg)

    episodes_reason_path = os.path.join(voted_dir, "voted_anomaly_episodes_with_reasons.csv")
    episodes_scored_path = os.path.join(voted_dir, "voted_anomaly_episodes_with_reasons_and_scores.csv")
    episodes_with_reasons.to_csv(episodes_reason_path, index=False)
    episodes_scored.to_csv(episodes_scored_path, index=False)
    logger(f"‚úÖ Saved episodes+reason: {episodes_reason_path}")
    logger(f"‚úÖ Saved episodes+scores: {episodes_scored_path}")

    # E) Per-file plots with voted overlays (optional)
    if cfg["plots"]["enabled"]:
        _ = plot_all_files(
            combined_df=combined,
            out_dir=voted_dir,
            rule=cfg["voting"]["rule"],
            min_gap=cfg["voting"]["min_gap"],
            max_files=cfg["plots"]["max_files"],
        )

    # F) Sensor table + clustering visuals
    sensor_df = build_sensor_table(combined, voted_rows, episodes_with_reasons=episodes_with_reasons)
    sensor_df_path = os.path.join(voted_dir, "sensor_table.csv")
    sensor_df.to_csv(sensor_df_path, index=False)
    logger(f"‚úÖ Saved sensor table: {sensor_df_path}")

    clustered, Z2, centers2 = cluster_sensors(sensor_df, n_clusters=3, random_state=42)
    _ = plot_sensor_clusters_scatter(clustered, Z2, centers2, out_dir=voted_dir)
    _ = plot_sensor_heatmap(sensor_df, out_dir=voted_dir)
    _ = plot_sensor_bar_top(sensor_df, out_dir=voted_dir, metric="episodes_as_primary", top_n=15)

    # G) PDF report
    if cfg.get("report", {}).get("enabled", True):
        pdf_path = os.path.join(cfg["io"]["output_folder"], "ops_report.pdf")
        build_ops_report(
            combined=combined,
            summary=summary,
            sensor_df=sensor_df,
            episodes_scored=episodes_scored,
            cfg=cfg,
            out_pdf_path=pdf_path
        )
        logger(f"üìÑ Ops report saved: {pdf_path}")


# =========================================================
# Entrypoint
# =========================================================
def main():
    parser = argparse.ArgumentParser(description="Anomaly Detection Product")
    parser.add_argument("--config", type=str, default=None, help="Path to config JSON")
    args, _ = parser.parse_known_args()  # allows notebook execution

    cfg = load_config_from_path_or_default(args.config)
    run_pipeline(cfg)


if __name__ == "__main__":
    main()


‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.
üîß Creating residuals...
‚Ü©Ô∏è  Skip residual (exists): Dataset01_Ski_CrossbeamYawNotPerforming_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset02_Matrix_Rocker4EncoderNotWorking_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset03_Wushu_YawTrapezoidNormal_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset04_Wushu_YawWaveletSqueak_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset05_Wushu_LaneChanges_ModelBump_residual.csv
‚ùå Failed to read Dataset07_Demo_Spa_GT.csv: No columns to parse from file
‚Ü©Ô∏è  Skip residual (exists): Dataset08_Demo_Jiggler_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset09_Demo_VerticalChirp_residual.csv
‚ùå Failed to read Dataset10_Demo_MillbrookHills.csv: No columns to parse from file


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # plain voting (>=1)
  df["source_file"]

[Dataset01_Ski_CrossbeamYawNotPerforming_residual] iso=6081 | ae=12267 | lof=1724 | lstm=221 | hyb=1802 | vote3+=216 | any=14816
‚ùå Skipped Dataset02_Matrix_Rocker4EncoderNotWorking_residual: No residuals found.


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # plain voting (>=1)
  df["source_file"]

[Dataset03_Wushu_YawTrapezoidNormal_residual] iso=4353 | ae=16319 | lof=2735 | lstm=138 | hyb=3815 | vote3+=338 | any=20169


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # plain voting (>=1)
  df["source_file"]

[Dataset04_Wushu_YawWaveletSqueak_residual] iso=5912 | ae=5620 | lof=1179 | lstm=728 | hyb=190 | vote3+=662 | any=7009


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["vote_any"] = (df["num_votes"] >= 1).astype(int)  # plain voting (>=1)
  df["source_file"]

[Dataset05_Wushu_LaneChanges_ModelBump_residual] iso=7951 | ae=8558 | lof=1177 | lstm=334 | hyb=1312 | vote3+=118 | any=11125
‚ùå Skipped Dataset08_Demo_Jiggler_residual: No residuals found.
‚ùå Skipped Dataset09_Demo_VerticalChirp_residual: No residuals found.
‚úÖ Saved row-level: ./Anomaly_detection/code/outputs/combined_anomaly_results.csv
‚úÖ Saved summary:   ./Anomaly_detection/code/outputs/model_comparison_summary.csv
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/model_comparison_plot.png
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/model_comparison_rate_plot.png
‚úÖ Saved episodes+reason: ./Anomaly_detection/code/outputs/voted_outputs\voted_anomaly_episodes_with_reasons.csv
‚úÖ Saved episodes+scores: ./Anomaly_detection/code/outputs/voted_outputs\voted_anomaly_episodes_with_reasons_and_scores.csv
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/voted_outputs\voted_plot_Dataset01_Ski_CrossbeamYawNotPerforming_residual.png
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/voted_



üìÑ Ops report saved: ./Anomaly_detection/code/outputs/ops_report.pdf


<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1100x600 with 0 Axes>

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Anomaly Detection Product (single script) ‚Äî Hybrid + Episode fix (no vote_any)
- Residual creation (Demand - Measured -> Residual)
- Feature engineering (reuse if already present)
- Scaling once -> shared across models
- Models: IsolationForest, LOF, Dense AE, LSTM AE
- Dynamic thresholds (MAD); Hybrid: MAD or quantile with fallback
- Voting (3+) + episodes (merged runs, grouped per file ‚úÖ)
- Robust overlay plots with context padding (fixes blank episode pages ‚úÖ)
- Episode explanations + hardware mapping + root-cause scoring
- Sensor ranking, clustering & heatmap
- Multi-page PDF Ops Report (dynamic first page)
- Emits a RATE plot (% rows) for fair cross-file comparison
"""

import os
import json
import argparse
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Torch
import torch
import torch.nn as nn
import torch.optim as optim

# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


# =========================================================
# Utils
# =========================================================
def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def safe_name(name: str) -> str:
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in str(name))


# =========================================================
# Residual creation (optional)
# =========================================================
def create_residuals_for_folder(
    in_folder: str,
    out_folder: str,
    demand_token: str = "Demand",
    measured_token: str = "Measured",
    residual_token: str = "Residual",
    skip_if_exists: bool = True,
    suffix: str = "_residual",
    logger=print,
) -> None:
    ensure_dir(out_folder)
    for file in os.listdir(in_folder):
        if not file.endswith(".csv"):
            continue

        in_path = os.path.join(in_folder, file)
        out_name = file.replace(".csv", f"{suffix}.csv")
        out_path = os.path.join(out_folder, out_name)

        if skip_if_exists and os.path.exists(out_path):
            logger(f"‚Ü©Ô∏è  Skip residual (exists): {out_name}")
            continue

        try:
            df = pd.read_csv(in_path)
        except Exception as e:
            logger(f"‚ùå Failed to read {file}: {e}")
            continue

        cols = df.columns.tolist()
        made_any = False
        for col in cols:
            if demand_token in col:
                measured_col = col.replace(demand_token, measured_token)
                if measured_col in df.columns:
                    residual_col = col.replace(demand_token, residual_token)
                    df[residual_col] = df[col] - df[measured_col]
                    made_any = True

        if not made_any:
            logger(f"‚ö†Ô∏è  No Demand/Measured pairs found in {file}.")
        df.to_csv(out_path, index=False)
        logger(f"‚úÖ Residual CSV saved: {os.path.basename(out_path)}")


# =========================================================
# Scaling + robust threshold (MAD)
# =========================================================
def scale_features(X: pd.DataFrame, use_float32: bool = True):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    if use_float32:
        X_scaled = X_scaled.astype("float32")
    X_tensor = torch.from_numpy(X_scaled)
    return scaler, X_scaled, X_tensor


def robust_threshold(
    values: np.ndarray,
    k: float = 3.5,
    tail: str = "high",
    min_anoms: int = 5,
) -> Tuple[float, np.ndarray]:
    v = np.asarray(values)
    mask = ~np.isnan(v)
    v = v[mask]
    if v.size == 0:
        return (np.inf if tail == "high" else -np.inf), np.zeros_like(values, dtype=int)

    med = np.median(v)
    mad = np.median(np.abs(v - med)) + 1e-12
    if tail == "high":
        thr = med + k * 1.4826 * mad
        labels = (values > thr).astype(int)
    else:
        thr = med - k * 1.4826 * mad
        labels = (values < thr).astype(int)

    # relax if too strict on large arrays
    if labels.sum() < min_anoms and v.size >= 100:
        for k_relax in (3.0, 2.5, 2.0):
            if tail == "high":
                thr = med + k_relax * 1.4826 * mad
                labels = (values > thr).astype(int)
            else:
                thr = med - k_relax * 1.4826 * mad
                labels = (values < thr).astype(int)
            if labels.sum() >= min_anoms:
                break

    return thr, labels


# =========================================================
# Feature Engineering
# =========================================================
def prepare_features(
    df: pd.DataFrame,
    residual_cols: List[str],
    window: int = 5,
    max_features: int = 500,
    logger=print,
) -> Tuple[pd.DataFrame, List[str], Dict[str, int]]:
    already_done = any(f"{residual_cols[0]}_delta" in df.columns for _ in residual_cols)
    stats = {"reused": 0, "generated": 0}

    if already_done:
        feature_cols = [
            c for c in df.columns
            if any(k in c for k in ["Residual", "_delta", "_rolling_mean", "_rolling_std"])
        ]
        X = df[feature_cols].dropna()
        stats["reused"] = len(feature_cols)
        logger(f"üîÅ Reusing {len(feature_cols)} engineered features.")
        return X, feature_cols, stats

    # Generate
    for col in residual_cols:
        df[f"{col}_delta"] = df[col].diff()
        df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
        df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()

    feature_cols = []
    for col in residual_cols:
        feature_cols += [
            col,
            f"{col}_delta",
            f"{col}_rolling_mean_{window}",
            f"{col}_rolling_std_{window}",
        ]

    X = df[feature_cols].dropna()
    stats["generated"] = len(feature_cols)
    logger(f"üõ†Ô∏è  Generated {len(feature_cols)} features (window={window}).")

    if X.shape[1] > max_features:
        logger(f"‚ùå Too many features ({X.shape[1]} > {max_features}). Skipping file.")
        return pd.DataFrame(), [], stats

    return X, feature_cols, stats


# =========================================================
# Models
# =========================================================
class Autoencoder(nn.Module):
    def __init__(self, input_dim: int):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, 32), nn.ReLU(), nn.Linear(32, 8))
        self.decoder = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, input_dim))

    def forward(self, x):
        return self.decoder(self.encoder(x))


def dense_autoencoder_detect(
    X_tensor: torch.Tensor, k: float, ae_epochs: int, ae_lr: float
) -> Tuple[np.ndarray, np.ndarray, float]:
    model = Autoencoder(X_tensor.shape[1])
    opt = optim.Adam(model.parameters(), lr=ae_lr)
    crit = nn.MSELoss()

    for _ in range(ae_epochs):
        opt.zero_grad()
        out = model(X_tensor)
        loss = crit(out, X_tensor)
        loss.backward()
        opt.step()

    with torch.no_grad():
        rec = model(X_tensor)
        errors = torch.mean((X_tensor - rec) ** 2, dim=1).cpu().numpy()

    thr, labels = robust_threshold(errors, k=k, tail="high")
    return labels.astype(int), errors, thr


class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True)

    def forward(self, x):
        _, (h, _) = self.encoder(x)  # [1, B, H]
        repeated = h.repeat(x.size(1), 1, 1).transpose(0, 1)  # [B, T, H]
        decoded, _ = self.decoder(repeated)
        return decoded


def make_sequences(X: np.ndarray, seq_len: int) -> Tuple[np.ndarray, List[int]]:
    seqs, idxs = [], []
    for i in range(len(X) - seq_len):
        seqs.append(X[i:i+seq_len])
        idxs.append(i + seq_len - 1)
    return np.array(seqs), idxs


def lstm_autoencoder_detect(
    X_scaled: np.ndarray,
    k: float,
    seq_len: int,
    hidden_dim: int,
    patience: int,
    max_sequences: int,
    downsample: int,
) -> Tuple[np.ndarray, np.ndarray, List[int], float]:
    try:
        Xds = X_scaled[::downsample]
        if len(Xds) < seq_len:
            return np.array([]), np.array([]), [], np.nan

        Xseq, idxs = make_sequences(Xds, seq_len)
        if len(Xseq) > max_sequences:
            Xseq, idxs = Xseq[:max_sequences], idxs[:max_sequences]

        Xt = torch.tensor(Xseq, dtype=torch.float32)
        model = LSTMAutoencoder(Xt.shape[2], hidden_dim)
        opt = optim.Adam(model.parameters(), lr=1e-3)
        crit = nn.MSELoss()

        best, wait = float("inf"), 0
        for _ in range(100):
            model.train()
            opt.zero_grad()
            out = model(Xt)
            loss = crit(out, Xt)
            loss.backward()
            opt.step()
            if loss.item() < best:
                best, wait = loss.item(), 0
            else:
                wait += 1
                if wait >= patience:
                    break

        with torch.no_grad():
            model.eval()
            out = model(Xt)
            errors = torch.mean((Xt - out) ** 2, dim=(1, 2)).cpu().numpy()

        thr, labels = robust_threshold(errors, k=k, tail="high")
        return labels.astype(int), errors, idxs, thr
    except RuntimeError as e:
        print(f"‚ö†Ô∏è LSTM memory error: {e}")
        return np.array([]), np.array([]), [], np.nan


def isolation_forest_detect(X_scaled: np.ndarray, k: float) -> Tuple[np.ndarray, np.ndarray, float]:
    iso = IsolationForest(contamination="auto", n_estimators=300, random_state=42)
    iso.fit(X_scaled)
    scores = -iso.decision_function(X_scaled)  # higher = more anomalous
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


def lof_detect(X_scaled: np.ndarray, k: float, n_neighbors: int) -> Tuple[np.ndarray, np.ndarray, float]:
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination="auto")
    _ = lof.fit_predict(X_scaled)  # populates negative_outlier_factor_
    scores = -lof.negative_outlier_factor_
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


# =========================================================
# Hybrid scoring utilities
# =========================================================
def _robust_z_pos(x: np.ndarray) -> np.ndarray:
    """Right-tail robust z-score (>=0 when above median)."""
    x = np.asarray(x, dtype=float)
    med = np.nanmedian(x)
    mad = np.nanmedian(np.abs(x - med)) + 1e-12
    z = (x - med) / (1.4826 * mad)
    z = np.where(np.isnan(z), np.nan, z)
    return np.maximum(z, 0.0)  # only right tail counts as anomalous


def _percentile01(x: np.ndarray) -> np.ndarray:
    """Map to [0,1] by robust percentiles (2‚Äì98). Values outside clamp."""
    x = np.asarray(x, dtype=float)
    lo = np.nanpercentile(x, 2)
    hi = np.nanpercentile(x, 98)
    rng = max(hi - lo, 1e-12)
    y = (x - lo) / rng
    return np.clip(y, 0.0, 1.0)


def compute_hybrid_score_on_mask(df: pd.DataFrame, cfg: dict, mask_idx) -> np.ndarray:
    """
    Compute hybrid only on valid rows (mask_idx). Returns array the size of df,
    NaN elsewhere. Requires >= min_components present.
    """
    out = np.full(len(df), np.nan)
    if not cfg.get("hybrid", {}).get("enabled", False):
        return out

    # Weights guard
    wmap = cfg.get("hybrid", {}).get("weights")
    if not isinstance(wmap, dict) or not wmap:
        wmap = {"iso_score": 0.25, "lof_score": 0.25, "ae_error": 0.25, "lstm_error": 0.25}

    method = cfg["hybrid"].get("method", "robust_z")
    min_components = int(cfg["hybrid"].get("min_components", 2))

    use = df.loc[mask_idx]  # restrict to valid feature rows

    comps = [c for c in ["iso_score", "lof_score", "ae_error", "lstm_error"] if c in use.columns and c in wmap]
    if not comps:
        return out

    parts = []
    for c in comps:
        arr = use[c].to_numpy(dtype=float)
        if method == "robust_z":
            norm = _robust_z_pos(arr)
            norm = np.clip(norm, 0, 10.0) / 10.0  # compress extreme tails to ~[0,1]
        else:
            norm = _percentile01(arr)
        parts.append((norm, float(wmap[c])))

    num = np.zeros(len(use), dtype=float)
    den = np.zeros(len(use), dtype=float)
    present = np.zeros(len(use), dtype=int)

    for norm, w in parts:
        m = ~np.isnan(norm)
        num[m] += w * norm[m]
        den[m] += w
        present[m] += 1

    hybrid_local = np.where((den > 0) & (present >= min_components), num / den, np.nan)
    out[np.asarray(mask_idx)] = hybrid_local
    return out


# =========================================================
# Voting, episodes, explanations
# =========================================================
def generate_votes(df: pd.DataFrame) -> pd.DataFrame:
    df["agreement_all_4"] = (
        (df.get("ae_is_anomaly", 0) == 1)
        & (df.get("is_anomaly", 0) == 1)
        & (df.get("lof_is_anomaly", 0) == 1)
        & (df.get("lstm_is_anomaly", 0) == 1)
    ).astype(int)
    df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
    df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
    return df  # NOTE: vote_any removed


def extract_voted_rows(df: pd.DataFrame, rule: str = "vote_3plus") -> pd.DataFrame:
    if rule == "vote_3plus":
        mask = df["vote_3plus"] == 1
    elif rule == "agreement_all_4":
        mask = df["agreement_all_4"] == 1
    elif rule == "any":
        mask = (
            (df["ae_is_anomaly"] == 1)
            | (df["is_anomaly"] == 1)
            | (df["lof_is_anomaly"] == 1)
            | (df["lstm_is_anomaly"] == 1)
        )
    else:
        raise ValueError(f"Unknown rule: {rule}")
    return df.loc[mask].copy()


def _group_runs(idxs: np.ndarray, min_gap: int = 1) -> List[Tuple[int, int]]:
    if len(idxs) == 0:
        return []
    runs, start, prev = [], int(idxs[0]), int(idxs[0])
    for i in idxs[1:]:
        if int(i) - prev <= min_gap:
            prev = int(i)
            continue
        runs.append((start, prev))
        start = int(i); prev = int(i)
    runs.append((start, prev))
    return runs


def summarize_episodes(voted_df: pd.DataFrame, min_gap: int = 1) -> pd.DataFrame:
    """
    Build episodes PER FILE (avoids cross-file index slices).
    """
    if voted_df.empty:
        cols = ["source_file", "start_idx", "end_idx", "length", "n_models_mean"]
        return pd.DataFrame(columns=cols)

    rows = []
    if "source_file" in voted_df.columns:
        groups = voted_df.groupby("source_file")
    else:
        groups = [("", voted_df)]

    for sf, g in groups:
        idxs = g.index.to_numpy()
        if idxs.size == 0:
            continue
        runs = _group_runs(idxs, min_gap=min_gap)
        for start, end in runs:
            chunk = g.loc[start:end]
            row = {
                "source_file": sf,
                "start_idx": int(start),
                "end_idx": int(end),
                "length": int(end - start + 1),
                "n_models_mean": float(
                    chunk[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1).mean()
                ),
            }
            for c in ["iso_score", "ae_error", "lof_score", "lstm_error", "hybrid_score"]:
                if c in chunk.columns:
                    row[f"{c}_max"] = float(chunk[c].max())
                    row[f"{c}_mean"] = float(chunk[c].mean())
            rows.append(row)

    return pd.DataFrame(rows)


def _base_residual_columns(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def _models_string(chunk: pd.DataFrame) -> str:
    model_cols = [c for c in ["is_anomaly", "ae_is_anomaly", "lof_is_anomaly", "lstm_is_anomaly", "hybrid_is_anomaly"] if c in chunk.columns]
    if not model_cols:
        return "no-model-flags"
    means = chunk[model_cols].mean()
    active = [m.replace("_is_anomaly", "").upper() for m, v in means.items() if v >= 0.5]
    return ", ".join(active) if active else "weak/isolated flags"


def attach_episode_reasons(
    combined_df: pd.DataFrame, episodes_df: pd.DataFrame, top_k: int = 1
) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df

    base_res = _base_residual_columns(combined_df)
    if not base_res:
        episodes_df["primary_signal"] = ""
        episodes_df["reason"] = "no residual columns present"
        episodes_df["suspected_sensor"] = ""
        return episodes_df

    out = []
    for _, epi in episodes_df.iterrows():
        start, end = int(epi["start_idx"]), int(epi["end_idx"])
        mask = combined_df["source_file"] == epi["source_file"] if "source_file" in combined_df.columns else slice(None)
        chunk = combined_df.loc[mask].loc[start:end]

        if chunk.empty:
            epi["primary_signal"] = ""
            epi["reason"] = "empty slice"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats = []
        for col in base_res:
            if col in chunk.columns:
                stats.append((col, float(chunk[col].abs().max())))
        if not stats:
            epi["primary_signal"] = ""
            epi["reason"] = "no residual stats"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats.sort(key=lambda x: x[1], reverse=True)
        primary_signal, primary_val = stats[:top_k][0]
        models_str = _models_string(chunk)
        measured_col = primary_signal.replace("Residual", "Measured")
        suspected = measured_col if (measured_col in combined_df.columns) else "unknown-measured-sensor"

        epi["primary_signal"] = primary_signal
        epi["reason"] = f"max |{primary_signal}| = {primary_val:.3f}; models: {models_str}"
        epi["suspected_sensor"] = suspected
        out.append(epi)

    return pd.DataFrame(out)


# =========================================================
# Hardware mapping + root cause scoring
# =========================================================
HARDWARE_MAP = [
    ("Force_",         "Actuator/LoadCell",  "Force didn‚Äôt follow demand ‚Üí friction/lag/saturation/load-cell drift likely"),
    ("Encoder_",       "Encoders/Alignment", "Pose/velocity mismatch ‚Üí quantization/missing counts/misalignment"),
    ("Accelerometer_", "IMU/Accelerometer",  "Vibration bursts ‚Üí mounting/looseness/thermal drift"),
    ("State_",         "Control/Timing",     "Requested vs achieved state diverged ‚Üí scheduler limits/controller windup"),
]

def map_signal_to_hardware(primary_signal: str):
    for needle, hw, why in HARDWARE_MAP:
        if needle in primary_signal:
            return hw, why
    return "Unknown", "No mapping rule matched"


def enrich_hardware_mapping(episodes_df: pd.DataFrame) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df
    episodes_df = episodes_df.copy()
    episodes_df["hardware_class"] = ""
    episodes_df["hardware_why"] = ""
    for i, r in episodes_df.iterrows():
        hw, why = map_signal_to_hardware(r.get("primary_signal", ""))
        episodes_df.at[i, "hardware_class"] = hw
        episodes_df.at[i, "hardware_why"]   = why
    return episodes_df


def _paired_columns(primary_signal: str, cfg: dict) -> Tuple[Optional[str], Optional[str]]:
    resid_tok  = cfg["signals"]["residual_token"]
    demand_tok = cfg["signals"]["demand_token"]
    measured_tok = cfg["signals"]["measured_token"]
    if resid_tok not in primary_signal:
        return None, None
    demand_col   = primary_signal.replace(resid_tok, demand_tok)
    measured_col = primary_signal.replace(resid_tok, measured_tok)
    return demand_col, measured_col


def _nan_ok(arr: np.ndarray) -> np.ndarray:
    return np.asarray(arr, dtype=float)


def _cross_correlation_lag(x: np.ndarray, y: np.ndarray, sample_rate_hz: Optional[float]) -> Tuple[float, int]:
    x = _nan_ok(x); y = _nan_ok(y)
    if len(x) != len(y) or len(x) == 0:
        return (np.nan, 0)
    x = x - np.nanmean(x); y = y - np.nanmean(y)
    x = np.nan_to_num(x);  y = np.nan_to_num(y)
    corr = np.correlate(x, y, mode="full")
    lags = np.arange(-len(x)+1, len(x))
    k = int(np.argmax(corr))
    lag_samples = int(lags[k])
    lag_seconds = lag_samples / sample_rate_hz if sample_rate_hz and sample_rate_hz > 0 else np.nan
    return (lag_seconds, lag_samples)


def _saturation_score(demand: np.ndarray, residual: np.ndarray, cfg: dict) -> float:
    if len(demand) == 0 or len(residual) == 0:
        return 0.0
    p_dem = np.nanpercentile(demand, cfg["scores"]["saturation_pct"])
    p_res = np.nanpercentile(np.abs(residual), cfg["scores"]["resid_prominence_pct"])
    near_limit = demand >= p_dem
    large_res  = np.abs(residual) >= p_res
    both = np.logical_and(near_limit, large_res)
    return float(np.nansum(both)) / max(1, len(demand))


def _drift_score(residual: np.ndarray) -> float:
    residual = _nan_ok(residual)
    mu = float(np.nanmean(residual))
    sd = float(np.nanstd(residual)) + 1e-9
    return abs(mu) / sd


def _vibration_score(signal: np.ndarray, sample_rate_hz: Optional[float]) -> float:
    if not sample_rate_hz or sample_rate_hz <= 0 or len(signal) < 8:
        return np.nan
    sig = np.nan_to_num(signal - np.nanmean(signal))
    fft = np.fft.rfft(sig)
    power = np.abs(fft) ** 2
    freqs = np.fft.rfftfreq(len(sig), d=1.0 / sample_rate_hz)
    if len(freqs) == 0:
        return np.nan
    cutoff = 0.25 * (sample_rate_hz / 2.0)
    mask_hi = freqs >= cutoff
    num = float(np.nansum(power[mask_hi]))
    den = float(np.nansum(power) + 1e-12)
    return num / den


def score_episodes(combined_df: pd.DataFrame, episodes_df: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df
    out = episodes_df.copy()
    sr = cfg["signals"]["sample_rate_hz"]
    min_len = cfg["scores"]["min_window_len"]

    if "primary_signal" not in out.columns:
        out["primary_signal"] = ""

    for i, r in out.iterrows():
        start, end = int(r["start_idx"]), int(r["end_idx"])
        if end - start + 1 < min_len:
            out.at[i, "lag_seconds"] = np.nan
            out.at[i, "lag_samples"] = 0
            out.at[i, "saturation_score"] = 0.0
            out.at[i, "drift_score"] = 0.0
            out.at[i, "vibe_score"] = np.nan
            continue

        if "source_file" in combined_df.columns and "source_file" in out.columns and "source_file" in r:
            chunk = combined_df.loc[(combined_df["source_file"] == r["source_file"])].loc[start:end]
        else:
            chunk = combined_df.loc[start:end]

        primary = r.get("primary_signal", "")
        demand_col, measured_col = _paired_columns(primary, cfg)

        resid = chunk[primary].values if (primary in chunk.columns) else np.array([])
        dem   = chunk[demand_col].values if (demand_col and demand_col in chunk.columns) else np.array([])
        meas  = chunk[measured_col].values if (measured_col and measured_col in chunk.columns) else np.array([])

        lag_s, lag_k = _cross_correlation_lag(dem, meas, sr) if (len(dem) and len(meas)) else (np.nan, 0)
        sat_sc = _saturation_score(dem, resid, cfg) if (len(dem) and len(resid)) else 0.0
        dr_sc  = _drift_score(resid) if len(resid) else 0.0
        if "Accelerometer_" in primary and primary in chunk.columns:
            vibe_sc = _vibration_score(chunk[primary].values, sr)
        else:
            vibe_sc = _vibration_score(resid, sr)

        out.at[i, "lag_seconds"]       = lag_s
        out.at[i, "lag_samples"]       = int(lag_k)
        out.at[i, "saturation_score"]  = float(sat_sc)
        out.at[i, "drift_score"]       = float(dr_sc)
        out.at[i, "vibe_score"]        = float(vibe_sc) if vibe_sc == vibe_sc else np.nan
    return out


# =========================================================
# Plotting helpers (per-file voted overlays)
# =========================================================
def _pick_residual(df: pd.DataFrame) -> Optional[str]:
    cand = [c for c in df.columns if "Residual" in c and not any(t in c for t in ["_delta", "_rolling_"])]
    return cand[0] if cand else None


def _slice_by_global_index(sub: pd.DataFrame, start: int, end: int, pad: int = 100) -> pd.DataFrame:
    """
    Robust slice of `sub` (filtered to one file) using global indices [start, end],
    expanded by `pad` points on both sides. Works even when index has gaps.
    """
    if sub.empty:
        return sub
    idx = sub.index.to_numpy()
    i0 = np.searchsorted(idx, start, side="left")
    i1 = np.searchsorted(idx, end,   side="right")
    i0 = max(i0 - pad, 0)
    i1 = min(i1 + pad, len(idx))
    return sub.iloc[i0:i1]


def plot_voted_for_file(
    df_file: pd.DataFrame,
    out_dir: str,
    rule: str,
    min_gap: int,
    figsize: Tuple[int, int] = (12, 5),
) -> Optional[str]:
    ensure_dir(out_dir)
    residual_col = _pick_residual(df_file)
    if residual_col is None:
        print("‚ö†Ô∏è No residual column to plot.")
        return None

    voted_rows = extract_voted_rows(df_file, rule=rule)
    episodes = summarize_episodes(voted_rows, min_gap=min_gap)

    plt.figure(figsize=figsize)
    plt.plot(df_file.index, df_file[residual_col], label=residual_col, alpha=0.85)

    if not voted_rows.empty:
        plt.scatter(voted_rows.index, voted_rows[residual_col], s=12, label=f"Voted anomalies ({rule})")

    if not episodes.empty:
        for _, r in episodes.iterrows():
            plt.axvspan(r["start_idx"], r["end_idx"], alpha=0.15, label="Episode")
        handles, labels = plt.gca().get_legend_handles_labels()
        uniq, seen = [], set()
        for h, l in zip(handles, labels):
            if l not in seen:
                uniq.append((h, l)); seen.add(l)
        handles, labels = zip(*uniq)
        plt.legend(handles, labels)
    else:
        plt.legend()

    sf = df_file["source_file"].iloc[0] if "source_file" in df_file.columns else "file"
    plt.title(f"{sf} ‚Äî Residual with voted anomalies & episodes")
    plt.xlabel("Index"); plt.ylabel(residual_col)
    plt.tight_layout()
    out_path = os.path.join(out_dir, f"voted_plot_{safe_name(sf)}.png")
    plt.savefig(out_path, dpi=160); plt.close()
    return out_path


def plot_all_files(combined_df: pd.DataFrame, out_dir: str, rule: str, min_gap: int, max_files: Optional[int] = None):
    paths = []
    if "source_file" not in combined_df.columns:
        print("‚ö†Ô∏è combined_df missing 'source_file'.")
        return paths
    groups = list(combined_df.groupby("source_file"))
    if max_files is not None:
        groups = groups[:max_files]
    for fname, df_file in groups:
        p = plot_voted_for_file(df_file, out_dir=out_dir, rule=rule, min_gap=min_gap)
        if p:
            paths.append(p); print(f"üñºÔ∏è Saved: {p}")
    if not paths:
        print("‚ö†Ô∏è No plots produced.")
    return paths


# =========================================================
# Sensor attribution, clustering & heatmap
# =========================================================
def _residual_cols_base(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def build_sensor_table(
    combined: pd.DataFrame,
    voted_rows: pd.DataFrame,
    episodes_with_reasons: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    base_res = _residual_cols_base(combined)
    if not base_res:
        return pd.DataFrame()

    total_rows = len(combined)
    voted_mask = pd.Series(False, index=combined.index)
    if not voted_rows.empty:
        voted_mask.loc[voted_rows.index] = True

    rows = []
    expected_keys = [
        "anomaly_rate_is",
        "anomaly_rate_ae",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "anomaly_rate_hybrid",
        "anomaly_rate_vote3p",
        "mean_abs_resid_voted",
        "max_abs_resid_voted",
        "episodes_as_primary",
    ]

    for col in base_res:
        stats = {"sensor": col}

        stats["anomaly_rate_is"]   = float(combined["is_anomaly"].sum())   / max(total_rows, 1) if "is_anomaly"   in combined.columns else 0.0
        stats["anomaly_rate_ae"]   = float(combined["ae_is_anomaly"].sum())/ max(total_rows, 1) if "ae_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lof"]  = float(combined["lof_is_anomaly"].sum())/max(total_rows, 1) if "lof_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lstm"] = float(combined["lstm_is_anomaly"].fillna(0).sum())/max(total_rows, 1) if "lstm_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_hybrid"] = float(combined["hybrid_is_anomaly"].sum())/max(total_rows, 1) if "hybrid_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_vote3p"] = float(combined["vote_3plus"].sum())/max(total_rows, 1) if "vote_3plus" in combined.columns else 0.0

        if col in combined.columns and voted_mask.any():
            vals = combined.loc[voted_mask, col].abs()
            stats["mean_abs_resid_voted"] = float(vals.mean()) if not vals.empty else 0.0
            stats["max_abs_resid_voted"]  = float(vals.max())  if not vals.empty else 0.0
        else:
            stats["mean_abs_resid_voted"] = 0.0
            stats["max_abs_resid_voted"]  = 0.0

        if episodes_with_reasons is not None and not episodes_with_reasons.empty and "primary_signal" in episodes_with_reasons.columns:
            stats["episodes_as_primary"] = int((episodes_with_reasons["primary_signal"] == col).sum())
        else:
            stats["episodes_as_primary"] = 0

        for k in expected_keys:
            stats.setdefault(k, 0.0)

        rows.append(stats)

    sensor_df = pd.DataFrame(rows)
    for c in sensor_df.columns:
        if c != "sensor":
            sensor_df[c] = sensor_df[c].fillna(0.0)
    return sensor_df


def cluster_sensors(
    sensor_df: pd.DataFrame,
    n_clusters: int = 3,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    if sensor_df.empty or "sensor" not in sensor_df.columns:
        return sensor_df, np.empty((0, 2)), np.empty((0, 2))

    features = sensor_df.drop(columns=["sensor"]).to_numpy(dtype=np.float32)
    if features.shape[0] < n_clusters:
        n_clusters = max(1, features.shape[0])

    scaler = StandardScaler()
    Z = scaler.fit_transform(features)

    km = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
    labels = km.fit_predict(Z)

    pca = PCA(n_components=2, random_state=random_state)
    Z2 = pca.fit_transform(Z)
    centers2 = pca.transform(km.cluster_centers_)

    out = sensor_df.copy()
    out["cluster"] = labels

    return out, Z2, centers2


def plot_sensor_bar_top(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metric: str = "episodes_as_primary",
    top_n: int = 15,
    title: Optional[str] = None,
) -> Optional[str]:
    if sensor_df.empty or metric not in sensor_df.columns:
        return None

    ensure_dir(out_dir)
    df = sensor_df.sort_values(metric, ascending=False).head(top_n)

    plt.figure(figsize=(12, 6))
    plt.bar(range(len(df)), df[metric])
    plt.xticks(range(len(df)), [s.replace("Force_", "F_") for s in df["sensor"]], rotation=60, ha="right")
    plt.ylabel(metric)
    plt.title(title or f"Top {top_n} sensors by {metric}")
    plt.tight_layout()

    path = os.path.join(out_dir, f"top_sensors_{metric}.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_clusters_scatter(
    sensor_df_with_cluster: pd.DataFrame,
    Z2: np.ndarray,
    centers2: np.ndarray,
    out_dir: str,
    title: str = "Sensor clusters (PCA of features)",
) -> Optional[str]:
    if sensor_df_with_cluster.empty or Z2.size == 0:
        return None

    ensure_dir(out_dir)
    plt.figure(figsize=(9, 7))

    clusters = sorted(sensor_df_with_cluster["cluster"].unique().tolist())
    for cl in clusters:
        mask = sensor_df_with_cluster["cluster"] == cl
        pts = Z2[mask.values]
        plt.scatter(pts[:, 0], pts[:, 1], label=f"cluster {cl}", alpha=0.8, s=36)

    if centers2.size:
        plt.scatter(centers2[:, 0], centers2[:, 1], marker="X", s=120, label="centers")

    try:
        top_lab = sensor_df_with_cluster.sort_values("episodes_as_primary", ascending=False).head(10).index
        for idx in top_lab:
            plt.text(Z2[idx, 0], Z2[idx, 1], sensor_df_with_cluster.loc[idx, "sensor"], fontsize=8)
    except Exception:
        pass

    plt.title(title)
    plt.xlabel("PCA-1"); plt.ylabel("PCA-2")
    plt.legend()
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_clusters_pca.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_heatmap(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metrics: Optional[List[str]] = None,
    title: str = "Sensor anomaly fingerprint (rates & magnitudes)",
) -> Optional[str]:
    if sensor_df.empty:
        return None
    ensure_dir(out_dir)

    desired = [
        "anomaly_rate_vote3p",
        "anomaly_rate_hybrid",
        "anomaly_rate_ae",
        "anomaly_rate_is",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "mean_abs_resid_voted",
        "max_abs_resid_voted",
    ]
    if metrics is None:
        metrics = desired

    available = [m for m in metrics if m in sensor_df.columns]
    if not available:
        print("‚ö†Ô∏è No requested heatmap metrics are present in sensor_df. Skipping heatmap.")
        return None
    if len(available) < len(metrics):
        missing = [m for m in metrics if m not in sensor_df.columns]
        print(f"‚ÑπÔ∏è Skipping missing metrics in heatmap: {missing}")
    metrics = available

    key_rank = "episodes_as_primary" if "episodes_as_primary" in sensor_df.columns else metrics[0]
    keep = sensor_df.sort_values(key_rank, ascending=False).head(25)

    M = keep[metrics].to_numpy(dtype=np.float32)
    plt.figure(figsize=(12, 8))
    plt.imshow(M, aspect="auto")
    plt.colorbar()
    plt.yticks(range(len(keep)), keep["sensor"])
    plt.xticks(range(len(metrics)), metrics, rotation=45, ha="right")
    plt.title(title)
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_fingerprint_heatmap.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


# =========================================================
# Report (robust overlays with padding)
# =========================================================
def _overlay_episode_plot(df: pd.DataFrame, episode_row: pd.Series, cfg: dict, ax=None):
    start, end = int(episode_row["start_idx"]), int(episode_row["end_idx"])
    primary = episode_row.get("primary_signal", "")
    demand_col, measured_col = _paired_columns(primary, cfg)
    pad = int(cfg.get("report", {}).get("pad_points", 100))

    if ax is None:
        ax = plt.gca()

    # If df contains multiple files, filter to the right one first
    if "source_file" in df.columns and "source_file" in episode_row:
        df = df.loc[df["source_file"] == episode_row["source_file"]]

    # Robust slice by global indices + padding
    window = _slice_by_global_index(df, start, end, pad=pad)
    if window.empty:
        ax.set_title(f"Episode {start}‚Äì{end} (EMPTY SLICE)")
        return

    t = window.index.to_numpy()

    if primary in window.columns:
        ax.plot(t, window[primary].values, label=f"{primary}", alpha=0.9)
    if demand_col and demand_col in window.columns:
        ax.plot(t, window[demand_col].values, label=f"{demand_col}", alpha=0.8)
    if measured_col and measured_col in window.columns:
        ax.plot(t, window[measured_col].values, label=f"{measured_col}", alpha=0.8)

    # Shade the exact episode span inside the padded window
    ax.axvspan(start, end, alpha=0.15, label="episode window")

    ax.set_xlabel("Index")
    ax.set_title(f"Episode {start}‚Äì{end}\nprimary={primary}")
    ax.legend(loc="best")


def build_ops_report(
    combined: pd.DataFrame,
    summary: pd.DataFrame,
    sensor_df: pd.DataFrame,
    episodes_scored: pd.DataFrame,
    cfg: dict,
    out_pdf_path: str
):
    ensure_dir(os.path.dirname(out_pdf_path))
    with PdfPages(out_pdf_path) as pdf:

        # Page 1 ‚Äî Anomalies counts by model (dynamic columns, no vote_any)
        plt.figure(figsize=(11, 6))
        plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"] if c in summary.columns]
        summary[plot_cols].plot(kind="bar")
        plt.title("Anomalies per Model per File")
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        pdf.savefig(); plt.close()

        # Page 2 ‚Äî Top sensors by episodes_as_primary
        p1 = plot_sensor_bar_top(sensor_df, out_dir=cfg["io"]["output_folder"], metric="episodes_as_primary", top_n=15,
                                 title="Top sensors by episodes_as_primary")
        if p1 and os.path.exists(p1):
            img = plt.imread(p1)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Page 3 ‚Äî Sensor heatmap (if created)
        p2 = plot_sensor_heatmap(sensor_df, out_dir=cfg["io"]["output_folder"])
        if p2 and os.path.exists(p2):
            img = plt.imread(p2)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Pages 4+ ‚Äî Episode overlays (per-file selection with caps)
        if not episodes_scored.empty:
            candidates = episodes_scored.copy()
            sort_keys = [c for c in ["n_models_mean", "hybrid_score_mean", "iso_score_mean", "ae_error_mean"] if c in candidates.columns]
            if sort_keys:
                candidates = candidates.sort_values(sort_keys, ascending=False)

            n_per_file = int(cfg.get("report", {}).get("top_n_per_file", 2))
            max_pages  = int(cfg.get("report", {}).get("max_pages", 12))

            pages = 0
            for sf, grp in candidates.groupby("source_file"):
                for _, epi in grp.head(n_per_file).iterrows():
                    if pages >= max_pages:
                        break
                    plt.figure(figsize=(11, 5))
                    _overlay_episode_plot(combined, epi, cfg, ax=plt.gca())
                    hw = epi.get("hardware_class", "Unknown")
                    why = epi.get("hardware_why", "")
                    lag_s = epi.get("lag_seconds", np.nan)
                    sat   = epi.get("saturation_score", np.nan)
                    drift = epi.get("drift_score", np.nan)
                    vibe  = epi.get("vibe_score", np.nan)
                    txt = (
                        f"hardware: {hw}\n"
                        f"why: {why}\n"
                        f"lag_seconds: {lag_s:.4f}  |  saturation: {sat:.3f}  |  drift: {drift:.3f}  |  vibe: {vibe:.3f}"
                    )
                    plt.gcf().text(0.02, 0.02, txt, ha="left", va="bottom", fontsize=9)
                    plt.tight_layout()
                    pdf.savefig(); plt.close()
                    pages += 1
                if pages >= max_pages:
                    break


# =========================================================
# Config (defaults or JSON)
# =========================================================
def default_config() -> dict:
    return {
        "io": {
            "input_folder": "./Datasets/Datasets",
            "residual_folder": "./Anomaly_detection/residual_created/",
            "output_folder": "./Anomaly_detection/code/outputs/"
        },
        "residuals": {
            "enabled": True,
            "demand_token": "Demand",
            "measured_token": "Measured",
            "residual_token": "Residual",
            "suffix": "_residual"
        },
        "features": {
            "window": 5,
            "max_features": 500
        },
        "threshold": {
            "k": 3.5
        },
        "ae": {
            "epochs": 50,
            "lr": 0.001
        },
        "lstm": {
            "seq_len": 5,
            "hidden_dim": 64,
            "patience": 5,
            "max_sequences": 3000,
            "downsample": 5
        },
        "lof": {
            "n_neighbors": 20
        },
        "hybrid": {                     # Hybrid scoring config
            "enabled": True,
            "method": "robust_z",      # "robust_z" | "percentile"
            "min_components": 2,       # require at least N model scores present
            "weights": {               # relative importance (doesn't need to sum to 1)
                "iso_score": 0.20,
                "lof_score": 0.20,
                "ae_error": 0.30,
                "lstm_error": 0.30
            }
        },
        "hybrid_threshold": {          # How to threshold hybrid_score
            "mode": "quantile",        # "mad" or "quantile"
            "k": 3.5,                  # used only if mode="mad"
            "quantile": 0.99           # top 1% as anomalies (fallback if MAD degenerates)
        },
        "voting": {
            "rule": "vote_3plus",    # "vote_3plus" | "agreement_all_4" | "any"
            "min_gap": 1
        },
        "plots": {
            "enabled": True,
            "max_files": None,
            "emit_rate_plot": True     # also write an anomaly RATE bar chart (% rows)
        },
        "runtime": {
            "use_float32": True
        },
        "signals": {
            "sample_rate_hz": 100.0,     # set None if unknown
            "residual_token": "Residual",
            "demand_token": "Demand",
            "measured_token": "Measured"
        },
        "scores": {
            "saturation_pct": 95.0,
            "resid_prominence_pct": 95.0,
            "min_window_len": 5
        },
        "report": {
            "enabled": True,
            "top_n_episodes": 3,     # still used elsewhere; harmless
            "top_n_per_file": 2,     # NEW: how many episodes per file to show
            "max_pages": 12,         # NEW: cap to avoid huge PDFs
            "pad_points": 100        # NEW: context on each side of an episode in plots
        }
    }


def load_config_from_path_or_default(path: Optional[str]) -> dict:
    if path and os.path.exists(path):
        with open(path, "r") as f:
            return json.load(f)
    print("‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.")
    return default_config()


# =========================================================
# Per-file processing & Pipeline
# =========================================================
def process_file(file_path: str, cfg: Dict, logger=print) -> Optional[pd.DataFrame]:
    df = pd.read_csv(file_path)
    file_name = os.path.basename(file_path).replace(".csv", "")

    residual_cols = [c for c in df.columns if "Residual" in c]
    if not residual_cols:
        logger(f"‚ùå Skipped {file_name}: No residuals found.")
        return None

    X, feature_cols, fe_stats = prepare_features(
        df, residual_cols,
        window=cfg["features"]["window"],
        max_features=cfg["features"]["max_features"],
        logger=logger,
    )
    if X is None or len(feature_cols) == 0 or X.empty:
        logger(f"‚ùå Skipped {file_name}: invalid or empty features")
        return None

    _, X_scaled, X_tensor = scale_features(X, use_float32=cfg["runtime"]["use_float32"])

    iso_labels, iso_scores, iso_thr = isolation_forest_detect(X_scaled, k=cfg["threshold"]["k"])
    df.loc[X.index, "is_anomaly"] = iso_labels
    df.loc[X.index, "iso_score"] = iso_scores
    df.loc[X.index, "iso_thr"] = iso_thr

    ae_labels, ae_errors, ae_thr = dense_autoencoder_detect(
        X_tensor, k=cfg["threshold"]["k"], ae_epochs=cfg["ae"]["epochs"], ae_lr=cfg["ae"]["lr"]
    )
    df.loc[X.index, "ae_is_anomaly"] = ae_labels
    df.loc[X.index, "ae_error"] = ae_errors
    df.loc[X.index, "ae_thr"] = ae_thr

    lof_labels, lof_scores, lof_thr = lof_detect(
        X_scaled, k=cfg["threshold"]["k"], n_neighbors=cfg["lof"]["n_neighbors"]
    )
    df.loc[X.index, "lof_is_anomaly"] = lof_labels
    df.loc[X.index, "lof_score"] = lof_scores
    df.loc[X.index, "lof_thr"] = lof_thr

    lstm_labels, lstm_errors, lstm_idx, lstm_thr = lstm_autoencoder_detect(
        X_scaled,
        k=cfg["threshold"]["k"],
        seq_len=cfg["lstm"]["seq_len"],
        hidden_dim=cfg["lstm"]["hidden_dim"],
        patience=cfg["lstm"]["patience"],
        max_sequences=cfg["lstm"]["max_sequences"],
        downsample=cfg["lstm"]["downsample"],
    )
    if len(lstm_idx) > 0:
        df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
        df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
        df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
    else:
        df["lstm_is_anomaly"] = 0
        df["lstm_error"] = np.nan
        df["lstm_thr"] = np.nan

    # --- Hybrid score (weighted fusion on valid rows)
    mask_idx = X.index
    df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)

    hs = df.loc[mask_idx, "hybrid_score"].to_numpy()
    if np.isnan(hs).all():
        df.loc[mask_idx, "hybrid_is_anomaly"] = 0
        df.loc[mask_idx, "hybrid_thr"] = np.nan
    else:
        mode = cfg.get("hybrid_threshold", {}).get("mode", "mad")
        if mode == "quantile":
            q = float(cfg["hybrid_threshold"].get("quantile", 0.98))
            thr = np.nanpercentile(hs, 100 * q)
            labels = (hs > thr).astype(int)
        else:
            thr, labels = robust_threshold(hs, k=cfg["hybrid_threshold"].get("k", 3.5), tail="high")
            # Fallback if too many positives (MAD degenerate)
            if np.nanmean(labels) > 0.5:
                q = float(cfg["hybrid_threshold"].get("quantile", 0.98))
                thr = np.nanpercentile(hs, 100 * q)
                labels = (hs > thr).astype(int)
        df.loc[mask_idx, "hybrid_is_anomaly"] = labels
        df.loc[mask_idx, "hybrid_thr"] = thr

    df = generate_votes(df)  # adds vote_3plus (no vote_any)
    df["source_file"] = file_name
    df["fe_reused"] = fe_stats.get("reused", 0)
    df["fe_generated"] = fe_stats.get("generated", 0)

    logger(
        f"[{file_name}] iso={int(df['is_anomaly'].sum())} | "
        f"ae={int(df['ae_is_anomaly'].sum())} | "
        f"lof={int(df['lof_is_anomaly'].sum())} | "
        f"lstm={int(df['lstm_is_anomaly'].fillna(0).sum())} | "
        f"hyb={int(df['hybrid_is_anomaly'].sum())} | "
        f"vote3+={int(df['vote_3plus'].sum())}"
    )
    return df


def run_pipeline(cfg: Dict):
    logger = print

    # A) residuals (optional)
    if cfg["residuals"]["enabled"]:
        logger("üîß Creating residuals...")
        create_residuals_for_folder(
            in_folder=cfg["io"]["input_folder"],
            out_folder=cfg["io"]["residual_folder"],
            demand_token=cfg["residuals"]["demand_token"],
            measured_token=cfg["residuals"]["measured_token"],
            residual_token=cfg["residuals"]["residual_token"],
            skip_if_exists=True,
            suffix=cfg["residuals"]["suffix"],
            logger=logger,
        )
        data_folder = cfg["io"]["residual_folder"]
    else:
        data_folder = cfg["io"]["input_folder"]

    # B) per-file
    all_dfs = []
    for file in os.listdir(data_folder):
        if file.endswith(".csv"):
            out = process_file(os.path.join(data_folder, file), cfg, logger=logger)
            if out is not None:
                all_dfs.append(out)

    if not all_dfs:
        logger("‚ùå No files processed.")
        return

    combined = pd.concat(all_dfs, ignore_index=True)
    ensure_dir(cfg["io"]["output_folder"])

    combined_path = os.path.join(cfg["io"]["output_folder"], "combined_anomaly_results.csv")
    combined.to_csv(combined_path, index=False)

    # Summary (counts) ‚Äî no vote_any
    cols = ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"]
    cols = [c for c in cols if c in combined.columns]
    summary = combined.groupby("source_file")[cols].sum()
    summary["total_anomalies"] = summary.sum(axis=1)
    summary_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_summary.csv")
    summary.to_csv(summary_path)

    logger(f"‚úÖ Saved row-level: {combined_path}")
    logger(f"‚úÖ Saved summary:   {summary_path}")

    # C) Counts plot
    plt.figure(figsize=(12, 6))
    plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"] if c in summary.columns]
    summary[plot_cols].plot(kind="bar", figsize=(12, 6))
    plt.title("Anomalies per Model per File")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    bar_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_plot.png")
    plt.savefig(bar_path); plt.close()
    logger(f"üñºÔ∏è Saved: {bar_path}")

    # C2) Rate plot (% rows)
    if cfg.get("plots", {}).get("emit_rate_plot", True):
        sizes = combined.groupby("source_file").size().rename("n_rows")
        summary_rates = summary.div(sizes, axis=0) * 100.0
        plt.figure(figsize=(12, 6))
        rate_cols = [c for c in plot_cols if c in summary_rates.columns]
        summary_rates[rate_cols].plot(kind="bar", figsize=(12, 6))
        plt.title("Anomaly RATE per Model per File (%)")
        plt.ylabel("Percent of rows (%)")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        rate_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_rate_plot.png")
        plt.savefig(rate_path); plt.close()
        logger(f"üñºÔ∏è Saved: {rate_path}")

    # D) Voted rows + episodes + reasons
    voted_rows = extract_voted_rows(combined, rule=cfg["voting"]["rule"])
    voted_dir = os.path.join(cfg["io"]["output_folder"], "voted_outputs")
    ensure_dir(voted_dir)
    voted_rows_path = os.path.join(voted_dir, "voted_anomalies_rows.csv")
    voted_rows.to_csv(voted_rows_path, index=False)

    episodes = summarize_episodes(voted_rows, min_gap=cfg["voting"]["min_gap"])
    episodes_path = os.path.join(voted_dir, "voted_anomaly_episodes.csv")
    episodes.to_csv(episodes_path, index=False)

    episodes_with_reasons = attach_episode_reasons(combined, episodes, top_k=1)
    episodes_with_reasons = enrich_hardware_mapping(episodes_with_reasons)
    episodes_scored = score_episodes(combined, episodes_with_reasons, cfg)

    episodes_reason_path = os.path.join(voted_dir, "voted_anomaly_episodes_with_reasons.csv")
    episodes_scored_path = os.path.join(voted_dir, "voted_anomaly_episodes_with_reasons_and_scores.csv")
    episodes_with_reasons.to_csv(episodes_reason_path, index=False)
    episodes_scored.to_csv(episodes_scored_path, index=False)
    logger(f"‚úÖ Saved episodes+reason: {episodes_reason_path}")
    logger(f"‚úÖ Saved episodes+scores: {episodes_scored_path}")

    # Debug: how many episodes per file
    print("\nEPISODES PER FILE (after voting & scoring):")
    if not episodes_scored.empty:
        print(episodes_scored.groupby("source_file").size().to_string())
    else:
        print("No episodes found under current voting rule.")

    # E) Per-file plots with voted overlays (optional)
    if cfg["plots"]["enabled"]:
        _ = plot_all_files(
            combined_df=combined,
            out_dir=voted_dir,
            rule=cfg["voting"]["rule"],
            min_gap=cfg["voting"]["min_gap"],
            max_files=cfg["plots"]["max_files"],
        )

    # F) Sensor table + clustering visuals
    sensor_df = build_sensor_table(combined, voted_rows, episodes_with_reasons=episodes_with_reasons)
    sensor_df_path = os.path.join(voted_dir, "sensor_table.csv")
    sensor_df.to_csv(sensor_df_path, index=False)
    logger(f"‚úÖ Saved sensor table: {sensor_df_path}")

    clustered, Z2, centers2 = cluster_sensors(sensor_df, n_clusters=3, random_state=42)
    _ = plot_sensor_clusters_scatter(clustered, Z2, centers2, out_dir=voted_dir)
    _ = plot_sensor_heatmap(sensor_df, out_dir=voted_dir)
    _ = plot_sensor_bar_top(sensor_df, out_dir=voted_dir, metric="episodes_as_primary", top_n=15)

    # G) PDF report
    if cfg.get("report", {}).get("enabled", True):
        pdf_path = os.path.join(cfg["io"]["output_folder"], "ops_report.pdf")
        build_ops_report(
            combined=combined,
            summary=summary,
            sensor_df=sensor_df,
            episodes_scored=episodes_scored,
            cfg=cfg,
            out_pdf_path=pdf_path
        )
        logger(f"üìÑ Ops report saved: {pdf_path}")


# =========================================================
# Entrypoint
# =========================================================
def main():
    parser = argparse.ArgumentParser(description="Anomaly Detection Product")
    parser.add_argument("--config", type=str, default=None, help="Path to config JSON")
    args, _ = parser.parse_known_args()  # allows notebook execution

    cfg = load_config_from_path_or_default(args.config)
    run_pipeline(cfg)


if __name__ == "__main__":
    main()


‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.
üîß Creating residuals...
‚Ü©Ô∏è  Skip residual (exists): Dataset01_Ski_CrossbeamYawNotPerforming_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset02_Matrix_Rocker4EncoderNotWorking_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset03_Wushu_YawTrapezoidNormal_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset04_Wushu_YawWaveletSqueak_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset05_Wushu_LaneChanges_ModelBump_residual.csv
‚ùå Failed to read Dataset07_Demo_Spa_GT.csv: No columns to parse from file
‚Ü©Ô∏è  Skip residual (exists): Dataset08_Demo_Jiggler_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset09_Demo_VerticalChirp_residual.csv
‚ùå Failed to read Dataset10_Demo_MillbrookHills.csv: No columns to parse from file


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generate

[Dataset01_Ski_CrossbeamYawNotPerforming_residual] iso=6081 | ae=11638 | lof=1724 | lstm=218 | hyb=1802 | vote3+=217
‚ùå Skipped Dataset02_Matrix_Rocker4EncoderNotWorking_residual: No residuals found.


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generate

[Dataset03_Wushu_YawTrapezoidNormal_residual] iso=4353 | ae=15526 | lof=2735 | lstm=142 | hyb=3815 | vote3+=355


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generate

[Dataset04_Wushu_YawWaveletSqueak_residual] iso=5912 | ae=5678 | lof=1179 | lstm=721 | hyb=190 | vote3+=682


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generate

[Dataset05_Wushu_LaneChanges_ModelBump_residual] iso=7951 | ae=8057 | lof=1177 | lstm=336 | hyb=1312 | vote3+=119
‚ùå Skipped Dataset08_Demo_Jiggler_residual: No residuals found.
‚ùå Skipped Dataset09_Demo_VerticalChirp_residual: No residuals found.
‚úÖ Saved row-level: ./Anomaly_detection/code/outputs/combined_anomaly_results.csv
‚úÖ Saved summary:   ./Anomaly_detection/code/outputs/model_comparison_summary.csv
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/model_comparison_plot.png
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/model_comparison_rate_plot.png
‚úÖ Saved episodes+reason: ./Anomaly_detection/code/outputs/voted_outputs\voted_anomaly_episodes_with_reasons.csv
‚úÖ Saved episodes+scores: ./Anomaly_detection/code/outputs/voted_outputs\voted_anomaly_episodes_with_reasons_and_scores.csv

EPISODES PER FILE (after voting & scoring):
source_file
Dataset01_Ski_CrossbeamYawNotPerforming_residual    136
Dataset03_Wushu_YawTrapezoidNormal_residual         200
Dataset04_Wushu_YawWave



üìÑ Ops report saved: ./Anomaly_detection/code/outputs/ops_report.pdf


<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1100x600 with 0 Axes>

In [8]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Anomaly Detection Product (single script) ‚Äî Hybrid-Driven Episodes (no vote_any)
- Residual creation (Demand - Measured -> Residual)
- Feature engineering (reuse if already present)
- Scaling once -> shared across models
- Models: IsolationForest, LOF, Dense AE, LSTM AE
- Dynamic thresholds (MAD); Hybrid: robust fusion + MAD/quantile with fallback
- Hybrid-DRIVEN selection for episodes (default), with flexible selection rules
- Robust overlay plots with context padding (fixes blank episode pages)
- Episode explanations + hardware mapping + root-cause scoring
- Sensor ranking, clustering & heatmap
- Multi-page PDF Ops Report (includes hybrid)
- Emits a RATE plot (% rows) for fair cross-file comparison
"""

import os
import json
import argparse
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Torch
import torch
import torch.nn as nn
import torch.optim as optim

# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


# =========================================================
# Utils
# =========================================================
def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def safe_name(name: str) -> str:
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in str(name))


# =========================================================
# Residual creation (optional)
# =========================================================
def create_residuals_for_folder(
    in_folder: str,
    out_folder: str,
    demand_token: str = "Demand",
    measured_token: str = "Measured",
    residual_token: str = "Residual",
    skip_if_exists: bool = True,
    suffix: str = "_residual",
    logger=print,
) -> None:
    ensure_dir(out_folder)
    for file in os.listdir(in_folder):
        if not file.endswith(".csv"):
            continue

        in_path = os.path.join(in_folder, file)
        out_name = file.replace(".csv", f"{suffix}.csv")
        out_path = os.path.join(out_folder, out_name)

        if skip_if_exists and os.path.exists(out_path):
            logger(f"‚Ü©Ô∏è  Skip residual (exists): {out_name}")
            continue

        try:
            df = pd.read_csv(in_path)
        except Exception as e:
            logger(f"‚ùå Failed to read {file}: {e}")
            continue

        cols = df.columns.tolist()
        made_any = False
        for col in cols:
            if demand_token in col:
                measured_col = col.replace(demand_token, measured_token)
                if measured_col in df.columns:
                    residual_col = col.replace(demand_token, residual_token)
                    df[residual_col] = df[col] - df[measured_col]
                    made_any = True

        if not made_any:
            logger(f"‚ö†Ô∏è  No Demand/Measured pairs found in {file}.")
        df.to_csv(out_path, index=False)
        logger(f"‚úÖ Residual CSV saved: {os.path.basename(out_path)}")


# =========================================================
# Scaling + robust threshold (MAD)
# =========================================================
def scale_features(X: pd.DataFrame, use_float32: bool = True):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    if use_float32:
        X_scaled = X_scaled.astype("float32")
    X_tensor = torch.from_numpy(X_scaled)
    return scaler, X_scaled, X_tensor


def robust_threshold(
    values: np.ndarray,
    k: float = 3.5,
    tail: str = "high",
    min_anoms: int = 5,
) -> Tuple[float, np.ndarray]:
    v = np.asarray(values)
    mask = ~np.isnan(v)
    v = v[mask]
    if v.size == 0:
        return (np.inf if tail == "high" else -np.inf), np.zeros_like(values, dtype=int)

    med = np.median(v)
    mad = np.median(np.abs(v - med)) + 1e-12
    if tail == "high":
        thr = med + k * 1.4826 * mad
        labels = (values > thr).astype(int)
    else:
        thr = med - k * 1.4826 * mad
        labels = (values < thr).astype(int)

    # relax if too strict on large arrays
    if labels.sum() < min_anoms and v.size >= 100:
        for k_relax in (3.0, 2.5, 2.0):
            if tail == "high":
                thr = med + k_relax * 1.4826 * mad
                labels = (values > thr).astype(int)
            else:
                thr = med - k_relax * 1.4826 * mad
                labels = (values < thr).astype(int)
            if labels.sum() >= min_anoms:
                break

    return thr, labels


# =========================================================
# Feature Engineering
# =========================================================
def prepare_features(
    df: pd.DataFrame,
    residual_cols: List[str],
    window: int = 5,
    max_features: int = 500,
    logger=print,
) -> Tuple[pd.DataFrame, List[str], Dict[str, int]]:
    already_done = any(f"{residual_cols[0]}_delta" in df.columns for _ in residual_cols)
    stats = {"reused": 0, "generated": 0}

    if already_done:
        feature_cols = [
            c for c in df.columns
            if any(k in c for k in ["Residual", "_delta", "_rolling_mean", "_rolling_std"])
        ]
        X = df[feature_cols].dropna()
        stats["reused"] = len(feature_cols)
        logger(f"üîÅ Reusing {len(feature_cols)} engineered features.")
        return X, feature_cols, stats

    # Generate
    for col in residual_cols:
        df[f"{col}_delta"] = df[col].diff()
        df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
        df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()

    feature_cols = []
    for col in residual_cols:
        feature_cols += [
            col,
            f"{col}_delta",
            f"{col}_rolling_mean_{window}",
            f"{col}_rolling_std_{window}",
        ]

    X = df[feature_cols].dropna()
    stats["generated"] = len(feature_cols)
    logger(f"üõ†Ô∏è  Generated {len(feature_cols)} features (window={window}).")

    if X.shape[1] > max_features:
        logger(f"‚ùå Too many features ({X.shape[1]} > {max_features}). Skipping file.")
        return pd.DataFrame(), [], stats

    return X, feature_cols, stats


# =========================================================
# Models
# =========================================================
class Autoencoder(nn.Module):
    def __init__(self, input_dim: int):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, 32), nn.ReLU(), nn.Linear(32, 8))
        self.decoder = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, input_dim))

    def forward(self, x):
        return self.decoder(self.encoder(x))


def dense_autoencoder_detect(
    X_tensor: torch.Tensor, k: float, ae_epochs: int, ae_lr: float
) -> Tuple[np.ndarray, np.ndarray, float]:
    model = Autoencoder(X_tensor.shape[1])
    opt = optim.Adam(model.parameters(), lr=ae_lr)
    crit = nn.MSELoss()

    for _ in range(ae_epochs):
        opt.zero_grad()
        out = model(X_tensor)
        loss = crit(out, X_tensor)
        loss.backward()
        opt.step()

    with torch.no_grad():
        rec = model(X_tensor)
        errors = torch.mean((X_tensor - rec) ** 2, dim=1).cpu().numpy()

    thr, labels = robust_threshold(errors, k=k, tail="high")
    return labels.astype(int), errors, thr


class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True)

    def forward(self, x):
        _, (h, _) = self.encoder(x)  # [1, B, H]
        repeated = h.repeat(x.size(1), 1, 1).transpose(0, 1)  # [B, T, H]
        decoded, _ = self.decoder(repeated)
        return decoded


def make_sequences(X: np.ndarray, seq_len: int) -> Tuple[np.ndarray, List[int]]:
    seqs, idxs = [], []
    for i in range(len(X) - seq_len):
        seqs.append(X[i:i+seq_len])
        idxs.append(i + seq_len - 1)
    return np.array(seqs), idxs


def lstm_autoencoder_detect(
    X_scaled: np.ndarray,
    k: float,
    seq_len: int,
    hidden_dim: int,
    patience: int,
    max_sequences: int,
    downsample: int,
) -> Tuple[np.ndarray, np.ndarray, List[int], float]:
    try:
        Xds = X_scaled[::downsample]
        if len(Xds) < seq_len:
            return np.array([]), np.array([]), [], np.nan

        Xseq, idxs = make_sequences(Xds, seq_len)
        if len(Xseq) > max_sequences:
            Xseq, idxs = Xseq[:max_sequences], idxs[:max_sequences]

        Xt = torch.tensor(Xseq, dtype=torch.float32)
        model = LSTMAutoencoder(Xt.shape[2], hidden_dim)
        opt = optim.Adam(model.parameters(), lr=1e-3)
        crit = nn.MSELoss()

        best, wait = float("inf"), 0
        for _ in range(100):
            model.train()
            opt.zero_grad()
            out = model(Xt)
            loss = crit(out, Xt)
            loss.backward()
            opt.step()
            if loss.item() < best:
                best, wait = loss.item(), 0
            else:
                wait += 1
                if wait >= patience:
                    break

        with torch.no_grad():
            model.eval()
            out = model(Xt)
            errors = torch.mean((Xt - out) ** 2, dim=(1, 2)).cpu().numpy()

        thr, labels = robust_threshold(errors, k=k, tail="high")
        return labels.astype(int), errors, idxs, thr
    except RuntimeError as e:
        print(f"‚ö†Ô∏è LSTM memory error: {e}")
        return np.array([]), np.array([]), [], np.nan


def isolation_forest_detect(X_scaled: np.ndarray, k: float) -> Tuple[np.ndarray, np.ndarray, float]:
    iso = IsolationForest(contamination="auto", n_estimators=300, random_state=42)
    iso.fit(X_scaled)
    scores = -iso.decision_function(X_scaled)  # higher = more anomalous
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


def lof_detect(X_scaled: np.ndarray, k: float, n_neighbors: int) -> Tuple[np.ndarray, np.ndarray, float]:
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination="auto")
    _ = lof.fit_predict(X_scaled)  # populates negative_outlier_factor_
    scores = -lof.negative_outlier_factor_
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


# =========================================================
# Hybrid scoring utilities
# =========================================================
def _robust_z_pos(x: np.ndarray) -> np.ndarray:
    """Right-tail robust z-score (>=0 when above median)."""
    x = np.asarray(x, dtype=float)
    med = np.nanmedian(x)
    mad = np.nanmedian(np.abs(x - med)) + 1e-12
    z = (x - med) / (1.4826 * mad)
    z = np.where(np.isnan(z), np.nan, z)
    return np.maximum(z, 0.0)  # only right tail counts as anomalous


def _percentile01(x: np.ndarray) -> np.ndarray:
    """Map to [0,1] by robust percentiles (2‚Äì98). Values outside clamp."""
    x = np.asarray(x, dtype=float)
    lo = np.nanpercentile(x, 2)
    hi = np.nanpercentile(x, 98)
    rng = max(hi - lo, 1e-12)
    y = (x - lo) / rng
    return np.clip(y, 0.0, 1.0)


def compute_hybrid_score_on_mask(df: pd.DataFrame, cfg: dict, mask_idx) -> np.ndarray:
    """
    Compute hybrid only on valid rows (mask_idx). Returns array the size of df,
    NaN elsewhere. Requires >= min_components present.
    """
    out = np.full(len(df), np.nan)
    if not cfg.get("hybrid", {}).get("enabled", False):
        return out

    # Weights guard
    wmap = cfg.get("hybrid", {}).get("weights")
    if not isinstance(wmap, dict) or not wmap:
        wmap = {"iso_score": 0.25, "lof_score": 0.25, "ae_error": 0.25, "lstm_error": 0.25}

    method = cfg["hybrid"].get("method", "robust_z")
    min_components = int(cfg["hybrid"].get("min_components", 2))

    use = df.loc[mask_idx]  # restrict to valid feature rows

    comps = [c for c in ["iso_score", "lof_score", "ae_error", "lstm_error"] if c in use.columns and c in wmap]
    if not comps:
        return out

    parts = []
    for c in comps:
        arr = use[c].to_numpy(dtype=float)
        if method == "robust_z":
            norm = _robust_z_pos(arr)
            norm = np.clip(norm, 0, 10.0) / 10.0  # compress extreme tails to ~[0,1]
        else:
            norm = _percentile01(arr)
        parts.append((norm, float(wmap[c])))

    num = np.zeros(len(use), dtype=float)
    den = np.zeros(len(use), dtype=float)
    present = np.zeros(len(use), dtype=int)

    for norm, w in parts:
        m = ~np.isnan(norm)
        num[m] += w * norm[m]
        den[m] += w
        present[m] += 1

    hybrid_local = np.where((den > 0) & (present >= min_components), num / den, np.nan)
    out[np.asarray(mask_idx)] = hybrid_local
    return out


# =========================================================
# Voting (for reference only), selection, episodes, explanations
# =========================================================
def generate_votes(df: pd.DataFrame) -> pd.DataFrame:
    df["agreement_all_4"] = (
        (df.get("ae_is_anomaly", 0) == 1)
        & (df.get("is_anomaly", 0) == 1)
        & (df.get("lof_is_anomaly", 0) == 1)
        & (df.get("lstm_is_anomaly", 0) == 1)
    ).astype(int)
    df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
    df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
    return df  # NOTE: vote_any removed


def extract_selected_rows(df: pd.DataFrame, rule: str = "hybrid") -> pd.DataFrame:
    """
    Selection rules that drive downstream episodes & reasoning.
    Supported:
      - "hybrid": hybrid_is_anomaly == 1
      - "vote_3plus": ‚â•3/4 models (conservative)
      - "agreement_all_4": all four agree (very strict)
      - "hybrid_or_vote3p": union of hybrid and vote_3plus
      - "hybrid_and_vote3p": intersection of hybrid and vote_3plus
    """
    rule = (rule or "hybrid").lower()
    if rule == "hybrid":
        mask = df.get("hybrid_is_anomaly", 0) == 1
    elif rule == "vote_3plus":
        mask = df.get("vote_3plus", 0) == 1
    elif rule == "agreement_all_4":
        mask = df.get("agreement_all_4", 0) == 1
    elif rule == "hybrid_or_vote3p":
        mask = ((df.get("hybrid_is_anomaly", 0) == 1) | (df.get("vote_3plus", 0) == 1))
    elif rule == "hybrid_and_vote3p":
        mask = ((df.get("hybrid_is_anomaly", 0) == 1) & (df.get("vote_3plus", 0) == 1))
    else:
        raise ValueError(f"Unknown selection rule: {rule}")
    return df.loc[mask].copy()


def _group_runs(idxs: np.ndarray, min_gap: int = 1) -> List[Tuple[int, int]]:
    if len(idxs) == 0:
        return []
    runs, start, prev = [], int(idxs[0]), int(idxs[0])
    for i in idxs[1:]:
        if int(i) - prev <= min_gap:
            prev = int(i)
            continue
        runs.append((start, prev))
        start = int(i); prev = int(i)
    runs.append((start, prev))
    return runs


def summarize_episodes(selected_df: pd.DataFrame, min_gap: int = 1) -> pd.DataFrame:
    """
    Build episodes PER FILE from 'selected' rows (e.g., hybrid-driven).
    """
    if selected_df.empty:
        cols = ["source_file", "start_idx", "end_idx", "length", "n_models_mean"]
        return pd.DataFrame(columns=cols)

    rows = []
    if "source_file" in selected_df.columns:
        groups = selected_df.groupby("source_file")
    else:
        groups = [("", selected_df)]

    for sf, g in groups:
        idxs = g.index.to_numpy()
        if idxs.size == 0:
            continue
        runs = _group_runs(idxs, min_gap=min_gap)
        for start, end in runs:
            chunk = g.loc[start:end]
            row = {
                "source_file": sf,
                "start_idx": int(start),
                "end_idx": int(end),
                "length": int(end - start + 1),
                "n_models_mean": float(
                    chunk[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1).mean()
                ),
            }
            for c in ["iso_score", "ae_error", "lof_score", "lstm_error", "hybrid_score"]:
                if c in chunk.columns:
                    row[f"{c}_max"] = float(chunk[c].max())
                    row[f"{c}_mean"] = float(chunk[c].mean())
            rows.append(row)

    return pd.DataFrame(rows)


def _base_residual_columns(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def _models_string(chunk: pd.DataFrame) -> str:
    model_cols = [c for c in ["is_anomaly", "ae_is_anomaly", "lof_is_anomaly", "lstm_is_anomaly", "hybrid_is_anomaly"] if c in chunk.columns]
    if not model_cols:
        return "no-model-flags"
    means = chunk[model_cols].mean()
    active = [m.replace("_is_anomaly", "").upper() for m, v in means.items() if v >= 0.5]
    return ", ".join(active) if active else "weak/isolated flags"


def attach_episode_reasons(
    combined_df: pd.DataFrame, episodes_df: pd.DataFrame, top_k: int = 1
) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df

    base_res = _base_residual_columns(combined_df)
    if not base_res:
        episodes_df["primary_signal"] = ""
        episodes_df["reason"] = "no residual columns present"
        episodes_df["suspected_sensor"] = ""
        return episodes_df

    out = []
    for _, epi in episodes_df.iterrows():
        start, end = int(epi["start_idx"]), int(epi["end_idx"])
        mask = combined_df["source_file"] == epi["source_file"] if "source_file" in combined_df.columns else slice(None)
        chunk = combined_df.loc[mask].loc[start:end]

        if chunk.empty:
            epi["primary_signal"] = ""
            epi["reason"] = "empty slice"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats = []
        for col in base_res:
            if col in chunk.columns:
                stats.append((col, float(chunk[col].abs().max())))
        if not stats:
            epi["primary_signal"] = ""
            epi["reason"] = "no residual stats"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats.sort(key=lambda x: x[1], reverse=True)
        primary_signal, primary_val = stats[:top_k][0]
        models_str = _models_string(chunk)
        measured_col = primary_signal.replace("Residual", "Measured")
        suspected = measured_col if (measured_col in combined_df.columns) else "unknown-measured-sensor"

        epi["primary_signal"] = primary_signal
        epi["reason"] = f"max |{primary_signal}| = {primary_val:.3f}; models: {models_str}"
        epi["suspected_sensor"] = suspected
        out.append(epi)

    return pd.DataFrame(out)


# =========================================================
# Hardware mapping + root cause scoring
# =========================================================
HARDWARE_MAP = [
    ("Force_",         "Actuator/LoadCell",  "Force didn‚Äôt follow demand ‚Üí friction/lag/saturation/load-cell drift likely"),
    ("Encoder_",       "Encoders/Alignment", "Pose/velocity mismatch ‚Üí quantization/missing counts/misalignment"),
    ("Accelerometer_", "IMU/Accelerometer",  "Vibration bursts ‚Üí mounting/looseness/thermal drift"),
    ("State_",         "Control/Timing",     "Requested vs achieved state diverged ‚Üí scheduler limits/controller windup"),
]

def map_signal_to_hardware(primary_signal: str):
    for needle, hw, why in HARDWARE_MAP:
        if needle in primary_signal:
            return hw, why
    return "Unknown", "No mapping rule matched"


def enrich_hardware_mapping(episodes_df: pd.DataFrame) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df
    episodes_df = episodes_df.copy()
    episodes_df["hardware_class"] = ""
    episodes_df["hardware_why"] = ""
    for i, r in episodes_df.iterrows():
        hw, why = map_signal_to_hardware(r.get("primary_signal", ""))
        episodes_df.at[i, "hardware_class"] = hw
        episodes_df.at[i, "hardware_why"]   = why
    return episodes_df


def _paired_columns(primary_signal: str, cfg: dict) -> Tuple[Optional[str], Optional[str]]:
    resid_tok  = cfg["signals"]["residual_token"]
    demand_tok = cfg["signals"]["demand_token"]
    measured_tok = cfg["signals"]["measured_token"]
    if resid_tok not in primary_signal:
        return None, None
    demand_col   = primary_signal.replace(resid_tok, demand_tok)
    measured_col = primary_signal.replace(resid_tok, measured_tok)
    return demand_col, measured_col


def _nan_ok(arr: np.ndarray) -> np.ndarray:
    return np.asarray(arr, dtype=float)


def _cross_correlation_lag(x: np.ndarray, y: np.ndarray, sample_rate_hz: Optional[float]) -> Tuple[float, int]:
    x = _nan_ok(x); y = _nan_ok(y)
    if len(x) != len(y) or len(x) == 0:
        return (np.nan, 0)
    x = x - np.nanmean(x); y = y - np.nanmean(y)
    x = np.nan_to_num(x);  y = np.nan_to_num(y)
    corr = np.correlate(x, y, mode="full")
    lags = np.arange(-len(x)+1, len(x))
    k = int(np.argmax(corr))
    lag_samples = int(lags[k])
    lag_seconds = lag_samples / sample_rate_hz if sample_rate_hz and sample_rate_hz > 0 else np.nan
    return (lag_seconds, lag_samples)


def _saturation_score(demand: np.ndarray, residual: np.ndarray, cfg: dict) -> float:
    if len(demand) == 0 or len(residual) == 0:
        return 0.0
    p_dem = np.nanpercentile(demand, cfg["scores"]["saturation_pct"])
    p_res = np.nanpercentile(np.abs(residual), cfg["scores"]["resid_prominence_pct"])
    near_limit = demand >= p_dem
    large_res  = np.abs(residual) >= p_res
    both = np.logical_and(near_limit, large_res)
    return float(np.nansum(both)) / max(1, len(demand))


def _drift_score(residual: np.ndarray) -> float:
    residual = _nan_ok(residual)
    mu = float(np.nanmean(residual))
    sd = float(np.nanstd(residual)) + 1e-9
    return abs(mu) / sd


def _vibration_score(signal: np.ndarray, sample_rate_hz: Optional[float]) -> float:
    if not sample_rate_hz or sample_rate_hz <= 0 or len(signal) < 8:
        return np.nan
    sig = np.nan_to_num(signal - np.nanmean(signal))
    fft = np.fft.rfft(sig)
    power = np.abs(fft) ** 2
    freqs = np.fft.rfftfreq(len(sig), d=1.0 / sample_rate_hz)
    if len(freqs) == 0:
        return np.nan
    cutoff = 0.25 * (sample_rate_hz / 2.0)
    mask_hi = freqs >= cutoff
    num = float(np.nansum(power[mask_hi]))
    den = float(np.nansum(power) + 1e-12)
    return num / den


def score_episodes(combined_df: pd.DataFrame, episodes_df: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df
    out = episodes_df.copy()
    sr = cfg["signals"]["sample_rate_hz"]
    min_len = cfg["scores"]["min_window_len"]

    if "primary_signal" not in out.columns:
        out["primary_signal"] = ""

    for i, r in out.iterrows():
        start, end = int(r["start_idx"]), int(r["end_idx"])
        if end - start + 1 < min_len:
            out.at[i, "lag_seconds"] = np.nan
            out.at[i, "lag_samples"] = 0
            out.at[i, "saturation_score"] = 0.0
            out.at[i, "drift_score"] = 0.0
            out.at[i, "vibe_score"] = np.nan
            continue

        if "source_file" in combined_df.columns and "source_file" in out.columns and "source_file" in r:
            chunk = combined_df.loc[(combined_df["source_file"] == r["source_file"])].loc[start:end]
        else:
            chunk = combined_df.loc[start:end]

        primary = r.get("primary_signal", "")
        demand_col, measured_col = _paired_columns(primary, cfg)

        resid = chunk[primary].values if (primary in chunk.columns) else np.array([])
        dem   = chunk[demand_col].values if (demand_col and demand_col in chunk.columns) else np.array([])
        meas  = chunk[measured_col].values if (measured_col and measured_col in chunk.columns) else np.array([])

        lag_s, lag_k = _cross_correlation_lag(dem, meas, sr) if (len(dem) and len(meas)) else (np.nan, 0)
        sat_sc = _saturation_score(dem, resid, cfg) if (len(dem) and len(resid)) else 0.0
        dr_sc  = _drift_score(resid) if len(resid) else 0.0
        if "Accelerometer_" in primary and primary in chunk.columns:
            vibe_sc = _vibration_score(chunk[primary].values, sr)
        else:
            vibe_sc = _vibration_score(resid, sr)

        out.at[i, "lag_seconds"]       = lag_s
        out.at[i, "lag_samples"]       = int(lag_k)
        out.at[i, "saturation_score"]  = float(sat_sc)
        out.at[i, "drift_score"]       = float(dr_sc)
        out.at[i, "vibe_score"]        = float(vibe_sc) if vibe_sc == vibe_sc else np.nan
    return out


# =========================================================
# Plotting helpers (per-file selected overlays)
# =========================================================
def _pick_residual(df: pd.DataFrame) -> Optional[str]:
    cand = [c for c in df.columns if "Residual" in c and not any(t in c for t in ["_delta", "_rolling_"])]
    return cand[0] if cand else None


def _slice_by_global_index(sub: pd.DataFrame, start: int, end: int, pad: int = 100) -> pd.DataFrame:
    """
    Robust slice of `sub` (filtered to one file) using global indices [start, end],
    expanded by `pad` points on both sides. Works even when index has gaps.
    """
    if sub.empty:
        return sub
    idx = sub.index.to_numpy()
    i0 = np.searchsorted(idx, start, side="left")
    i1 = np.searchsorted(idx, end,   side="right")
    i0 = max(i0 - pad, 0)
    i1 = min(i1 + pad, len(idx))
    return sub.iloc[i0:i1]


def plot_selected_for_file(
    df_file: pd.DataFrame,
    out_dir: str,
    rule: str,
    min_gap: int,
    figsize: Tuple[int, int] = (12, 5),
) -> Optional[str]:
    ensure_dir(out_dir)
    residual_col = _pick_residual(df_file)
    if residual_col is None:
        print("‚ö†Ô∏è No residual column to plot.")
        return None

    selected_rows = extract_selected_rows(df_file, rule=rule)
    episodes = summarize_episodes(selected_rows, min_gap=min_gap)

    plt.figure(figsize=figsize)
    plt.plot(df_file.index, df_file[residual_col], label=residual_col, alpha=0.85)

    if not selected_rows.empty:
        plt.scatter(selected_rows.index, selected_rows[residual_col], s=12, label=f"Selected anomalies ({rule})")

    if not episodes.empty:
        for _, r in episodes.iterrows():
            plt.axvspan(r["start_idx"], r["end_idx"], alpha=0.15, label="Episode")
        handles, labels = plt.gca().get_legend_handles_labels()
        uniq, seen = [], set()
        for h, l in zip(handles, labels):
            if l not in seen:
                uniq.append((h, l)); seen.add(l)
        handles, labels = zip(*uniq)
        plt.legend(handles, labels)
    else:
        plt.legend()

    sf = df_file["source_file"].iloc[0] if "source_file" in df_file.columns else "file"
    plt.title(f"{sf} ‚Äî Residual with selected anomalies & episodes ({rule})")
    plt.xlabel("Index"); plt.ylabel(residual_col)
    plt.tight_layout()
    out_path = os.path.join(out_dir, f"selected_plot_{safe_name(sf)}.png")
    plt.savefig(out_path, dpi=160); plt.close()
    return out_path


def plot_all_files(combined_df: pd.DataFrame, out_dir: str, rule: str, min_gap: int, max_files: Optional[int] = None):
    paths = []
    if "source_file" not in combined_df.columns:
        print("‚ö†Ô∏è combined_df missing 'source_file'.")
        return paths
    groups = list(combined_df.groupby("source_file"))
    if max_files is not None:
        groups = groups[:max_files]
    for fname, df_file in groups:
        p = plot_selected_for_file(df_file, out_dir=out_dir, rule=rule, min_gap=min_gap)
        if p:
            paths.append(p); print(f"üñºÔ∏è Saved: {p}")
    if not paths:
        print("‚ö†Ô∏è No plots produced.")
    return paths


# =========================================================
# Sensor attribution, clustering & heatmap
# =========================================================
def _residual_cols_base(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def build_sensor_table(
    combined: pd.DataFrame,
    selected_rows: pd.DataFrame,
    episodes_with_reasons: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    base_res = _residual_cols_base(combined)
    if not base_res:
        return pd.DataFrame()

    total_rows = len(combined)
    selected_mask = pd.Series(False, index=combined.index)
    if not selected_rows.empty:
        selected_mask.loc[selected_rows.index] = True

    rows = []
    expected_keys = [
        "anomaly_rate_is",
        "anomaly_rate_ae",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "anomaly_rate_hybrid",
        "anomaly_rate_vote3p",
        "mean_abs_resid_selected",
        "max_abs_resid_selected",
        "episodes_as_primary",
    ]

    for col in base_res:
        stats = {"sensor": col}

        stats["anomaly_rate_is"]   = float(combined["is_anomaly"].sum())   / max(total_rows, 1) if "is_anomaly"   in combined.columns else 0.0
        stats["anomaly_rate_ae"]   = float(combined["ae_is_anomaly"].sum())/ max(total_rows, 1) if "ae_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lof"]  = float(combined["lof_is_anomaly"].sum())/max(total_rows, 1) if "lof_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lstm"] = float(combined["lstm_is_anomaly"].fillna(0).sum())/max(total_rows, 1) if "lstm_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_hybrid"] = float(combined["hybrid_is_anomaly"].sum())/max(total_rows, 1) if "hybrid_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_vote3p"] = float(combined["vote_3plus"].sum())/max(total_rows, 1) if "vote_3plus" in combined.columns else 0.0

        if col in combined.columns and selected_mask.any():
            vals = combined.loc[selected_mask, col].abs()
            stats["mean_abs_resid_selected"] = float(vals.mean()) if not vals.empty else 0.0
            stats["max_abs_resid_selected"]  = float(vals.max())  if not vals.empty else 0.0
        else:
            stats["mean_abs_resid_selected"] = 0.0
            stats["max_abs_resid_selected"]  = 0.0

        if episodes_with_reasons is not None and not episodes_with_reasons.empty and "primary_signal" in episodes_with_reasons.columns:
            stats["episodes_as_primary"] = int((episodes_with_reasons["primary_signal"] == col).sum())
        else:
            stats["episodes_as_primary"] = 0

        for k in expected_keys:
            stats.setdefault(k, 0.0)

        rows.append(stats)

    sensor_df = pd.DataFrame(rows)
    for c in sensor_df.columns:
        if c != "sensor":
            sensor_df[c] = sensor_df[c].fillna(0.0)
    return sensor_df


def cluster_sensors(
    sensor_df: pd.DataFrame,
    n_clusters: int = 3,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    if sensor_df.empty or "sensor" not in sensor_df.columns:
        return sensor_df, np.empty((0, 2)), np.empty((0, 2))

    features = sensor_df.drop(columns=["sensor"]).to_numpy(dtype=np.float32)
    if features.shape[0] < n_clusters:
        n_clusters = max(1, features.shape[0])

    scaler = StandardScaler()
    Z = scaler.fit_transform(features)

    km = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
    labels = km.fit_predict(Z)

    pca = PCA(n_components=2, random_state=random_state)
    Z2 = pca.fit_transform(Z)
    centers2 = pca.transform(km.cluster_centers_)

    out = sensor_df.copy()
    out["cluster"] = labels

    return out, Z2, centers2


def plot_sensor_bar_top(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metric: str = "episodes_as_primary",
    top_n: int = 15,
    title: Optional[str] = None,
) -> Optional[str]:
    if sensor_df.empty or metric not in sensor_df.columns:
        return None

    ensure_dir(out_dir)
    df = sensor_df.sort_values(metric, ascending=False).head(top_n)

    plt.figure(figsize=(12, 6))
    plt.bar(range(len(df)), df[metric])
    plt.xticks(range(len(df)), [s.replace("Force_", "F_") for s in df["sensor"]], rotation=60, ha="right")
    plt.ylabel(metric)
    plt.title(title or f"Top {top_n} sensors by {metric}")
    plt.tight_layout()

    path = os.path.join(out_dir, f"top_sensors_{metric}.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_clusters_scatter(
    sensor_df_with_cluster: pd.DataFrame,
    Z2: np.ndarray,
    centers2: np.ndarray,
    out_dir: str,
    title: str = "Sensor clusters (PCA of features)",
) -> Optional[str]:
    if sensor_df_with_cluster.empty or Z2.size == 0:
        return None

    ensure_dir(out_dir)
    plt.figure(figsize=(9, 7))

    clusters = sorted(sensor_df_with_cluster["cluster"].unique().tolist())
    for cl in clusters:
        mask = sensor_df_with_cluster["cluster"] == cl
        pts = Z2[mask.values]
        plt.scatter(pts[:, 0], pts[:, 1], label=f"cluster {cl}", alpha=0.8, s=36)

    if centers2.size:
        plt.scatter(centers2[:, 0], centers2[:, 1], marker="X", s=120, label="centers")

    try:
        top_lab = sensor_df_with_cluster.sort_values("episodes_as_primary", ascending=False).head(10).index
        for idx in top_lab:
            plt.text(Z2[idx, 0], Z2[idx, 1], sensor_df_with_cluster.loc[idx, "sensor"], fontsize=8)
    except Exception:
        pass

    plt.title(title)
    plt.xlabel("PCA-1"); plt.ylabel("PCA-2")
    plt.legend()
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_clusters_pca.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_heatmap(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metrics: Optional[List[str]] = None,
    title: str = "Sensor anomaly fingerprint (rates & magnitudes)",
) -> Optional[str]:
    if sensor_df.empty:
        return None
    ensure_dir(out_dir)

    desired = [
        "anomaly_rate_hybrid",
        "anomaly_rate_vote3p",
        "anomaly_rate_ae",
        "anomaly_rate_is",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "mean_abs_resid_selected",
        "max_abs_resid_selected",
    ]
    if metrics is None:
        metrics = desired

    available = [m for m in metrics if m in sensor_df.columns]
    if not available:
        print("‚ö†Ô∏è No requested heatmap metrics are present in sensor_df. Skipping heatmap.")
        return None
    if len(available) < len(metrics):
        missing = [m for m in metrics if m not in sensor_df.columns]
        print(f"‚ÑπÔ∏è Skipping missing metrics in heatmap: {missing}")
    metrics = available

    key_rank = "episodes_as_primary" if "episodes_as_primary" in sensor_df.columns else metrics[0]
    keep = sensor_df.sort_values(key_rank, ascending=False).head(25)

    M = keep[metrics].to_numpy(dtype=np.float32)
    plt.figure(figsize=(12, 8))
    plt.imshow(M, aspect="auto")
    plt.colorbar()
    plt.yticks(range(len(keep)), keep["sensor"])
    plt.xticks(range(len(metrics)), metrics, rotation=45, ha="right")
    plt.title(title)
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_fingerprint_heatmap.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


# =========================================================
# Report (robust overlays with padding)
# =========================================================
def _overlay_episode_plot(df: pd.DataFrame, episode_row: pd.Series, cfg: dict, ax=None):
    start, end = int(episode_row["start_idx"]), int(episode_row["end_idx"])
    primary = episode_row.get("primary_signal", "")
    demand_col, measured_col = _paired_columns(primary, cfg)
    pad = int(cfg.get("report", {}).get("pad_points", 100))

    if ax is None:
        ax = plt.gca()

    # If df contains multiple files, filter to the right one first
    if "source_file" in df.columns and "source_file" in episode_row:
        df = df.loc[df["source_file"] == episode_row["source_file"]]

    # Robust slice by global indices + padding
    window = _slice_by_global_index(df, start, end, pad=pad)
    if window.empty:
        ax.set_title(f"Episode {start}‚Äì{end} (EMPTY SLICE)")
        return

    t = window.index.to_numpy()

    if primary in window.columns:
        ax.plot(t, window[primary].values, label=f"{primary}", alpha=0.9)
    if demand_col and demand_col in window.columns:
        ax.plot(t, window[demand_col].values, label=f"{demand_col}", alpha=0.8)
    if measured_col and measured_col in window.columns:
        ax.plot(t, window[measured_col].values, label=f"{measured_col}", alpha=0.8)

    # Shade the exact episode span inside the padded window
    ax.axvspan(start, end, alpha=0.15, label="episode window")

    ax.set_xlabel("Index")
    ax.set_title(f"Episode {start}‚Äì{end}\nprimary={primary}")
    ax.legend(loc="best")


def build_ops_report(
    combined: pd.DataFrame,
    summary: pd.DataFrame,
    sensor_df: pd.DataFrame,
    episodes_scored: pd.DataFrame,
    cfg: dict,
    out_pdf_path: str
):
    ensure_dir(os.path.dirname(out_pdf_path))
    with PdfPages(out_pdf_path) as pdf:

        # Page 1 ‚Äî Anomalies counts by model (dynamic columns)
        plt.figure(figsize=(11, 6))
        plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"] if c in summary.columns]
        summary[plot_cols].plot(kind="bar")
        plt.title("Anomalies per Model per File")
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        pdf.savefig(); plt.close()

        # Page 2 ‚Äî Top sensors by episodes_as_primary
        p1 = plot_sensor_bar_top(sensor_df, out_dir=cfg["io"]["output_folder"], metric="episodes_as_primary", top_n=15,
                                 title="Top sensors by episodes_as_primary")
        if p1 and os.path.exists(p1):
            img = plt.imread(p1)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Page 3 ‚Äî Sensor heatmap (if created)
        p2 = plot_sensor_heatmap(sensor_df, out_dir=cfg["io"]["output_folder"])
        if p2 and os.path.exists(p2):
            img = plt.imread(p2)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Pages 4+ ‚Äî Episode overlays (per-file selection with caps)
        if not episodes_scored.empty:
            candidates = episodes_scored.copy()
            sort_keys = [c for c in ["n_models_mean", "hybrid_score_mean", "iso_score_mean", "ae_error_mean"] if c in candidates.columns]
            if sort_keys:
                candidates = candidates.sort_values(sort_keys, ascending=False)

            n_per_file = int(cfg.get("report", {}).get("top_n_per_file", 2))
            max_pages  = int(cfg.get("report", {}).get("max_pages", 12))

            pages = 0
            for sf, grp in candidates.groupby("source_file"):
                for _, epi in grp.head(n_per_file).iterrows():
                    if pages >= max_pages:
                        break
                    plt.figure(figsize=(11, 5))
                    _overlay_episode_plot(combined, epi, cfg, ax=plt.gca())
                    hw = epi.get("hardware_class", "Unknown")
                    why = epi.get("hardware_why", "")
                    lag_s = epi.get("lag_seconds", np.nan)
                    sat   = epi.get("saturation_score", np.nan)
                    drift = epi.get("drift_score", np.nan)
                    vibe  = epi.get("vibe_score", np.nan)
                    txt = (
                        f"hardware: {hw}\n"
                        f"why: {why}\n"
                        f"lag_seconds: {lag_s:.4f}  |  saturation: {sat:.3f}  |  drift: {drift:.3f}  |  vibe: {vibe:.3f}"
                    )
                    plt.gcf().text(0.02, 0.02, txt, ha="left", va="bottom", fontsize=9)
                    plt.tight_layout()
                    pdf.savefig(); plt.close()
                    pages += 1
                if pages >= max_pages:
                    break


# =========================================================
# Config (defaults or JSON)
# =========================================================
def default_config() -> dict:
    return {
        "io": {
            "input_folder": "./Datasets/Datasets",
            "residual_folder": "./Anomaly_detection/residual_created/",
            "output_folder": "./Anomaly_detection/hybrid/outputs/"
        },
        "residuals": {
            "enabled": True,
            "demand_token": "Demand",
            "measured_token": "Measured",
            "residual_token": "Residual",
            "suffix": "_residual"
        },
        "features": {
            "window": 5,
            "max_features": 500
        },
        "threshold": {
            "k": 3.5
        },
        "ae": {
            "epochs": 50,
            "lr": 0.001
        },
        "lstm": {
            "seq_len": 5,
            "hidden_dim": 64,
            "patience": 5,
            "max_sequences": 3000,
            "downsample": 5
        },
        "lof": {
            "n_neighbors": 20
        },
        "hybrid": {                     # Hybrid scoring config
            "enabled": True,
            "method": "robust_z",      # "robust_z" | "percentile"
            "min_components": 2,       # require at least N model scores present
            "weights": {               # relative importance (doesn't need to sum to 1)
                "iso_score": 0.20,
                "lof_score": 0.20,
                "ae_error": 0.30,
                "lstm_error": 0.30
            }
        },
        "hybrid_threshold": {          # How to threshold hybrid_score
            "mode": "quantile",        # "mad" or "quantile"
            "k": 3.5,                  # used only if mode="mad"
            "quantile": 0.99           # top 1% as anomalies (fallback if MAD degenerates)
        },
        "voting": {                    # kept for reference plots; not used for selection unless you choose it
            "rule": "vote_3plus",
            "min_gap": 1
        },
        "selection": {                 # NEW: drives episodes & downstream logic
            "rule": "hybrid"           # "hybrid" | "vote_3plus" | "agreement_all_4" | "hybrid_or_vote3p" | "hybrid_and_vote3p"
        },
        "plots": {
            "enabled": True,
            "max_files": None,
            "emit_rate_plot": True     # also write an anomaly RATE bar chart (% rows)
        },
        "runtime": {
            "use_float32": True
        },
        "signals": {
            "sample_rate_hz": 100.0,     # set None if unknown
            "residual_token": "Residual",
            "demand_token": "Demand",
            "measured_token": "Measured"
        },
        "scores": {
            "saturation_pct": 95.0,
            "resid_prominence_pct": 95.0,
            "min_window_len": 5
        },
        "report": {
            "enabled": True,
            "top_n_episodes": 3,     # still used elsewhere; harmless
            "top_n_per_file": 2,     # how many episodes per file to show
            "max_pages": 12,         # cap to avoid huge PDFs
            "pad_points": 100        # context on each side of an episode in plots
        }
    }


def load_config_from_path_or_default(path: Optional[str]) -> dict:
    if path and os.path.exists(path):
        with open(path, "r") as f:
            return json.load(f)
    print("‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.")
    return default_config()


# =========================================================
# Per-file processing & Pipeline
# =========================================================
def process_file(file_path: str, cfg: Dict, logger=print) -> Optional[pd.DataFrame]:
    df = pd.read_csv(file_path)
    file_name = os.path.basename(file_path).replace(".csv", "")

    residual_cols = [c for c in df.columns if "Residual" in c]
    if not residual_cols:
        logger(f"‚ùå Skipped {file_name}: No residuals found.")
        return None

    X, feature_cols, fe_stats = prepare_features(
        df, residual_cols,
        window=cfg["features"]["window"],
        max_features=cfg["features"]["max_features"],
        logger=logger,
    )
    if X is None or len(feature_cols) == 0 or X.empty:
        logger(f"‚ùå Skipped {file_name}: invalid or empty features")
        return None

    _, X_scaled, X_tensor = scale_features(X, use_float32=cfg["runtime"]["use_float32"])

    iso_labels, iso_scores, iso_thr = isolation_forest_detect(X_scaled, k=cfg["threshold"]["k"])
    df.loc[X.index, "is_anomaly"] = iso_labels
    df.loc[X.index, "iso_score"] = iso_scores
    df.loc[X.index, "iso_thr"] = iso_thr

    ae_labels, ae_errors, ae_thr = dense_autoencoder_detect(
        X_tensor, k=cfg["threshold"]["k"], ae_epochs=cfg["ae"]["epochs"], ae_lr=cfg["ae"]["lr"]
    )
    df.loc[X.index, "ae_is_anomaly"] = ae_labels
    df.loc[X.index, "ae_error"] = ae_errors
    df.loc[X.index, "ae_thr"] = ae_thr

    lof_labels, lof_scores, lof_thr = lof_detect(
        X_scaled, k=cfg["threshold"]["k"], n_neighbors=cfg["lof"]["n_neighbors"]
    )
    df.loc[X.index, "lof_is_anomaly"] = lof_labels
    df.loc[X.index, "lof_score"] = lof_scores
    df.loc[X.index, "lof_thr"] = lof_thr

    lstm_labels, lstm_errors, lstm_idx, lstm_thr = lstm_autoencoder_detect(
        X_scaled,
        k=cfg["threshold"]["k"],
        seq_len=cfg["lstm"]["seq_len"],
        hidden_dim=cfg["lstm"]["hidden_dim"],
        patience=cfg["lstm"]["patience"],
        max_sequences=cfg["lstm"]["max_sequences"],
        downsample=cfg["lstm"]["downsample"],
    )
    if len(lstm_idx) > 0:
        df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
        df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
        df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
    else:
        df["lstm_is_anomaly"] = 0
        df["lstm_error"] = np.nan
        df["lstm_thr"] = np.nan

    # --- Hybrid score (weighted fusion on valid rows)
    mask_idx = X.index
    df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)

    hs = df.loc[mask_idx, "hybrid_score"].to_numpy()
    if np.isnan(hs).all():
        df.loc[mask_idx, "hybrid_is_anomaly"] = 0
        df.loc[mask_idx, "hybrid_thr"] = np.nan
    else:
        mode = cfg.get("hybrid_threshold", {}).get("mode", "mad")
        if mode == "quantile":
            q = float(cfg["hybrid_threshold"].get("quantile", 0.98))
            thr = np.nanpercentile(hs, 100 * q)
            labels = (hs > thr).astype(int)
        else:
            thr, labels = robust_threshold(hs, k=cfg["hybrid_threshold"].get("k", 3.5), tail="high")
            # Fallback if too many positives (MAD degenerate)
            if np.nanmean(labels) > 0.5:
                q = float(cfg["hybrid_threshold"].get("quantile", 0.98))
                thr = np.nanpercentile(hs, 100 * q)
                labels = (hs > thr).astype(int)
        df.loc[mask_idx, "hybrid_is_anomaly"] = labels
        df.loc[mask_idx, "hybrid_thr"] = thr

    # Votes (kept for reference/plots)
    df = generate_votes(df)

    df["source_file"] = file_name
    df["fe_reused"] = fe_stats.get("reused", 0)
    df["fe_generated"] = fe_stats.get("generated", 0)

    logger(
        f"[{file_name}] iso={int(df['is_anomaly'].sum())} | "
        f"ae={int(df['ae_is_anomaly'].sum())} | "
        f"lof={int(df['lof_is_anomaly'].sum())} | "
        f"lstm={int(df['lstm_is_anomaly'].fillna(0).sum())} | "
        f"hyb={int(df['hybrid_is_anomaly'].sum())} | "
        f"vote3+={int(df['vote_3plus'].sum())}"
    )
    return df


def run_pipeline(cfg: Dict):
    logger = print

    # A) residuals (optional)
    if cfg["residuals"]["enabled"]:
        logger("üîß Creating residuals...")
        create_residuals_for_folder(
            in_folder=cfg["io"]["input_folder"],
            out_folder=cfg["io"]["residual_folder"],
            demand_token=cfg["residuals"]["demand_token"],
            measured_token=cfg["residuals"]["measured_token"],
            residual_token=cfg["residuals"]["residual_token"],
            skip_if_exists=True,
            suffix=cfg["residuals"]["suffix"],
            logger=logger,
        )
        data_folder = cfg["io"]["residual_folder"]
    else:
        data_folder = cfg["io"]["input_folder"]

    # B) per-file
    all_dfs = []
    for file in os.listdir(data_folder):
        if file.endswith(".csv"):
            out = process_file(os.path.join(data_folder, file), cfg, logger=logger)
            if out is not None:
                all_dfs.append(out)

    if not all_dfs:
        logger("‚ùå No files processed.")
        return

    combined = pd.concat(all_dfs, ignore_index=True)
    ensure_dir(cfg["io"]["output_folder"])

    combined_path = os.path.join(cfg["io"]["output_folder"], "combined_anomaly_results.csv")
    combined.to_csv(combined_path, index=False)

    # Summary (counts)
    cols = ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"]
    cols = [c for c in cols if c in combined.columns]
    summary = combined.groupby("source_file")[cols].sum()
    summary["total_anomalies"] = summary.sum(axis=1)
    summary_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_summary.csv")
    summary.to_csv(summary_path)

    logger(f"‚úÖ Saved row-level: {combined_path}")
    logger(f"‚úÖ Saved summary:   {summary_path}")

    # C) Counts plot
    plt.figure(figsize=(12, 6))
    plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"] if c in summary.columns]
    summary[plot_cols].plot(kind="bar", figsize=(12, 6))
    plt.title("Anomalies per Model per File")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    bar_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_plot.png")
    plt.savefig(bar_path); plt.close()
    logger(f"üñºÔ∏è Saved: {bar_path}")

    # C2) Rate plot (% rows)
    if cfg.get("plots", {}).get("emit_rate_plot", True):
        sizes = combined.groupby("source_file").size().rename("n_rows")
        summary_rates = summary.div(sizes, axis=0) * 100.0
        plt.figure(figsize=(12, 6))
        rate_cols = [c for c in plot_cols if c in summary_rates.columns]
        summary_rates[rate_cols].plot(kind="bar", figsize=(12, 6))
        plt.title("Anomaly RATE per Model per File (%)")
        plt.ylabel("Percent of rows (%)")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        rate_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_rate_plot.png")
        plt.savefig(rate_path); plt.close()
        logger(f"üñºÔ∏è Saved: {rate_path}")

    # D) Selection (HYBRID by default) + episodes + reasons
    selection_rule = cfg.get("selection", {}).get("rule", "hybrid")
    selected_rows = extract_selected_rows(combined, rule=selection_rule)
    sel_dir = os.path.join(cfg["io"]["output_folder"], "selected_outputs")
    ensure_dir(sel_dir)
    selected_rows_path = os.path.join(sel_dir, f"selected_rows_{selection_rule}.csv")
    selected_rows.to_csv(selected_rows_path, index=False)

    episodes = summarize_episodes(selected_rows, min_gap=cfg["voting"]["min_gap"])
    episodes_path = os.path.join(sel_dir, f"episodes_{selection_rule}.csv")
    episodes.to_csv(episodes_path, index=False)

    episodes_with_reasons = attach_episode_reasons(combined, episodes, top_k=1)
    episodes_with_reasons = enrich_hardware_mapping(episodes_with_reasons)
    episodes_scored = score_episodes(combined, episodes_with_reasons, cfg)

    episodes_reason_path = os.path.join(sel_dir, f"episodes_with_reasons_{selection_rule}.csv")
    episodes_scored_path = os.path.join(sel_dir, f"episodes_with_reasons_and_scores_{selection_rule}.csv")
    episodes_with_reasons.to_csv(episodes_reason_path, index=False)
    episodes_scored.to_csv(episodes_scored_path, index=False)
    logger(f"‚úÖ Saved episodes+reason: {episodes_reason_path}")
    logger(f"‚úÖ Saved episodes+scores: {episodes_scored_path}")

    # Debug: how many episodes per file
    print(f"\nEPISODES PER FILE (selection rule = {selection_rule}):")
    if not episodes_scored.empty:
        print(episodes_scored.groupby("source_file").size().to_string())
    else:
        print("No episodes found under current selection rule.")

    # E) Per-file plots with selected overlays (optional)
    if cfg["plots"]["enabled"]:
        _ = plot_all_files(
            combined_df=combined,
            out_dir=sel_dir,
            rule=selection_rule,
            min_gap=cfg["voting"]["min_gap"],
            max_files=cfg["plots"]["max_files"],
        )

    # F) Sensor table + clustering visuals (using SELECTED mask)
    sensor_df = build_sensor_table(combined, selected_rows, episodes_with_reasons=episodes_with_reasons)
    sensor_df_path = os.path.join(sel_dir, "sensor_table.csv")
    sensor_df.to_csv(sensor_df_path, index=False)
    logger(f"‚úÖ Saved sensor table: {sensor_df_path}")

    clustered, Z2, centers2 = cluster_sensors(sensor_df, n_clusters=3, random_state=42)
    _ = plot_sensor_clusters_scatter(clustered, Z2, centers2, out_dir=sel_dir)
    _ = plot_sensor_heatmap(sensor_df, out_dir=sel_dir)
    _ = plot_sensor_bar_top(sensor_df, out_dir=sel_dir, metric="episodes_as_primary", top_n=15)

    # G) PDF report
    if cfg.get("report", {}).get("enabled", True):
        pdf_path = os.path.join(cfg["io"]["output_folder"], "ops_report.pdf")
        build_ops_report(
            combined=combined,
            summary=summary,
            sensor_df=sensor_df,
            episodes_scored=episodes_scored,
            cfg=cfg,
            out_pdf_path=pdf_path
        )
        logger(f"üìÑ Ops report saved: {pdf_path}")


# =========================================================
# Entrypoint
# =========================================================
def main():
    parser = argparse.ArgumentParser(description="Anomaly Detection Product")
    parser.add_argument("--config", type=str, default=None, help="Path to config JSON")
    args, _ = parser.parse_known_args()  # allows notebook execution

    cfg = load_config_from_path_or_default(args.config)
    run_pipeline(cfg)


if __name__ == "__main__":
    main()


‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.
üîß Creating residuals...
‚Ü©Ô∏è  Skip residual (exists): Dataset01_Ski_CrossbeamYawNotPerforming_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset02_Matrix_Rocker4EncoderNotWorking_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset03_Wushu_YawTrapezoidNormal_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset04_Wushu_YawWaveletSqueak_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset05_Wushu_LaneChanges_ModelBump_residual.csv
‚ùå Failed to read Dataset07_Demo_Spa_GT.csv: No columns to parse from file
‚Ü©Ô∏è  Skip residual (exists): Dataset08_Demo_Jiggler_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset09_Demo_VerticalChirp_residual.csv
‚ùå Failed to read Dataset10_Demo_MillbrookHills.csv: No columns to parse from file


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generate

[Dataset01_Ski_CrossbeamYawNotPerforming_residual] iso=6081 | ae=10734 | lof=1724 | lstm=217 | hyb=1802 | vote3+=217
‚ùå Skipped Dataset02_Matrix_Rocker4EncoderNotWorking_residual: No residuals found.


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generate

[Dataset03_Wushu_YawTrapezoidNormal_residual] iso=4353 | ae=13404 | lof=2735 | lstm=141 | hyb=3815 | vote3+=368


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generate

[Dataset04_Wushu_YawWaveletSqueak_residual] iso=5912 | ae=5598 | lof=1179 | lstm=721 | hyb=190 | vote3+=662


  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std()
  df[f"{col}_delta"] = df[col].diff()
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean()
  df[f"{col}_rolling_std_{window}"] = df[col].roll

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx)
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generate

[Dataset05_Wushu_LaneChanges_ModelBump_residual] iso=7951 | ae=8071 | lof=1177 | lstm=330 | hyb=1312 | vote3+=118
‚ùå Skipped Dataset08_Demo_Jiggler_residual: No residuals found.
‚ùå Skipped Dataset09_Demo_VerticalChirp_residual: No residuals found.
‚úÖ Saved row-level: ./Anomaly_detection/hybrid/outputs/combined_anomaly_results.csv
‚úÖ Saved summary:   ./Anomaly_detection/hybrid/outputs/model_comparison_summary.csv
üñºÔ∏è Saved: ./Anomaly_detection/hybrid/outputs/model_comparison_plot.png
üñºÔ∏è Saved: ./Anomaly_detection/hybrid/outputs/model_comparison_rate_plot.png


MemoryError: Unable to allocate 2.91 MiB for an array with shape (381474, 1) and data type float64

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [10]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Anomaly Detection Product (single script) ‚Äî Hybrid-driven + Memory-safe
- Residual creation (Demand - Measured -> Residual)
- Feature engineering (reuse if already present) with float32 downcasting
- Scaling once -> shared across models
- Models: IsolationForest, LOF, Dense AE, LSTM AE (each can be skipped on huge files)
- Dynamic thresholds (MAD) with quantile fallback when needed
- HYBRID scoring (weighted fusion) drives selection & episodes by default
- Voting (3+) retained for diagnostics; vote_any removed
- Episodes per file, robust overlays with context padding
- Episode explanations + hardware mapping + root-cause scoring (lag/sat/drift/vibe)
- Sensor ranking, clustering & heatmap
- Multi-page PDF Ops Report: counts, RATE (%), hybrid histogram, sensors, heatmap, episodes
- Memory guards: float32, dataframe downcast, model limits, early gc
"""

import os
import gc
import json
import argparse
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Torch
import torch
import torch.nn as nn
import torch.optim as optim

# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


# =========================================================
# Utils
# =========================================================
def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def safe_name(name: str) -> str:
    return "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in str(name))


def downcast_df_inplace(df: pd.DataFrame, prefer_float32: bool = True) -> pd.DataFrame:
    """Downcast numeric dtypes to save memory."""
    for c in df.columns:
        col = df[c]
        if pd.api.types.is_float_dtype(col) and prefer_float32:
            df[c] = pd.to_numeric(col, downcast="float")
        elif pd.api.types.is_integer_dtype(col):
            df[c] = pd.to_numeric(col, downcast="integer")
    return df


# =========================================================
# Residual creation (optional)
# =========================================================
def create_residuals_for_folder(
    in_folder: str,
    out_folder: str,
    demand_token: str = "Demand",
    measured_token: str = "Measured",
    residual_token: str = "Residual",
    skip_if_exists: bool = True,
    suffix: str = "_residual",
    logger=print,
) -> None:
    ensure_dir(out_folder)
    for file in os.listdir(in_folder):
        if not file.endswith(".csv"):
            continue

        in_path = os.path.join(in_folder, file)
        out_name = file.replace(".csv", f"{suffix}.csv")
        out_path = os.path.join(out_folder, out_name)

        if skip_if_exists and os.path.exists(out_path):
            logger(f"‚Ü©Ô∏è  Skip residual (exists): {out_name}")
            continue

        try:
            df = pd.read_csv(in_path)
        except Exception as e:
            logger(f"‚ùå Failed to read {file}: {e}")
            continue

        cols = df.columns.tolist()
        made_any = False
        for col in cols:
            if demand_token in col:
                measured_col = col.replace(demand_token, measured_token)
                if measured_col in df.columns:
                    residual_col = col.replace(demand_token, residual_token)
                    df[residual_col] = df[col] - df[measured_col]
                    made_any = True

        if not made_any:
            logger(f"‚ö†Ô∏è  No Demand/Measured pairs found in {file}.")
        df.to_csv(out_path, index=False)
        logger(f"‚úÖ Residual CSV saved: {os.path.basename(out_path)}")


# =========================================================
# Scaling + robust threshold (MAD)
# =========================================================
def scale_features(X: pd.DataFrame, use_float32: bool = True):
    """
    Standardize features once and share across models.
    Returns (scaler, X_scaled np.array, X_tensor torch.tensor)
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    if use_float32:
        X_scaled = X_scaled.astype("float32")
    X_tensor = torch.from_numpy(X_scaled)
    return scaler, X_scaled, X_tensor


def robust_threshold(
    values: np.ndarray,
    k: float = 3.5,
    tail: str = "high",
    min_anoms: int = 5,
) -> Tuple[float, np.ndarray]:
    """
    MAD-based threshold: median ¬± k * 1.4826 * MAD
    tail = 'high' (right tail) or 'low' (left tail)
    Returns: (threshold, labels) labels aligned to 'values' (1=anomaly)
    """
    v = np.asarray(values)
    mask = ~np.isnan(v)
    v = v[mask]
    if v.size == 0:
        return (np.inf if tail == "high" else -np.inf), np.zeros_like(values, dtype=int)

    med = np.median(v)
    mad = np.median(np.abs(v - med)) + 1e-12
    if tail == "high":
        thr = med + k * 1.4826 * mad
        labels = (values > thr).astype(int)
    else:
        thr = med - k * 1.4826 * mad
        labels = (values < thr).astype(int)

    # relax if too strict on large arrays
    if labels.sum() < min_anoms and v.size >= 100:
        for k_relax in (3.0, 2.5, 2.0):
            if tail == "high":
                thr = med + k_relax * 1.4826 * mad
                labels = (values > thr).astype(int)
            else:
                thr = med - k_relax * 1.4826 * mad
                labels = (values < thr).astype(int)
            if labels.sum() >= min_anoms:
                break

    return thr, labels


# =========================================================
# Feature Engineering
# =========================================================
def prepare_features(
    df: pd.DataFrame,
    residual_cols: List[str],
    window: int = 5,
    max_features: int = 500,
    logger=print,
) -> Tuple[pd.DataFrame, List[str], Dict[str, int]]:
    """
    Create or reuse features: residual, delta, rolling mean/std
    Returns: X, feature_cols, stats (reused vs generated)
    """
    already_done = any(f"{residual_cols[0]}_delta" in df.columns for _ in residual_cols)
    stats = {"reused": 0, "generated": 0}

    if already_done:
        feature_cols = [
            c for c in df.columns
            if any(k in c for k in ["Residual", "_delta", "_rolling_mean", "_rolling_std"])
        ]
        X = df[feature_cols].dropna().astype("float32")
        stats["reused"] = len(feature_cols)
        logger(f"üîÅ Reusing {len(feature_cols)} engineered features.")
        return X, feature_cols, stats

    # Generate
    for col in residual_cols:
        df[f"{col}_delta"] = df[col].diff().astype("float32")
        df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
        df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")

    feature_cols = []
    for col in residual_cols:
        feature_cols += [
            col,
            f"{col}_delta",
            f"{col}_rolling_mean_{window}",
            f"{col}_rolling_std_{window}",
        ]

    X = df[feature_cols].dropna().astype("float32")
    stats["generated"] = len(feature_cols)
    logger(f"üõ†Ô∏è  Generated {len(feature_cols)} features (window={window}).")

    if X.shape[1] > max_features:
        logger(f"‚ùå Too many features ({X.shape[1]} > {max_features}). Skipping file.")
        return pd.DataFrame(), [], stats

    return X, feature_cols, stats


# =========================================================
# Models
# =========================================================
class Autoencoder(nn.Module):
    def __init__(self, input_dim: int):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(input_dim, 32), nn.ReLU(), nn.Linear(32, 8))
        self.decoder = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, input_dim))

    def forward(self, x):
        return self.decoder(self.encoder(x))


def dense_autoencoder_detect(
    X_tensor: torch.Tensor, k: float, ae_epochs: int, ae_lr: float
) -> Tuple[np.ndarray, np.ndarray, float]:
    model = Autoencoder(X_tensor.shape[1])
    opt = optim.Adam(model.parameters(), lr=ae_lr)
    crit = nn.MSELoss()

    for _ in range(ae_epochs):
        opt.zero_grad()
        out = model(X_tensor)
        loss = crit(out, X_tensor)
        loss.backward()
        opt.step()

    with torch.no_grad():
        rec = model(X_tensor)
        errors = torch.mean((X_tensor - rec) ** 2, dim=1).cpu().numpy()

    thr, labels = robust_threshold(errors, k=k, tail="high")
    return labels.astype(int), errors, thr


class LSTMAutoencoder(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True)

    def forward(self, x):
        _, (h, _) = self.encoder(x)  # [1, B, H]
        repeated = h.repeat(x.size(1), 1, 1).transpose(0, 1)  # [B, T, H]
        decoded, _ = self.decoder(repeated)
        return decoded


def make_sequences(X: np.ndarray, seq_len: int) -> Tuple[np.ndarray, List[int]]:
    seqs, idxs = [], []
    for i in range(len(X) - seq_len):
        seqs.append(X[i:i+seq_len])
        idxs.append(i + seq_len - 1)
    return np.array(seqs), idxs


def lstm_autoencoder_detect(
    X_scaled: np.ndarray,
    k: float,
    seq_len: int,
    hidden_dim: int,
    patience: int,
    max_sequences: int,
    downsample: int,
) -> Tuple[np.ndarray, np.ndarray, List[int], float]:
    try:
        Xds = X_scaled[::downsample]
        if len(Xds) < seq_len:
            return np.array([]), np.array([]), [], np.nan

        Xseq, idxs = make_sequences(Xds, seq_len)
        if len(Xseq) > max_sequences:
            Xseq, idxs = Xseq[:max_sequences], idxs[:max_sequences]

        Xt = torch.tensor(Xseq, dtype=torch.float32)
        model = LSTMAutoencoder(Xt.shape[2], hidden_dim)
        opt = optim.Adam(model.parameters(), lr=1e-3)
        crit = nn.MSELoss()

        best, wait = float("inf"), 0
        for _ in range(100):
            model.train()
            opt.zero_grad()
            out = model(Xt)
            loss = crit(out, Xt)
            loss.backward()
            opt.step()
            if loss.item() < best:
                best, wait = loss.item(), 0
            else:
                wait += 1
                if wait >= patience:
                    break

        with torch.no_grad():
            model.eval()
            out = model(Xt)
            errors = torch.mean((Xt - out) ** 2, dim=(1, 2)).cpu().numpy()

        thr, labels = robust_threshold(errors, k=k, tail="high")
        return labels.astype(int), errors, idxs, thr
    except RuntimeError as e:
        print(f"‚ö†Ô∏è LSTM memory error: {e}")
        return np.array([]), np.array([]), [], np.nan


def isolation_forest_detect(X_scaled: np.ndarray, k: float) -> Tuple[np.ndarray, np.ndarray, float]:
    iso = IsolationForest(contamination="auto", n_estimators=300, random_state=42)
    iso.fit(X_scaled)
    scores = -iso.decision_function(X_scaled)  # higher = more anomalous
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


def lof_detect(X_scaled: np.ndarray, k: float, n_neighbors: int) -> Tuple[np.ndarray, np.ndarray, float]:
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination="auto")
    _ = lof.fit_predict(X_scaled)  # populates negative_outlier_factor_
    scores = -lof.negative_outlier_factor_
    thr, labels = robust_threshold(scores, k=k, tail="high")
    return labels.astype(int), scores, thr


# =========================================================
# Hybrid scoring utilities
# =========================================================
def _robust_z_pos(x: np.ndarray) -> np.ndarray:
    """Right-tail robust z-score (>=0 when above median)."""
    x = np.asarray(x, dtype=np.float32)
    med = np.nanmedian(x).astype(np.float32)
    mad = np.nanmedian(np.abs(x - med)).astype(np.float32) + 1e-12
    z = (x - med) / (1.4826 * mad)
    return np.maximum(z, 0.0).astype(np.float32)  # only right tail


def _percentile01(x: np.ndarray) -> np.ndarray:
    """Map to [0,1] by robust percentiles (2‚Äì98). Values outside clamp."""
    x = np.asarray(x, dtype=np.float32)
    lo = np.nanpercentile(x, 2).astype(np.float32)
    hi = np.nanpercentile(x, 98).astype(np.float32)
    rng = max(hi - lo, 1e-12)
    y = (x - lo) / rng
    return np.clip(y, 0.0, 1.0).astype(np.float32)


def compute_hybrid_score_on_mask(df: pd.DataFrame, cfg: dict, mask_idx) -> np.ndarray:
    """
    Compute hybrid only on valid rows (mask_idx). Returns array the size of df,
    NaN elsewhere. Requires >= min_components present.
    """
    out = np.full(len(df), np.nan, dtype=np.float32)
    if not cfg.get("hybrid", {}).get("enabled", False):
        return out

    # Weights guard
    wmap = cfg.get("hybrid", {}).get("weights")
    if not isinstance(wmap, dict) or not wmap:
        wmap = {"iso_score": 0.25, "lof_score": 0.25, "ae_error": 0.25, "lstm_error": 0.25}

    method = cfg["hybrid"].get("method", "robust_z")
    min_components = int(cfg["hybrid"].get("min_components", 2))

    use = df.loc[mask_idx]  # restrict to valid feature rows

    comps = [c for c in ["iso_score", "lof_score", "ae_error", "lstm_error"] if c in use.columns and c in wmap]
    if not comps:
        return out

    parts = []
    for c in comps:
        arr = use[c].to_numpy(dtype=np.float32)
        if method == "robust_z":
            norm = _robust_z_pos(arr)
            norm = np.clip(norm, 0, 10.0).astype(np.float32) / 10.0  # ~[0,1]
        else:
            norm = _percentile01(arr)
        parts.append((norm, float(wmap[c])))

    num = np.zeros(len(use), dtype=np.float32)
    den = np.zeros(len(use), dtype=np.float32)
    present = np.zeros(len(use), dtype=np.int32)

    for norm, w in parts:
        m = ~np.isnan(norm)
        num[m] += w * norm[m]
        den[m] += w
        present[m] += 1

    ok = (den > 0) & (present >= min_components)
    hybrid_local = np.full(len(use), np.nan, dtype=np.float32)
    hybrid_local[ok] = (num[ok] / den[ok]).astype(np.float32)

    out[np.asarray(mask_idx)] = hybrid_local
    return out


# =========================================================
# Voting, selection rules, episodes, explanations
# =========================================================
def generate_votes(df: pd.DataFrame) -> pd.DataFrame:
    df["agreement_all_4"] = (
        (df.get("ae_is_anomaly", 0) == 1)
        & (df.get("is_anomaly", 0) == 1)
        & (df.get("lof_is_anomaly", 0) == 1)
        & (df.get("lstm_is_anomaly", 0) == 1)
    ).astype(int)
    df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
    df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
    return df  # note: vote-any intentionally removed


def extract_selected_rows(df: pd.DataFrame, rule: str = "hybrid") -> pd.DataFrame:
    if rule == "hybrid":
        mask = df.get("hybrid_is_anomaly", 0) == 1
    elif rule == "vote_3plus":
        mask = df.get("vote_3plus", 0) == 1
    elif rule == "agreement_all_4":
        mask = df.get("agreement_all_4", 0) == 1
    elif rule == "hybrid_or_vote3p":
        mask = (df.get("hybrid_is_anomaly", 0) == 1) | (df.get("vote_3plus", 0) == 1)
    elif rule == "hybrid_and_vote3p":
        mask = (df.get("hybrid_is_anomaly", 0) == 1) & (df.get("vote_3plus", 0) == 1)
    else:
        raise ValueError(f"Unknown selection rule: {rule}")
    return df.loc[mask].copy()


def _group_runs(idxs: np.ndarray, min_gap: int = 1) -> List[Tuple[int, int]]:
    if len(idxs) == 0:
        return []
    runs, start, prev = [], int(idxs[0]), int(idxs[0])
    for i in idxs[1:]:
        if int(i) - prev <= min_gap:
            prev = int(i)
            continue
        runs.append((start, prev))
        start = int(i); prev = int(i)
    runs.append((start, prev))
    return runs


def summarize_episodes(selected_df: pd.DataFrame, min_gap: int = 1) -> pd.DataFrame:
    """
    Build episodes PER FILE from the selected rows (hybrid/vote rule).
    """
    if selected_df.empty:
        cols = ["source_file", "start_idx", "end_idx", "length", "n_models_mean"]
        return pd.DataFrame(columns=cols)

    rows = []
    groups = selected_df.groupby("source_file") if "source_file" in selected_df.columns else [("", selected_df)]

    for sf, g in groups:
        idxs = g.index.to_numpy()
        if idxs.size == 0:
            continue
        runs = _group_runs(idxs, min_gap=min_gap)
        for start, end in runs:
            chunk = g.loc[start:end]
            row = {
                "source_file": sf,
                "start_idx": int(start),
                "end_idx": int(end),
                "length": int(end - start + 1),
                "n_models_mean": float(
                    chunk[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1).mean()
                ),
            }
            for c in ["iso_score", "ae_error", "lof_score", "lstm_error", "hybrid_score"]:
                if c in chunk.columns:
                    row[f"{c}_max"] = float(chunk[c].max())
                    row[f"{c}_mean"] = float(chunk[c].mean())
            rows.append(row)

    return pd.DataFrame(rows)


def _base_residual_columns(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def _models_string(chunk: pd.DataFrame) -> str:
    model_cols = [c for c in ["is_anomaly", "ae_is_anomaly", "lof_is_anomaly", "lstm_is_anomaly", "hybrid_is_anomaly"] if c in chunk.columns]
    if not model_cols:
        return "no-model-flags"
    means = chunk[model_cols].mean()
    active = [m.replace("_is_anomaly", "").upper() for m, v in means.items() if v >= 0.5]
    return ", ".join(active) if active else "weak/isolated flags"


def attach_episode_reasons(
    combined_df: pd.DataFrame, episodes_df: pd.DataFrame, top_k: int = 1
) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df

    base_res = _base_residual_columns(combined_df)
    if not base_res:
        episodes_df["primary_signal"] = ""
        episodes_df["reason"] = "no residual columns present"
        episodes_df["suspected_sensor"] = ""
        return episodes_df

    out = []
    for _, epi in episodes_df.iterrows():
        start, end = int(epi["start_idx"]), int(epi["end_idx"])
        mask = combined_df["source_file"] == epi["source_file"] if "source_file" in combined_df.columns else slice(None)
        chunk = combined_df.loc[mask].loc[start:end]

        if chunk.empty:
            epi["primary_signal"] = ""
            epi["reason"] = "empty slice"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats = []
        for col in base_res:
            if col in chunk.columns:
                stats.append((col, float(chunk[col].abs().max())))
        if not stats:
            epi["primary_signal"] = ""
            epi["reason"] = "no residual stats"
            epi["suspected_sensor"] = ""
            out.append(epi)
            continue

        stats.sort(key=lambda x: x[1], reverse=True)
        primary_signal, primary_val = stats[:top_k][0]
        models_str = _models_string(chunk)
        measured_col = primary_signal.replace("Residual", "Measured")
        suspected = measured_col if (measured_col in combined_df.columns) else "unknown-measured-sensor"

        epi["primary_signal"] = primary_signal
        epi["reason"] = f"max |{primary_signal}| = {primary_val:.3f}; models: {models_str}"
        epi["suspected_sensor"] = suspected
        out.append(epi)

    return pd.DataFrame(out)


# =========================================================
# Hardware mapping + root cause scoring
# =========================================================
HARDWARE_MAP = [
    ("Force_",         "Actuator/LoadCell",  "Force didn‚Äôt follow demand ‚Üí friction/lag/saturation/load-cell drift likely"),
    ("Encoder_",       "Encoders/Alignment", "Pose/velocity mismatch ‚Üí quantization/missing counts/misalignment"),
    ("Accelerometer_", "IMU/Accelerometer",  "Vibration bursts ‚Üí mounting/looseness/thermal drift"),
    ("State_",         "Control/Timing",     "Requested vs achieved state diverged ‚Üí scheduler limits/controller windup"),
]

def map_signal_to_hardware(primary_signal: str):
    for needle, hw, why in HARDWARE_MAP:
        if needle in primary_signal:
            return hw, why
    return "Unknown", "No mapping rule matched"


def enrich_hardware_mapping(episodes_df: pd.DataFrame) -> pd.DataFrame:
    if episodes_df.empty:
        return episodes_df
    episodes_df = episodes_df.copy()
    episodes_df["hardware_class"] = ""
    episodes_df["hardware_why"] = ""
    for i, r in episodes_df.iterrows():
        hw, why = map_signal_to_hardware(r.get("primary_signal", ""))
        episodes_df.at[i, "hardware_class"] = hw
        episodes_df.at[i, "hardware_why"]   = why
    return episodes_df


def _paired_columns(primary_signal: str, cfg: dict) -> Tuple[Optional[str], Optional[str]]:
    resid_tok  = cfg["signals"]["residual_token"]
    demand_tok = cfg["signals"]["demand_token"]
    measured_tok = cfg["signals"]["measured_token"]
    if resid_tok not in primary_signal:
        return None, None
    demand_col   = primary_signal.replace(resid_tok, demand_tok)
    measured_col = primary_signal.replace(resid_tok, measured_tok)
    return demand_col, measured_col


def _nan_ok(arr: np.ndarray) -> np.ndarray:
    return np.asarray(arr, dtype=float)


def _cross_correlation_lag(x: np.ndarray, y: np.ndarray, sample_rate_hz: Optional[float]) -> Tuple[float, int]:
    x = _nan_ok(x); y = _nan_ok(y)
    if len(x) != len(y) or len(x) == 0:
        return (np.nan, 0)
    x = x - np.nanmean(x); y = y - np.nanmean(y)
    x = np.nan_to_num(x);  y = np.nan_to_num(y)
    corr = np.correlate(x, y, mode="full")
    lags = np.arange(-len(x)+1, len(x))
    k = int(np.argmax(corr))
    lag_samples = int(lags[k])
    lag_seconds = lag_samples / sample_rate_hz if sample_rate_hz and sample_rate_hz > 0 else np.nan
    return (lag_seconds, lag_samples)


def _saturation_score(demand: np.ndarray, residual: np.ndarray, cfg: dict) -> float:
    if len(demand) == 0 or len(residual) == 0:
        return 0.0
    p_dem = np.nanpercentile(demand, cfg["scores"]["saturation_pct"])
    p_res = np.nanpercentile(np.abs(residual), cfg["scores"]["resid_prominence_pct"])
    near_limit = demand >= p_dem
    large_res  = np.abs(residual) >= p_res
    both = np.logical_and(near_limit, large_res)
    return float(np.nansum(both)) / max(1, len(demand))


def _drift_score(residual: np.ndarray) -> float:
    residual = _nan_ok(residual)
    mu = float(np.nanmean(residual))
    sd = float(np.nanstd(residual)) + 1e-9
    return abs(mu) / sd


def _vibration_score(signal: np.ndarray, sample_rate_hz: Optional[float]) -> float:
    if not sample_rate_hz or sample_rate_hz <= 0 or len(signal) < 8:
        return np.nan
    sig = np.nan_to_num(signal - np.nanmean(signal))
    fft = np.fft.rfft(sig)
    power = np.abs(fft) ** 2
    freqs = np.fft.rfftfreq(len(sig), d=1.0 / sample_rate_hz)
    if len(freqs) == 0:
        return np.nan
    cutoff = 0.25 * (sample_rate_hz / 2.0)  # > Nyquist/4
    mask_hi = freqs >= cutoff
    num = float(np.nansum(power[mask_hi]))
    den = float(np.nansum(power) + 1e-12)
    return num / den


def score_episodes(combined_df: pd.DataFrame, episodes_df: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    """
    Adds: lag_seconds, lag_samples, saturation_score, drift_score, vibe_score
    """
    if episodes_df.empty:
        return episodes_df
    out = episodes_df.copy()
    sr = cfg["signals"]["sample_rate_hz"]
    min_len = cfg["scores"]["min_window_len"]

    if "primary_signal" not in out.columns:
        out["primary_signal"] = ""

    for i, r in out.iterrows():
        start, end = int(r["start_idx"]), int(r["end_idx"])
        if end - start + 1 < min_len:
            out.at[i, "lag_seconds"] = np.nan
            out.at[i, "lag_samples"] = 0
            out.at[i, "saturation_score"] = 0.0
            out.at[i, "drift_score"] = 0.0
            out.at[i, "vibe_score"] = np.nan
            continue

        if "source_file" in combined_df.columns and "source_file" in out.columns and "source_file" in r:
            chunk = combined_df.loc[(combined_df["source_file"] == r["source_file"])].loc[start:end]
        else:
            chunk = combined_df.loc[start:end]

        primary = r.get("primary_signal", "")
        demand_col, measured_col = _paired_columns(primary, cfg)

        resid = chunk[primary].values if (primary in chunk.columns) else np.array([])
        dem   = chunk[demand_col].values if (demand_col and demand_col in chunk.columns) else np.array([])
        meas  = chunk[measured_col].values if (measured_col and measured_col in chunk.columns) else np.array([])

        lag_s, lag_k = _cross_correlation_lag(dem, meas, sr) if (len(dem) and len(meas)) else (np.nan, 0)
        sat_sc = _saturation_score(dem, resid, cfg) if (len(dem) and len(resid)) else 0.0
        dr_sc  = _drift_score(resid) if len(resid) else 0.0
        if "Accelerometer_" in primary and primary in chunk.columns:
            vibe_sc = _vibration_score(chunk[primary].values, sr)
        else:
            vibe_sc = _vibration_score(resid, sr)

        out.at[i, "lag_seconds"]       = lag_s
        out.at[i, "lag_samples"]       = int(lag_k)
        out.at[i, "saturation_score"]  = float(sat_sc)
        out.at[i, "drift_score"]       = float(dr_sc)
        out.at[i, "vibe_score"]        = float(vibe_sc) if vibe_sc == vibe_sc else np.nan
    return out


# =========================================================
# Plotting helpers (per-file overlays for selected rows)
# =========================================================
def _pick_residual(df: pd.DataFrame) -> Optional[str]:
    cand = [c for c in df.columns if "Residual" in c and not any(t in c for t in ["_delta", "_rolling_"])]
    return cand[0] if cand else None


def _slice_by_global_index(sub: pd.DataFrame, start: int, end: int, pad: int = 100) -> pd.DataFrame:
    """
    Robust slice of `sub` (filtered to one file) using global indices [start, end],
    expanded by `pad` points on both sides. Works even when index has gaps.
    """
    if sub.empty:
        return sub
    idx = sub.index.to_numpy()
    i0 = np.searchsorted(idx, start, side="left")
    i1 = np.searchsorted(idx, end,   side="right")
    i0 = max(i0 - pad, 0)
    i1 = min(i1 + pad, len(idx))
    return sub.iloc[i0:i1]


def plot_selected_for_file(
    df_file: pd.DataFrame,
    out_dir: str,
    rule: str,
    min_gap: int,
    figsize: Tuple[int, int] = (12, 5),
) -> Optional[str]:
    ensure_dir(out_dir)
    residual_col = _pick_residual(df_file)
    if residual_col is None:
        print("‚ö†Ô∏è No residual column to plot.")
        return None

    selected_rows = extract_selected_rows(df_file, rule=rule)
    episodes = summarize_episodes(selected_rows, min_gap=min_gap)

    plt.figure(figsize=figsize)
    plt.plot(df_file.index, df_file[residual_col], label=residual_col, alpha=0.85)

    if not selected_rows.empty:
        plt.scatter(selected_rows.index, selected_rows[residual_col], s=12, label=f"Selected anomalies ({rule})")

    if not episodes.empty:
        for _, r in episodes.iterrows():
            plt.axvspan(r["start_idx"], r["end_idx"], alpha=0.15, label="Episode")
        handles, labels = plt.gca().get_legend_handles_labels()
        uniq, seen = [], set()
        for h, l in zip(handles, labels):
            if l not in seen:
                uniq.append((h, l)); seen.add(l)
        handles, labels = zip(*uniq)
        plt.legend(handles, labels)
    else:
        plt.legend()

    sf = df_file["source_file"].iloc[0] if "source_file" in df_file.columns else "file"
    plt.title(f"{sf} ‚Äî Residual with selected anomalies & episodes")
    plt.xlabel("Index"); plt.ylabel(residual_col)
    plt.tight_layout()
    out_path = os.path.join(out_dir, f"selected_plot_{safe_name(sf)}.png")
    plt.savefig(out_path, dpi=160); plt.close()
    return out_path


def plot_all_files(combined_df: pd.DataFrame, out_dir: str, rule: str, min_gap: int, max_files: Optional[int] = None):
    paths = []
    if "source_file" not in combined_df.columns:
        print("‚ö†Ô∏è combined_df missing 'source_file'.")
        return paths
    groups = list(combined_df.groupby("source_file"))
    if max_files is not None:
        groups = groups[:max_files]
    for fname, df_file in groups:
        p = plot_selected_for_file(df_file, out_dir=out_dir, rule=rule, min_gap=min_gap)
        if p:
            paths.append(p); print(f"üñºÔ∏è Saved: {p}")
    if not paths:
        print("‚ö†Ô∏è No plots produced.")
    return paths


# =========================================================
# Sensor attribution, clustering & heatmap
# =========================================================
def _residual_cols_base(df: pd.DataFrame) -> List[str]:
    return [
        c for c in df.columns
        if ("Residual" in c) and not any(tag in c for tag in ["_delta", "_rolling_mean", "_rolling_std"])
    ]


def build_sensor_table(
    combined: pd.DataFrame,
    selected_rows: pd.DataFrame,
    episodes_with_reasons: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    base_res = _residual_cols_base(combined)
    if not base_res:
        return pd.DataFrame()

    total_rows = len(combined)
    sel_mask = pd.Series(False, index=combined.index)
    if not selected_rows.empty:
        sel_mask.loc[selected_rows.index] = True

    rows = []
    expected_keys = [
        "anomaly_rate_is",
        "anomaly_rate_ae",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "anomaly_rate_hybrid",
        "anomaly_rate_vote3p",
        "mean_abs_resid_voted",
        "max_abs_resid_voted",
        "episodes_as_primary",
    ]

    for col in base_res:
        stats = {"sensor": col}

        stats["anomaly_rate_is"]   = float(combined["is_anomaly"].sum())   / max(total_rows, 1) if "is_anomaly"   in combined.columns else 0.0
        stats["anomaly_rate_ae"]   = float(combined["ae_is_anomaly"].sum())/ max(total_rows, 1) if "ae_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lof"]  = float(combined["lof_is_anomaly"].sum())/max(total_rows, 1) if "lof_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_lstm"] = float(combined["lstm_is_anomaly"].fillna(0).sum())/max(total_rows, 1) if "lstm_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_hybrid"] = float(combined["hybrid_is_anomaly"].sum())/max(total_rows, 1) if "hybrid_is_anomaly" in combined.columns else 0.0
        stats["anomaly_rate_vote3p"] = float(combined["vote_3plus"].sum())/max(total_rows, 1) if "vote_3plus" in combined.columns else 0.0

        if col in combined.columns and sel_mask.any():
            vals = combined.loc[sel_mask, col].abs()
            stats["mean_abs_resid_voted"] = float(vals.mean()) if not vals.empty else 0.0  # (name kept for compatibility)
            stats["max_abs_resid_voted"]  = float(vals.max())  if not vals.empty else 0.0
        else:
            stats["mean_abs_resid_voted"] = 0.0
            stats["max_abs_resid_voted"]  = 0.0

        if episodes_with_reasons is not None and not episodes_with_reasons.empty and "primary_signal" in episodes_with_reasons.columns:
            stats["episodes_as_primary"] = int((episodes_with_reasons["primary_signal"] == col).sum())
        else:
            stats["episodes_as_primary"] = 0

        for k in expected_keys:
            stats.setdefault(k, 0.0)

        rows.append(stats)

    sensor_df = pd.DataFrame(rows)
    for c in sensor_df.columns:
        if c != "sensor":
            sensor_df[c] = sensor_df[c].fillna(0.0)
    return sensor_df


def cluster_sensors(
    sensor_df: pd.DataFrame,
    n_clusters: int = 3,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    if sensor_df.empty or "sensor" not in sensor_df.columns:
        return sensor_df, np.empty((0, 2)), np.empty((0, 2))

    features = sensor_df.drop(columns=["sensor"]).to_numpy(dtype=np.float32)
    if features.shape[0] < n_clusters:
        n_clusters = max(1, features.shape[0])

    scaler = StandardScaler()
    Z = scaler.fit_transform(features)

    km = KMeans(n_clusters=n_clusters, n_init="auto", random_state=random_state)
    labels = km.fit_predict(Z)

    pca = PCA(n_components=2, random_state=random_state)
    Z2 = pca.fit_transform(Z)
    centers2 = pca.transform(km.cluster_centers_)

    out = sensor_df.copy()
    out["cluster"] = labels

    return out, Z2, centers2


def plot_sensor_bar_top(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metric: str = "episodes_as_primary",
    top_n: int = 15,
    title: Optional[str] = None,
) -> Optional[str]:
    if sensor_df.empty or metric not in sensor_df.columns:
        return None

    ensure_dir(out_dir)
    df = sensor_df.sort_values(metric, ascending=False).head(top_n)

    plt.figure(figsize=(12, 6))
    plt.bar(range(len(df)), df[metric])
    plt.xticks(range(len(df)), [s.replace("Force_", "F_") for s in df["sensor"]], rotation=60, ha="right")
    plt.ylabel(metric)
    plt.title(title or f"Top {top_n} sensors by {metric}")
    plt.tight_layout()

    path = os.path.join(out_dir, f"top_sensors_{metric}.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_clusters_scatter(
    sensor_df_with_cluster: pd.DataFrame,
    Z2: np.ndarray,
    centers2: np.ndarray,
    out_dir: str,
    title: str = "Sensor clusters (PCA of features)",
) -> Optional[str]:
    if sensor_df_with_cluster.empty or Z2.size == 0:
        return None

    ensure_dir(out_dir)
    plt.figure(figsize=(9, 7))

    clusters = sorted(sensor_df_with_cluster["cluster"].unique().tolist())
    for cl in clusters:
        mask = sensor_df_with_cluster["cluster"] == cl
        pts = Z2[mask.values]
        plt.scatter(pts[:, 0], pts[:, 1], label=f"cluster {cl}", alpha=0.8, s=36)

    if centers2.size:
        plt.scatter(centers2[:, 0], centers2[:, 1], marker="X", s=120, label="centers")

    try:
        top_lab = sensor_df_with_cluster.sort_values("episodes_as_primary", ascending=False).head(10).index
        for idx in top_lab:
            plt.text(Z2[idx, 0], Z2[idx, 1], sensor_df_with_cluster.loc[idx, "sensor"], fontsize=8)
    except Exception:
        pass

    plt.title(title)
    plt.xlabel("PCA-1"); plt.ylabel("PCA-2")
    plt.legend()
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_clusters_pca.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


def plot_sensor_heatmap(
    sensor_df: pd.DataFrame,
    out_dir: str,
    metrics: Optional[List[str]] = None,
    title: str = "Sensor anomaly fingerprint (rates & magnitudes)",
) -> Optional[str]:
    if sensor_df.empty:
        return None
    ensure_dir(out_dir)

    desired = [
        "anomaly_rate_vote3p",
        "anomaly_rate_hybrid",
        "anomaly_rate_ae",
        "anomaly_rate_is",
        "anomaly_rate_lof",
        "anomaly_rate_lstm",
        "mean_abs_resid_voted",
        "max_abs_resid_voted",
    ]
    if metrics is None:
        metrics = desired

    available = [m for m in metrics if m in sensor_df.columns]
    if not available:
        print("‚ö†Ô∏è No requested heatmap metrics are present in sensor_df. Skipping heatmap.")
        return None
    if len(available) < len(metrics):
        missing = [m for m in metrics if m not in sensor_df.columns]
        print(f"‚ÑπÔ∏è Skipping missing metrics in heatmap: {missing}")
    metrics = available

    key_rank = "episodes_as_primary" if "episodes_as_primary" in sensor_df.columns else metrics[0]
    keep = sensor_df.sort_values(key_rank, ascending=False).head(25)

    M = keep[metrics].to_numpy(dtype=np.float32)
    plt.figure(figsize=(12, 8))
    plt.imshow(M, aspect="auto")
    plt.colorbar()
    plt.yticks(range(len(keep)), keep["sensor"])
    plt.xticks(range(len(metrics)), metrics, rotation=45, ha="right")
    plt.title(title)
    plt.tight_layout()

    path = os.path.join(out_dir, "sensor_fingerprint_heatmap.png")
    plt.savefig(path, dpi=160); plt.close()
    return path


# =========================================================
# Report (adds hybrid histogram page)
# =========================================================
def _overlay_episode_plot(df: pd.DataFrame, episode_row: pd.Series, cfg: dict, ax=None):
    start, end = int(episode_row["start_idx"]), int(episode_row["end_idx"])
    primary = episode_row.get("primary_signal", "")
    demand_col, measured_col = _paired_columns(primary, cfg)
    pad = int(cfg.get("report", {}).get("pad_points", 100))

    if ax is None:
        ax = plt.gca()

    # Filter to the right file first
    if "source_file" in df.columns and "source_file" in episode_row:
        df = df.loc[df["source_file"] == episode_row["source_file"]]

    window = _slice_by_global_index(df, start, end, pad=pad)
    if window.empty:
        ax.set_title(f"Episode {start}‚Äì{end} (EMPTY SLICE)")
        return

    t = window.index.to_numpy()

    if primary in window.columns:
        ax.plot(t, window[primary].values, label=f"{primary}", alpha=0.9)
    if demand_col and demand_col in window.columns:
        ax.plot(t, window[demand_col].values, label=f"{demand_col}", alpha=0.8)
    if measured_col and measured_col in window.columns:
        ax.plot(t, window[measured_col].values, label=f"{measured_col}", alpha=0.8)

    ax.axvspan(start, end, alpha=0.15, label="episode window")
    ax.set_xlabel("Index")
    ax.set_title(f"Episode {start}‚Äì{end}\nprimary={primary}")
    ax.legend(loc="best")


def build_ops_report(
    combined: pd.DataFrame,
    summary: pd.DataFrame,
    sensor_df: pd.DataFrame,
    episodes_scored: pd.DataFrame,
    cfg: dict,
    out_pdf_path: str
):
    ensure_dir(os.path.dirname(out_pdf_path))
    with PdfPages(out_pdf_path) as pdf:

        # Page 1 ‚Äî Anomaly counts by model/file
        plt.figure(figsize=(11, 6))
        plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"] if c in summary.columns]
        summary[plot_cols].plot(kind="bar")
        plt.title("Anomalies per Model per File")
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        pdf.savefig(); plt.close()

        # Page 2 ‚Äî Anomaly RATE (%) per model/file
        sizes = combined.groupby("source_file").size().rename("n_rows")
        summary_rates = summary.div(sizes, axis=0) * 100.0
        plt.figure(figsize=(11, 6))
        rate_cols = [c for c in plot_cols if c in summary_rates.columns]
        summary_rates[rate_cols].plot(kind="bar")
        plt.title("Anomaly RATE per Model per File (%)")
        plt.ylabel("Percent of rows (%)")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        pdf.savefig(); plt.close()

        # Page 3 ‚Äî Hybrid score histogram with threshold
        hs = combined.get("hybrid_score", pd.Series([], dtype="float32")).dropna().to_numpy(dtype=np.float32)
        if hs.size > 0:
            thr_vals = combined.get("hybrid_thr", pd.Series([], dtype="float32")).dropna().to_numpy(dtype=np.float32)
            thr = float(np.nanmedian(thr_vals)) if thr_vals.size > 0 else float(np.nanpercentile(hs, 99))
            plt.figure(figsize=(11, 6))
            plt.hist(hs, bins=60)
            plt.axvline(thr, linestyle="--", label=f"hybrid_thr‚âà{thr:.3f}")
            plt.title("Hybrid score distribution")
            plt.xlabel("hybrid_score"); plt.ylabel("count")
            plt.legend(); plt.tight_layout()
            pdf.savefig(); plt.close()

        # Page 4 ‚Äî Top sensors by episodes_as_primary
        p1 = plot_sensor_bar_top(sensor_df, out_dir=cfg["io"]["output_folder"], metric="episodes_as_primary", top_n=15,
                                 title="Top sensors by episodes_as_primary")
        if p1 and os.path.exists(p1):
            img = plt.imread(p1)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Page 5 ‚Äî Sensor heatmap (if created)
        p2 = plot_sensor_heatmap(sensor_df, out_dir=cfg["io"]["output_folder"])
        if p2 and os.path.exists(p2):
            img = plt.imread(p2)
            plt.figure(figsize=(11, 6)); plt.imshow(img); plt.axis("off")
            pdf.savefig(); plt.close()

        # Pages ‚Äî Episode overlays (per-file selection with caps)
        if not episodes_scored.empty:
            candidates = episodes_scored.copy()
            sort_keys = [c for c in ["n_models_mean", "hybrid_score_mean", "iso_score_mean", "ae_error_mean"] if c in candidates.columns]
            if sort_keys:
                candidates = candidates.sort_values(sort_keys, ascending=False)

            n_per_file = int(cfg.get("report", {}).get("top_n_per_file", 2))
            max_pages  = int(cfg.get("report", {}).get("max_pages", 12))

            pages = 0
            for sf, grp in candidates.groupby("source_file"):
                for _, epi in grp.head(n_per_file).iterrows():
                    if pages >= max_pages:
                        break
                    plt.figure(figsize=(11, 5))
                    _overlay_episode_plot(combined, epi, cfg, ax=plt.gca())
                    hw = epi.get("hardware_class", "Unknown")
                    why = epi.get("hardware_why", "")
                    lag_s = epi.get("lag_seconds", np.nan)
                    sat   = epi.get("saturation_score", np.nan)
                    drift = epi.get("drift_score", np.nan)
                    vibe  = epi.get("vibe_score", np.nan)
                    txt = (
                        f"hardware: {hw}\n"
                        f"why: {why}\n"
                        f"lag_seconds: {lag_s:.4f}  |  saturation: {sat:.3f}  |  drift: {drift:.3f}  |  vibe: {vibe:.3f}"
                    )
                    plt.gcf().text(0.02, 0.02, txt, ha="left", va="bottom", fontsize=9)
                    plt.tight_layout()
                    pdf.savefig(); plt.close()
                    pages += 1
                if pages >= max_pages:
                    break


# =========================================================
# Config (defaults or JSON)
# =========================================================
def default_config() -> dict:
    return {
        "io": {
            "input_folder": "./Datasets/Datasets",
            "residual_folder": "./Anomaly_detection/residual_created/",
            "output_folder": "./Anomaly_detection/code/outputs/"
        },
        "residuals": {
            "enabled": True,
            "demand_token": "Demand",
            "measured_token": "Measured",
            "residual_token": "Residual",
            "suffix": "_residual"
        },
        "features": {
            "window": 5,
            "max_features": 500
        },
        "threshold": {  # per-model MAD k
            "k": 3.5
        },
        "ae": {
            "epochs": 50,
            "lr": 0.001
        },
        "lstm": {
            "seq_len": 5,
            "hidden_dim": 64,
            "patience": 5,
            "max_sequences": 2000,
            "downsample": 10
        },
        "lof": {
            "n_neighbors": 20
        },
        "hybrid": {                     # Hybrid scoring config
            "enabled": True,
            "method": "robust_z",      # "robust_z" | "percentile"
            "min_components": 2,       # require at least N model scores present
            "weights": {               # relative importance
                "iso_score": 0.20,
                "lof_score": 0.20,
                "ae_error": 0.30,
                "lstm_error": 0.30
            }
        },
        "hybrid_threshold": {          # How to threshold hybrid_score
            "mode": "quantile",        # "mad" or "quantile"
            "k": 3.5,                  # used only if mode="mad"
            "quantile": 0.99           # top 1% as anomalies (fallback if MAD degenerates)
        },
        "selection": {
            "rule": "hybrid"           # "hybrid" | "vote_3plus" | "agreement_all_4" | "hybrid_or_vote3p" | "hybrid_and_vote3p"
        },
        "voting": {
            "rule": "vote_3plus",      # kept for diagnostics
            "min_gap": 1
        },
        "plots": {
            "enabled": True,
            "max_files": None,
            "emit_rate_plot": True
        },
        "runtime": {
            "use_float32": True,
            "downcast_dataframe": True
        },
        "limits": {                    # memory guards (rows in full CSV)
            "max_rows_lof": 80000,
            "max_rows_lstm": 120000,
            "max_rows_ae": 600000
        },
        "signals": {
            "sample_rate_hz": 100.0,     # set None if unknown
            "residual_token": "Residual",
            "demand_token": "Demand",
            "measured_token": "Measured"
        },
        "scores": {
            "saturation_pct": 95.0,
            "resid_prominence_pct": 95.0,
            "min_window_len": 5
        },
        "report": {
            "enabled": True,
            "top_n_episodes": 3,     # legacy; not used directly
            "top_n_per_file": 2,     # how many episodes per file to show
            "max_pages": 12,         # cap to avoid huge PDFs
            "pad_points": 100        # context on each side of an episode in plots
        }
    }


def load_config_from_path_or_default(path: Optional[str]) -> dict:
    if path and os.path.exists(path):
        with open(path, "r") as f:
            return json.load(f)
    print("‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.")
    return default_config()


# =========================================================
# Per-file processing & Pipeline
# =========================================================
def process_file(file_path: str, cfg: Dict, logger=print) -> Optional[pd.DataFrame]:
    df = pd.read_csv(file_path)
    if cfg.get("runtime", {}).get("downcast_dataframe", True):
        downcast_df_inplace(df, prefer_float32=True)

    file_name = os.path.basename(file_path).replace(".csv", "")

    residual_cols = [c for c in df.columns if "Residual" in c]
    if not residual_cols:
        logger(f"‚ùå Skipped {file_name}: No residuals found.")
        return None

    X, feature_cols, fe_stats = prepare_features(
        df, residual_cols,
        window=cfg["features"]["window"],
        max_features=cfg["features"]["max_features"],
        logger=logger,
    )
    if X is None or len(feature_cols) == 0 or X.empty:
        logger(f"‚ùå Skipped {file_name}: invalid or empty features")
        return None

    _, X_scaled, X_tensor = scale_features(X, use_float32=cfg["runtime"]["use_float32"])

    # Memory-aware limits
    n_rows = len(df)
    limits = cfg.get("limits", {})
    skip_lof  = n_rows > int(limits.get("max_rows_lof", 8e4))
    skip_lstm = n_rows > int(limits.get("max_rows_lstm", 1.2e5))
    skip_ae   = n_rows > int(limits.get("max_rows_ae", 6e5))

    # Isolation Forest
    iso_labels, iso_scores, iso_thr = isolation_forest_detect(X_scaled, k=cfg["threshold"]["k"])
    df.loc[X.index, "is_anomaly"] = iso_labels
    df.loc[X.index, "iso_score"] = iso_scores
    df.loc[X.index, "iso_thr"] = iso_thr

    # Dense AE (skip if huge)
    if not skip_ae:
        try:
            ae_labels, ae_errors, ae_thr = dense_autoencoder_detect(
                X_tensor, k=cfg["threshold"]["k"], ae_epochs=cfg["ae"]["epochs"], ae_lr=cfg["ae"]["lr"]
            )
            df.loc[X.index, "ae_is_anomaly"] = ae_labels
            df.loc[X.index, "ae_error"] = ae_errors
            df.loc[X.index, "ae_thr"] = ae_thr
        except MemoryError:
            print("‚ö†Ô∏è AE skipped due to memory.")
            df.loc[X.index, "ae_is_anomaly"] = 0
            df.loc[X.index, "ae_error"] = np.nan
            df.loc[X.index, "ae_thr"] = np.nan
    else:
        print(f"‚ÑπÔ∏è AE skipped (n_rows={n_rows} > limit).")
        df.loc[X.index, "ae_is_anomaly"] = 0
        df.loc[X.index, "ae_error"] = np.nan
        df.loc[X.index, "ae_thr"] = np.nan

    # LOF (skip if huge)
    if not skip_lof:
        try:
            lof_labels, lof_scores, lof_thr = lof_detect(X_scaled, k=cfg["threshold"]["k"], n_neighbors=cfg["lof"]["n_neighbors"])
            df.loc[X.index, "lof_is_anomaly"] = lof_labels
            df.loc[X.index, "lof_score"] = lof_scores
            df.loc[X.index, "lof_thr"] = lof_thr
        except MemoryError:
            print("‚ö†Ô∏è LOF skipped due to memory.")
            df.loc[X.index, "lof_is_anomaly"] = 0
            df.loc[X.index, "lof_score"] = np.nan
            df.loc[X.index, "lof_thr"] = np.nan
    else:
        print(f"‚ÑπÔ∏è LOF skipped (n_rows={n_rows} > limit).")
        df.loc[X.index, "lof_is_anomaly"] = 0
        df.loc[X.index, "lof_score"] = np.nan
        df.loc[X.index, "lof_thr"] = np.nan

    # LSTM AE (skip if huge)
    if not skip_lstm:
        try:
            lstm_labels, lstm_errors, lstm_idx, lstm_thr = lstm_autoencoder_detect(
                X_scaled,
                k=cfg["threshold"]["k"],
                seq_len=cfg["lstm"]["seq_len"],
                hidden_dim=cfg["lstm"]["hidden_dim"],
                patience=cfg["lstm"]["patience"],
                max_sequences=cfg["lstm"]["max_sequences"],
                downsample=cfg["lstm"]["downsample"],
            )
            if len(lstm_idx) > 0:
                df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
                df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
                df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
            else:
                df["lstm_is_anomaly"] = 0
                df["lstm_error"] = np.nan
                df["lstm_thr"] = np.nan
        except MemoryError:
            print("‚ö†Ô∏è LSTM-AE skipped due to memory.")
            df["lstm_is_anomaly"] = 0
            df["lstm_error"] = np.nan
            df["lstm_thr"] = np.nan
    else:
        print(f"‚ÑπÔ∏è LSTM-AE skipped (n_rows={n_rows} > limit).")
        df["lstm_is_anomaly"] = 0
        df["lstm_error"] = np.nan
        df["lstm_thr"] = np.nan

    # --- Hybrid score (weighted fusion on valid rows)
    mask_idx = X.index
    df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx).astype("float32")

    hs = df.loc[mask_idx, "hybrid_score"].to_numpy(dtype=np.float32)
    if np.isnan(hs).all():
        df.loc[mask_idx, "hybrid_is_anomaly"] = 0
        df.loc[mask_idx, "hybrid_thr"] = np.nan
    else:
        mode = cfg.get("hybrid_threshold", {}).get("mode", "mad")
        if mode == "quantile":
            q = float(cfg["hybrid_threshold"].get("quantile", 0.98))
            thr = np.nanpercentile(hs, 100 * q)
            labels = (hs > thr).astype(int)
        else:
            thr, labels = robust_threshold(hs, k=cfg["hybrid_threshold"].get("k", 3.5), tail="high")
            # Fallback if too many positives (MAD degenerate)
            if np.nanmean(labels) > 0.5:
                q = float(cfg["hybrid_threshold"].get("quantile", 0.98))
                thr = np.nanpercentile(hs, 100 * q)
                labels = (hs > thr).astype(int)
        df.loc[mask_idx, "hybrid_is_anomaly"] = labels
        df.loc[mask_idx, "hybrid_thr"] = thr

    # Add votes (for diagnostics)
    df = generate_votes(df)

    df["source_file"] = file_name
    df["fe_reused"] = fe_stats.get("reused", 0)
    df["fe_generated"] = fe_stats.get("generated", 0)

    # free big buffers early
    del X_scaled, X_tensor
    gc.collect()

    logger(
        f"[{file_name}] iso={int(df['is_anomaly'].sum())} | "
        f"ae={int(df['ae_is_anomaly'].sum())} | "
        f"lof={int(df['lof_is_anomaly'].sum())} | "
        f"lstm={int(df['lstm_is_anomaly'].fillna(0).sum())} | "
        f"hyb={int(df['hybrid_is_anomaly'].sum())} | "
        f"vote3+={int(df['vote_3plus'].sum())}"
    )
    return df


def run_pipeline(cfg: Dict):
    logger = print

    # A) residuals (optional)
    if cfg["residuals"]["enabled"]:
        logger("üîß Creating residuals...")
        create_residuals_for_folder(
            in_folder=cfg["io"]["input_folder"],
            out_folder=cfg["io"]["residual_folder"],
            demand_token=cfg["residuals"]["demand_token"],
            measured_token=cfg["residuals"]["measured_token"],
            residual_token=cfg["residuals"]["residual_token"],
            skip_if_exists=True,
            suffix=cfg["residuals"]["suffix"],
            logger=logger,
        )
        data_folder = cfg["io"]["residual_folder"]
    else:
        data_folder = cfg["io"]["input_folder"]

    # B) per-file
    all_dfs = []
    for file in os.listdir(data_folder):
        if file.endswith(".csv"):
            out = process_file(os.path.join(data_folder, file), cfg, logger=logger)
            if out is not None:
                all_dfs.append(out)

    if not all_dfs:
        logger("‚ùå No files processed.")
        return

    combined = pd.concat(all_dfs, ignore_index=True)
    ensure_dir(cfg["io"]["output_folder"])

    combined_path = os.path.join(cfg["io"]["output_folder"], "combined_anomaly_results.csv")
    combined.to_csv(combined_path, index=False)

    # Summary (counts)
    cols = ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"]
    cols = [c for c in cols if c in combined.columns]
    summary = combined.groupby("source_file")[cols].sum()
    summary["total_anomalies"] = summary.sum(axis=1)
    summary_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_summary.csv")
    summary.to_csv(summary_path)

    logger(f"‚úÖ Saved row-level: {combined_path}")
    logger(f"‚úÖ Saved summary:   {summary_path}")

    # C) Counts plot
    plt.figure(figsize=(12, 6))
    plot_cols = [c for c in ["is_anomaly","ae_is_anomaly","lof_is_anomaly","lstm_is_anomaly","hybrid_is_anomaly","vote_3plus"] if c in summary.columns]
    summary[plot_cols].plot(kind="bar", figsize=(12, 6))
    plt.title("Anomalies per Model per File")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    bar_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_plot.png")
    plt.savefig(bar_path); plt.close()
    logger(f"üñºÔ∏è Saved: {bar_path}")

    # C2) Rate plot (% rows)
    if cfg.get("plots", {}).get("emit_rate_plot", True):
        sizes = combined.groupby("source_file").size().rename("n_rows")
        summary_rates = summary.div(sizes, axis=0) * 100.0
        plt.figure(figsize=(12, 6))
        rate_cols = [c for c in plot_cols if c in summary_rates.columns]
        summary_rates[rate_cols].plot(kind="bar", figsize=(12, 6))
        plt.title("Anomaly RATE per Model per File (%)")
        plt.ylabel("Percent of rows (%)")
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        rate_path = os.path.join(cfg["io"]["output_folder"], "model_comparison_rate_plot.png")
        plt.savefig(rate_path); plt.close()
        logger(f"üñºÔ∏è Saved: {rate_path}")

    # D) Selection + episodes + reasons (HYBRID drives by default)
    selection_rule = cfg.get("selection", {}).get("rule", "hybrid")
    selected_rows = extract_selected_rows(combined, rule=selection_rule)
    out_sel_dir = os.path.join(cfg["io"]["output_folder"], "selected_outputs")
    ensure_dir(out_sel_dir)
    selected_rows_path = os.path.join(out_sel_dir, f"selected_anomalies_rows_{selection_rule}.csv")
    selected_rows.to_csv(selected_rows_path, index=False)

    episodes = summarize_episodes(selected_rows, min_gap=cfg["voting"]["min_gap"])
    episodes_path = os.path.join(out_sel_dir, f"selected_anomaly_episodes_{selection_rule}.csv")
    episodes.to_csv(episodes_path, index=False)

    episodes_with_reasons = attach_episode_reasons(combined, episodes, top_k=1)
    episodes_with_reasons = enrich_hardware_mapping(episodes_with_reasons)
    episodes_scored = score_episodes(combined, episodes_with_reasons, cfg)

    episodes_reason_path = os.path.join(out_sel_dir, f"selected_anomaly_episodes_with_reasons_{selection_rule}.csv")
    episodes_scored_path = os.path.join(out_sel_dir, f"selected_anomaly_episodes_with_reasons_and_scores_{selection_rule}.csv")
    episodes_with_reasons.to_csv(episodes_reason_path, index=False)
    episodes_scored.to_csv(episodes_scored_path, index=False)
    logger(f"‚úÖ Saved episodes+reason: {episodes_reason_path}")
    logger(f"‚úÖ Saved episodes+scores: {episodes_scored_path}")

    print("\nEPISODES PER FILE (after selection & scoring):")
    if not episodes_scored.empty:
        print(episodes_scored.groupby("source_file").size().to_string())
    else:
        print("No episodes found under current selection rule.")

    # E) Per-file plots with selected overlays (optional)
    if cfg["plots"]["enabled"]:
        _ = plot_all_files(
            combined_df=combined,
            out_dir=out_sel_dir,
            rule=selection_rule,
            min_gap=cfg["voting"]["min_gap"],
            max_files=cfg["plots"]["max_files"],
        )

    # F) Sensor table + clustering visuals (use selected rows)
    sensor_df = build_sensor_table(combined, selected_rows, episodes_with_reasons=episodes_with_reasons)
    sensor_df_path = os.path.join(out_sel_dir, "sensor_table.csv")
    sensor_df.to_csv(sensor_df_path, index=False)
    logger(f"‚úÖ Saved sensor table: {sensor_df_path}")

    clustered, Z2, centers2 = cluster_sensors(sensor_df, n_clusters=3, random_state=42)
    _ = plot_sensor_clusters_scatter(clustered, Z2, centers2, out_dir=out_sel_dir)
    _ = plot_sensor_heatmap(sensor_df, out_dir=out_sel_dir)
    _ = plot_sensor_bar_top(sensor_df, out_dir=out_sel_dir, metric="episodes_as_primary", top_n=15)

    # G) PDF report
    if cfg.get("report", {}).get("enabled", True):
        pdf_path = os.path.join(cfg["io"]["output_folder"], "ops_report.pdf")
        build_ops_report(
            combined=combined,
            summary=summary,
            sensor_df=sensor_df,
            episodes_scored=episodes_scored,
            cfg=cfg,
            out_pdf_path=pdf_path
        )
        logger(f"üìÑ Ops report saved: {pdf_path}")


# =========================================================
# Entrypoint
# =========================================================
def main():
    parser = argparse.ArgumentParser(description="Anomaly Detection Product")
    parser.add_argument("--config", type=str, default=None, help="Path to config JSON")
    args, _ = parser.parse_known_args()  # allows notebook execution

    cfg = load_config_from_path_or_default(args.config)
    run_pipeline(cfg)


if __name__ == "__main__":
    main()


‚ÑπÔ∏è  No --config provided or not found. Using in-memory default config.
üîß Creating residuals...
‚Ü©Ô∏è  Skip residual (exists): Dataset01_Ski_CrossbeamYawNotPerforming_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset02_Matrix_Rocker4EncoderNotWorking_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset03_Wushu_YawTrapezoidNormal_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset04_Wushu_YawWaveletSqueak_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset05_Wushu_LaneChanges_ModelBump_residual.csv
‚ùå Failed to read Dataset07_Demo_Spa_GT.csv: No columns to parse from file
‚Ü©Ô∏è  Skip residual (exists): Dataset08_Demo_Jiggler_residual.csv
‚Ü©Ô∏è  Skip residual (exists): Dataset09_Demo_VerticalChirp_residual.csv
‚ùå Failed to read Dataset10_Demo_MillbrookHills.csv: No columns to parse from file


  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = d

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = 0
  df.loc[X.index, "lof_score"] = np.nan
  df.loc[X.index, "lof_thr"] = np.nan
  df["lstm_is_anomaly"] = 0
  df["lstm_error"] = np.nan
  df["lstm_thr"] = np.nan


‚ÑπÔ∏è LOF skipped (n_rows=180200 > limit).
‚ÑπÔ∏è LSTM-AE skipped (n_rows=180200 > limit).


  med = np.nanmedian(x).astype(np.float32)
  mad = np.nanmedian(np.abs(x - med)).astype(np.float32) + 1e-12
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx).astype("float32")
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generated"] = fe_stats.get("generated", 0)


[Dataset01_Ski_CrossbeamYawNotPerforming_residual] iso=6081 | ae=11394 | lof=0 | lstm=0 | hyb=1802 | vote3+=0
‚ùå Skipped Dataset02_Matrix_Rocker4EncoderNotWorking_residual: No residuals found.


  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = d

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = 0
  df.loc[X.index, "lof_score"] = np.nan


‚ÑπÔ∏è LOF skipped (n_rows=381474 > limit).
‚ÑπÔ∏è LSTM-AE skipped (n_rows=381474 > limit).


  df.loc[X.index, "lof_thr"] = np.nan
  df["lstm_is_anomaly"] = 0
  df["lstm_error"] = np.nan
  df["lstm_thr"] = np.nan
  med = np.nanmedian(x).astype(np.float32)
  mad = np.nanmedian(np.abs(x - med)).astype(np.float32) + 1e-12
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx).astype("float32")
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generated"] = fe_stats.get("generated", 0)


[Dataset03_Wushu_YawTrapezoidNormal_residual] iso=4543 | ae=15349 | lof=0 | lstm=0 | hyb=3815 | vote3+=0


  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = d

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = lof_labels
  df.loc[X.index, "lof_score"] = lof_scores
  df.loc[X.index, "lof_thr"] = lof_thr
  df.loc[df.index[lstm_idx], "lstm_is_anomaly"] = lstm_labels
  df.loc[df.index[lstm_idx], "lstm_error"] = lstm_errors
  df.loc[df.index[lstm_idx], "lstm_thr"] = lstm_thr
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx).astype("float32")
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)

[Dataset04_Wushu_YawWaveletSqueak_residual] iso=5912 | ae=5632 | lof=1179 | lstm=610 | hyb=190 | vote3+=663


  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = df[col].diff().astype("float32")
  df[f"{col}_rolling_mean_{window}"] = df[col].rolling(window=window).mean().astype("float32")
  df[f"{col}_rolling_std_{window}"] = df[col].rolling(window=window).std().astype("float32")
  df[f"{col}_delta"] = d

üõ†Ô∏è  Generated 168 features (window=5).


  df.loc[X.index, "is_anomaly"] = iso_labels
  df.loc[X.index, "iso_score"] = iso_scores
  df.loc[X.index, "iso_thr"] = iso_thr
  df.loc[X.index, "ae_is_anomaly"] = ae_labels
  df.loc[X.index, "ae_error"] = ae_errors
  df.loc[X.index, "ae_thr"] = ae_thr
  df.loc[X.index, "lof_is_anomaly"] = 0
  df.loc[X.index, "lof_score"] = np.nan
  df.loc[X.index, "lof_thr"] = np.nan
  df["lstm_is_anomaly"] = 0
  df["lstm_error"] = np.nan
  df["lstm_thr"] = np.nan


‚ÑπÔ∏è LOF skipped (n_rows=131200 > limit).
‚ÑπÔ∏è LSTM-AE skipped (n_rows=131200 > limit).


  med = np.nanmedian(x).astype(np.float32)
  mad = np.nanmedian(np.abs(x - med)).astype(np.float32) + 1e-12
  df["hybrid_score"] = compute_hybrid_score_on_mask(df, cfg, mask_idx).astype("float32")
  df.loc[mask_idx, "hybrid_is_anomaly"] = labels
  df.loc[mask_idx, "hybrid_thr"] = thr
  df["agreement_all_4"] = (
  df["num_votes"] = df[["ae_is_anomaly", "is_anomaly", "lof_is_anomaly", "lstm_is_anomaly"]].sum(axis=1)
  df["vote_3plus"] = (df["num_votes"] >= 3).astype(int)
  df["source_file"] = file_name
  df["fe_reused"] = fe_stats.get("reused", 0)
  df["fe_generated"] = fe_stats.get("generated", 0)


[Dataset05_Wushu_LaneChanges_ModelBump_residual] iso=7951 | ae=7437 | lof=0 | lstm=0 | hyb=1312 | vote3+=0
‚ùå Skipped Dataset08_Demo_Jiggler_residual: No residuals found.
‚ùå Skipped Dataset09_Demo_VerticalChirp_residual: No residuals found.


  combined = pd.concat(all_dfs, ignore_index=True)


‚úÖ Saved row-level: ./Anomaly_detection/code/outputs/combined_anomaly_results.csv
‚úÖ Saved summary:   ./Anomaly_detection/code/outputs/model_comparison_summary.csv
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/model_comparison_plot.png
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/model_comparison_rate_plot.png
‚úÖ Saved episodes+reason: ./Anomaly_detection/code/outputs/selected_outputs\selected_anomaly_episodes_with_reasons_hybrid.csv
‚úÖ Saved episodes+scores: ./Anomaly_detection/code/outputs/selected_outputs\selected_anomaly_episodes_with_reasons_and_scores_hybrid.csv

EPISODES PER FILE (after selection & scoring):
source_file
Dataset01_Ski_CrossbeamYawNotPerforming_residual     392
Dataset03_Wushu_YawTrapezoidNormal_residual         1015
Dataset04_Wushu_YawWaveletSqueak_residual             89
Dataset05_Wushu_LaneChanges_ModelBump_residual       622
üñºÔ∏è Saved: ./Anomaly_detection/code/outputs/selected_outputs\selected_plot_Dataset01_Ski_CrossbeamYawNotPerforming_residual.



üìÑ Ops report saved: ./Anomaly_detection/code/outputs/ops_report.pdf


<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1100x600 with 0 Axes>

<Figure size 1100x600 with 0 Axes>