# Anomaly Detection
Detect anomalies in daily operations metrics (units_produced, power_kwh), such as downtime, using STL residuals + a robust z-score, and export them to alerts.csv.

In [1]:
from __future__ import annotations
import pandas as pd
from pathlib import Path
from typing import Optional

DATA_DIR = Path("Data")

# Expected columns in operations_daily: date, site_id, units_produced, power_kwh, downtime_minutes (optional)
# Expected columns in site_meta: site_id, region, capacity, ...

def read_operations(path: Optional[str] = None, data_dir: Path = DATA_DIR) -> pd.DataFrame:
    """Load the daily operations table, sorted by ``site_id`` then ``date``.

    Resolution order:
      1. ``path``, when given, is loaded directly.
      2. Otherwise the ``operations_daily_*.csv`` file in ``data_dir`` whose
         trailing ``<N>d`` token has the largest ``N`` is loaded.
      3. Otherwise ``operations_daily.csv`` in ``data_dir`` is loaded.

    Args:
        path: Explicit CSV path; skips discovery when truthy.
        data_dir: Directory searched when ``path`` is not given. A plain
            string is accepted as well as a ``Path``.

    Returns:
        The CSV contents with ``date`` parsed as datetimes, rows sorted by
        ``site_id`` and ``date``, and a fresh 0..n-1 index.

    Raises:
        ValueError: If the loaded file lacks a required column.
        FileNotFoundError: If no candidate file exists in ``data_dir``.
    """
    required_cols = {"date", "site_id", "units_produced", "power_kwh"}

    def _load(p: Path) -> pd.DataFrame:
        df = pd.read_csv(p, parse_dates=["date"])  # type: ignore[arg-type]
        missing = required_cols - set(df.columns)
        if missing:
            raise ValueError(f"{p.name} missing required columns: {missing}")
        return df.sort_values(["site_id", "date"]).reset_index(drop=True)

    def _days(p: Path) -> int:
        # "operations_daily_90d" -> 90; names without a "<N>d" suffix rank as 0.
        token = p.stem.split("_")[-1]
        digits = token[:-1]
        return int(digits) if token.endswith("d") and digits.isdigit() else 0

    if path:
        return _load(Path(path))

    # Coerce so callers may pass a string directory as well as a Path.
    data_dir = Path(data_dir)

    # Sorting first makes the tie-break deterministic: max() keeps the first
    # of several equally ranked candidates.
    candidates = sorted(data_dir.glob("operations_daily_*.csv"))
    if candidates:
        return _load(max(candidates, key=_days))

    fallback = data_dir / "operations_daily.csv"
    if fallback.exists():
        return _load(fallback)

    # Report the directory that was actually searched, not a fixed name.
    raise FileNotFoundError(f"No operations_daily CSV found in {data_dir}/")

def read_site_meta(path: Optional[str] = None) -> pd.DataFrame:
    """Load the site metadata CSV.

    Args:
        path: CSV location; when not given, ``site_meta.csv`` inside
            ``DATA_DIR`` is used.

    Returns:
        The file contents exactly as parsed by ``pandas.read_csv``.

    Raises:
        FileNotFoundError: If the resolved file does not exist.
    """
    if path:
        meta_path = Path(path)
    else:
        meta_path = DATA_DIR / "site_meta.csv"

    if meta_path.exists():
        return pd.read_csv(meta_path)

    raise FileNotFoundError(f"site_meta not found at {meta_path}")

In [2]:
from __future__ import annotations
import numpy as np
import pandas as pd
from typing import List

# Interpretable anomaly detection using STL residuals + robust z-score
# statsmodels is treated as optional: if the import fails for any reason,
# HAS_STL is False and stl_residuals() uses its rolling-mean fallback.
try:
    from statsmodels.tsa.seasonal import STL
    HAS_STL = True
except Exception:
    HAS_STL = False


def robust_zscore(x: np.ndarray) -> np.ndarray:
    """Return median/MAD-based z-scores for ``x``.

    Each value is centred on the median and divided by the median absolute
    deviation (MAD), then scaled by 0.6745.

    Args:
        x: Array-like of numeric values.

    Returns:
        An array of the same shape as ``x`` holding the robust z-scores.
    """
    values = np.asarray(x)
    center = np.median(values)
    deviations = values - center
    # The small epsilon keeps the division finite when the MAD is exactly zero.
    scale = np.median(np.abs(deviations)) + 1e-9
    return 0.6745 * deviations / scale


def stl_residuals(series: pd.Series, period: int = 7) -> np.ndarray:
    """Return residuals of ``series`` after removing trend and seasonality.

    When STL is available and the series has at least two full periods of
    data, the residual is the gap-filled series minus the STL trend and
    seasonal components. Otherwise the residual is the series minus its
    rolling mean over ``period`` points, with undefined positions set to 0.

    Args:
        series: Observations in time order.
        period: Seasonal period passed to STL; also the rolling window size
            of the fallback.

    Returns:
        A 1-D array of residuals aligned with ``series``.
    """
    # Data checks come first; the STL availability flag only matters for a
    # series that could actually be decomposed.
    too_short = len(series) < period * 2
    if too_short or series.isna().all() or not HAS_STL:
        # Fallback residuals = series - rolling mean
        mu = series.rolling(period, min_periods=max(2, period // 2)).mean()
        return (series - mu).fillna(0).values

    # Fill gaps forward, then backward for leading gaps. ffill()/bfill()
    # replace the deprecated fillna(method=...) spelling.
    s = series.ffill().bfill()
    res = STL(s, period=period, robust=True).fit()
    return (s - res.trend - res.seasonal).values


def detect_anomalies(df: pd.DataFrame, targets: Optional[List[str]] = None,
                     period: int = 7, z_thresh: float = 3.0) -> pd.DataFrame:
    """Flag per-site observations whose residual is a robust-z-score outlier.

    For every site and every target metric present in ``df``, residuals are
    computed with ``stl_residuals`` and scored with ``robust_zscore``. Each
    point with ``|z| >= z_thresh`` becomes one alert row.

    Args:
        df: Operations table with ``site_id``, ``date`` and the target columns.
        targets: Metric columns to scan; defaults to ``units_produced`` and
            ``power_kwh``. Targets missing from ``df`` are skipped.
        period: Seasonal period forwarded to ``stl_residuals``.
        z_thresh: Absolute z-score at or above which a point is an alert.

    Returns:
        One row per alert with columns ``site_id``, ``date``, ``metric``,
        ``observed``, ``expected``, ``residual``, ``anomaly_score``, ``rule``.
        The columns are present even when there are no alerts.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if targets is None:
        targets = ["units_produced", "power_kwh"]

    columns = ["site_id", "date", "metric", "observed", "expected",
               "residual", "anomaly_score", "rule"]
    alerts = []
    for site_id, g in df.sort_values(["site_id", "date"]).groupby("site_id"):
        for tgt in targets:
            if tgt not in g.columns:
                continue
            values = g[tgt]
            resid = stl_residuals(values, period=period)
            z = robust_zscore(resid)
            # Use positions rather than index labels, so duplicate labels in
            # the caller's index cannot make a lookup ambiguous.
            for pos in np.flatnonzero(np.abs(z) >= z_thresh):
                alerts.append({
                    "site_id": site_id,
                    "date": g["date"].iloc[pos],
                    "metric": tgt,
                    "observed": float(values.iloc[pos]),
                    "expected": float(values.iloc[pos] - resid[pos]),
                    "residual": float(resid[pos]),
                    "anomaly_score": float(z[pos]),
                    "rule": f"|z|>={z_thresh} via STL residuals",
                })
    # Explicit columns keep the schema (and the CSV header) when nothing fired.
    return pd.DataFrame(alerts, columns=columns)

In [3]:
import pandas as pd

# Load the operations data, score it, and persist the alerts.
ops = read_operations()
alerts = detect_anomalies(ops)

# to_csv does not create missing parent directories, so create it first.
alerts_path = Path("outputs") / "alerts.csv"
alerts_path.parent.mkdir(parents=True, exist_ok=True)
alerts.to_csv(alerts_path, index=False)

# Last expression of the cell: shows a preview of the alerts.
alerts.head()

Unnamed: 0,site_id,date,metric,observed,expected,residual,anomaly_score,rule
0,S1,2025-01-05,units_produced,0.0,1008.0,-1008.0,-6.336392,|z|>=3.0 via STL residuals
1,S1,2025-01-18,units_produced,0.0,1073.142857,-1073.142857,-6.723763,|z|>=3.0 via STL residuals
2,S1,2025-01-23,units_produced,0.0,874.285714,-874.285714,-5.541264,|z|>=3.0 via STL residuals
3,S1,2025-03-25,units_produced,0.0,1070.0,-1070.0,-6.705074,|z|>=3.0 via STL residuals
4,S1,2025-03-29,units_produced,0.0,867.428571,-867.428571,-5.500488,|z|>=3.0 via STL residuals
