<a href="https://colab.research.google.com/github/TipsyPanda/ComplexBridges/blob/main/IPMB_Anomaly_Starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IPMB – ML Anomaly Detection Starter Notebook
*Generated on 2025-10-13 13:12:05*

This notebook covers:
1. Data loading & basic EDA
2. Rolling-window feature engineering
3. Unsupervised anomaly detection (Isolation Forest)
4. Threshold calibration & evaluation (time-aware)
5. Minimal, stream-friendly scoring loop


## 1) Setup

In [13]:

# Install (if running elsewhere) and imports
# %pip install pandas numpy scikit-learn matplotlib scipy

import os
import json
import math
import numpy as np
import pandas as pd
from datetime import timedelta
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_recall_fscore_support, average_precision_score
import matplotlib.pyplot as plt

# Matplotlib defaults (no explicit colors)
plt.rcParams["figure.figsize"] = (10, 4)

# Mount Google Drive so the dataset below is reachable (Colab-only step)
from google.colab import drive
drive.mount('/content/drive/')

# Paths
DATA_PATH = "/content/drive/MyDrive/ComplexSystemDesign/Data/"  # change if needed
# Output folder for the scaler, model and threshold written by later cells.
# Those cells reference ARTIFACT_DIR, which was never defined and raised
# NameError on a fresh kernel; define it once here and make sure it exists.
ARTIFACT_DIR = os.path.join(DATA_PATH, "artifacts")  # change if needed
os.makedirs(ARTIFACT_DIR, exist_ok=True)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## 2) Load & sanity checks

In [19]:

# Read the cleaned bridge-sensor frame
# NOTE(review): unpickling can execute code embedded in the file; only load
# pickles that come from a trusted source.
df = pd.read_pickle(DATA_PATH + "DF_BridgeData_cleaned.pkl")
print(df.shape)
df.info()
# Preview goes last: a bare expression is only rendered when it ends the cell,
# so the previous mid-cell df.head() was silently discarded.
df.head()


(90000, 10)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 90000 entries, 2025-10-10 12:00:00 to 2025-10-10 12:29:59.900000
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bridge_id           90000 non-null  object 
 1   span_id             90000 non-null  object 
 2   sensor_id           90000 non-null  object 
 3   sensor_type         90000 non-null  object 
 4   value               90000 non-null  float64
 5   unit                90000 non-null  object 
 6   traffic_load_proxy  90000 non-null  float64
 7   rule_threshold      90000 non-null  float64
 8   anomaly             90000 non-null  int64  
 9   anomaly_type        3852 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 7.6+ MB


In [None]:
# The loaded frame keeps time in its DatetimeIndex (df.info() lists no
# 'timestamp' column), while every later cell reads df['timestamp'].
# Turn the index into a real column first; the guard keeps the cell re-runnable.
if 'timestamp' not in df.columns:
    df = df.rename_axis('timestamp').reset_index()
# Drop rows without timestamp, then order chronologically with a clean 0..n-1 index
df = df[~df['timestamp'].isna()].copy()
df = df.sort_values('timestamp').reset_index(drop=True)
# Quick look
df.head()

## 3) Basic EDA (per sensor_type / sensor_id)

In [None]:

# Pick one stream to prototype on (ideally a single sensor_id × span_id pair).
# Edit the two selections below to work against a different stream.
example_type = df['sensor_type'].dropna().unique()[0]
type_mask = df['sensor_type'] == example_type
example_sensor = df.loc[type_mask, 'sensor_id'].dropna().unique()[0]

stream_mask = type_mask & (df['sensor_id'] == example_sensor)
sub = df[stream_mask].copy()
sub = sub.sort_values('timestamp').reset_index(drop=True)

print("Working stream:", example_type, example_sensor, "rows:", len(sub))

# Raw signal, for a first impression of the stream
fig, ax = plt.subplots()
ax.plot(sub['timestamp'], sub['value'], label='value')
ax.set_title(f"Raw values – {example_sensor} ({example_type})")
ax.set_xlabel("time")
ax.set_ylabel("value")
plt.show()

# Same signal with the rule-based anomaly flags drawn on top
if 'anomaly' in sub.columns:
    fig, ax = plt.subplots()
    ax.plot(sub['timestamp'], sub['value'])
    # one dashed vertical line per flagged sample
    flagged_times = sub.loc[sub['anomaly'] == 1, 'timestamp']
    for flagged in flagged_times:
        ax.axvline(flagged, linestyle='--', linewidth=0.8)
    ax.set_title(f"Rule anomalies overlay – {example_sensor}")
    ax.set_xlabel("time")
    ax.set_ylabel("value")
    plt.show()


## 4) Rolling-window feature engineering

In [None]:

# Helper: work out the resampling step for one stream's timestamps
def _resolve_base_freq(index):
    """Return a ``resample`` rule matching the stream's nominal sampling step.

    ``pd.infer_freq`` is tried first. When the index is irregular, or too short
    for inference, the median gap between consecutive timestamps is used
    instead. The gap is kept as a ``Timedelta`` so that sub-second cadences are
    not truncated to zero (the old ``int(seconds)`` fallback produced "0S").

    Returns ``None`` when fewer than two timestamps are available.
    Raises ``ValueError`` when the median gap is not positive.
    """
    try:
        inferred = pd.infer_freq(index)
    except ValueError:
        # infer_freq refuses indexes with fewer than 3 timestamps
        inferred = None
    if inferred is not None:
        return inferred

    gaps = index.to_series().diff().dropna()
    if gaps.empty:
        return None
    nominal = gaps.median()
    if nominal <= pd.Timedelta(0):
        raise ValueError("Cannot derive a sampling step: median timestamp gap is not positive")
    return nominal


# Helper: generate rolling-window features for a single stream
def make_rolling_features(ts: pd.DataFrame, window='30s', step='15s'):
    """Summarise one sensor stream into per-window feature rows.

    Parameters
    ----------
    ts : DataFrame with a 'timestamp' column and a 'value' column. Optional
        'traffic_load_proxy', 'rule_threshold' and 'anomaly' columns add the
        ``ctx_*`` features and the weak ``label_rule`` column.
    window : window length, anything ``pd.to_timedelta`` accepts.
    step : hop between consecutive window starts; must be positive.

    Returns
    -------
    DataFrame sorted by 't_center' with one row per window holding
    ``f_mean, f_std, f_rms, f_p2p, f_skew, f_kurt, f_slope, f_adiff_mean``,
    plus the optional ``ctx_tl_mean, ctx_tl_std, ctx_rule_thr, label_rule``.
    When the stream is shorter than one window the frame is empty but still
    carries these columns.

    Raises
    ------
    ValueError if ``step`` is not positive or no sampling step can be derived.
    """
    window_td = pd.to_timedelta(window)
    step_td = pd.to_timedelta(step)
    if step_td <= pd.Timedelta(0):
        # a zero or negative hop would never advance the cursor below
        raise ValueError("step must be a positive duration")

    ts = ts.set_index('timestamp').sort_index()

    # Optional inputs are decided once; every window shares the same columns
    has_traffic = 'traffic_load_proxy' in ts.columns
    has_rule_thr = 'rule_threshold' in ts.columns
    has_label = 'anomaly' in ts.columns

    columns = ['t_center', 'f_mean', 'f_std', 'f_rms', 'f_p2p',
               'f_skew', 'f_kurt', 'f_slope', 'f_adiff_mean']
    if has_traffic:
        columns += ['ctx_tl_mean', 'ctx_tl_std']
    if has_rule_thr:
        columns.append('ctx_rule_thr')
    if has_label:
        columns.append('label_rule')

    if ts.empty:
        return pd.DataFrame(columns=columns)

    # Ensure fixed frequency via resampling (forward-fill short gaps)
    base_freq = _resolve_base_freq(ts.index)
    if base_freq is not None:
        ts = ts.resample(base_freq).ffill()

    # Slide a window of length `window` forward in hops of `step`
    end = ts.index.max()
    cur = ts.index.min()
    half_window = window_td / 2
    out_rows = []
    while cur + window_td <= end:
        # label-based slicing on a DatetimeIndex includes both endpoints
        win = ts.loc[cur:cur + window_td]
        if len(win) < 3:
            # too few samples for meaningful statistics
            cur += step_td
            continue
        v = win['value'].values

        # shape statistics; pandas yields NaN when a window is too short for them
        skew = pd.Series(v).skew()
        kurt = pd.Series(v).kurtosis()

        row = {
            # window center time
            't_center': cur + half_window,
            # time-domain stats (len(v) >= 3 here, so ddof=1 is always defined)
            'f_mean': float(np.mean(v)),
            'f_std': float(np.std(v, ddof=1)),
            'f_rms': float(math.sqrt(np.mean(v**2))),
            'f_p2p': float(np.max(v) - np.min(v)),
            'f_skew': float(skew) if not np.isnan(skew) else 0.0,
            'f_kurt': float(kurt) if not np.isnan(kurt) else 0.0,
            # trend: slope of a straight-line fit against the sample position
            'f_slope': float(np.polyfit(np.arange(len(v)), v, 1)[0]),
            # mean absolute sample-to-sample change
            'f_adiff_mean': float(np.mean(np.abs(np.diff(v)))),
        }

        # context features (if present)
        if has_traffic:
            tl = win['traffic_load_proxy'].values
            row['ctx_tl_mean'] = float(np.mean(tl))
            row['ctx_tl_std'] = float(np.std(tl, ddof=1))
        if has_rule_thr:
            row['ctx_rule_thr'] = float(np.median(win['rule_threshold']))

        # label (weak): any rule anomaly in the window -> 1
        if has_label:
            row['label_rule'] = int((win['anomaly'] == 1).any())

        out_rows.append(row)
        cur += step_td

    # Passing `columns` keeps the schema stable even when no window was produced
    feats = pd.DataFrame(out_rows, columns=columns)
    return feats.sort_values('t_center').reset_index(drop=True)

# Window features for the selected stream: 30 s windows, advanced every 15 s
stream_cols = ['timestamp', 'value', 'traffic_load_proxy', 'rule_threshold', 'anomaly']
stream_input = sub[stream_cols].copy()
feats = make_rolling_features(stream_input, window='30s', step='15s')
print(feats.shape)
feats.head()


## 5) Temporal split and scaling

In [None]:

# Model inputs are the engineered (f_*) and context (ctx_*) columns;
# the weak rule label is optional and only used for evaluation.
feature_cols = [c for c in feats.columns if c.startswith(('f_', 'ctx_'))]
label_col = 'label_rule' if 'label_rule' in feats.columns else None

# Temporal split (70/30): earliest windows train, latest windows test
cut = int(len(feats) * 0.7)
train, test = feats.iloc[:cut].copy(), feats.iloc[cut:].copy()

# Scaling statistics come from the training part only and are reused on test
scaler = RobustScaler()
Xs_tr = scaler.fit_transform(train[feature_cols])
Xs_te = scaler.transform(test[feature_cols])

# Save scaler
import joblib
scaler_path = os.path.join(ARTIFACT_DIR, "scaler.joblib")
joblib.dump(scaler, scaler_path)
print("Saved:", scaler_path)


## 6) Unsupervised model: Isolation Forest

In [None]:

# Isolation Forest settings, gathered in one place for easy tuning
iso_params = {
    "n_estimators": 300,
    "contamination": 'auto',
    "random_state": 42,
    "n_jobs": -1,
}
iso = IsolationForest(**iso_params)
iso.fit(Xs_tr)

# Scores: higher -> more normal in sklearn's decision_function
score_tr, score_te = (iso.decision_function(X) for X in (Xs_tr, Xs_te))

# Save model
import joblib, json
model_path = os.path.join(ARTIFACT_DIR, "isoforest.pkl")
joblib.dump(iso, model_path)
print("Saved:", model_path)


## 7) Threshold calibration

In [None]:

def pick_threshold(scores, quantile=0.005):
    """Return the score value at the given lower quantile, as a plain float.

    Scores below the returned cut-off are the ones treated as alerts, so
    ``quantile`` is the share of the supplied scores that will be flagged.
    """
    cutoff = np.quantile(scores, q=quantile)
    return float(cutoff)

# Calibrate on train: roughly the 0.5% least-normal windows become alerts
thr = pick_threshold(score_tr, quantile=0.005)
print("Chosen threshold:", thr)

# Persist the cut-off next to the other artifacts
threshold_path = os.path.join(ARTIFACT_DIR, "threshold.json")
with open(threshold_path, "w") as f:
    json.dump({"score_threshold": thr}, f)
print("Saved:", threshold_path)


## 8) Evaluation (if weak labels exist)

In [None]:

# A test window is an alert when its normality score falls below the threshold
alerts_te = (score_te < thr).astype(int)

has_labels = label_col is not None and label_col in test.columns
if not has_labels:
    print("No labels available; showing score distribution only.")
else:
    y_true = test[label_col].values
    # PR metrics (preferred for rare events).
    # Average precision needs a score that grows with the chance of a positive,
    # so the normality score is negated into an anomaly score.
    anomaly_score_te = -score_te
    ap = average_precision_score(y_true, anomaly_score_te)
    p, r, f1, _ = precision_recall_fscore_support(
        y_true, alerts_te, average='binary', zero_division=0
    )
    print(f"AP: {ap:.3f}  Precision: {p:.3f}  Recall: {r:.3f}  F1: {f1:.3f}")

# Score timeline with the alert threshold as a dashed line
fig, ax = plt.subplots()
ax.plot(test['t_center'], score_te, label='score')
ax.axhline(thr, linestyle='--')
ax.set_title("IsolationForest score (higher=more normal)")
ax.set_xlabel("time")
ax.set_ylabel("score")
plt.show()

# Alert markers (0/1 step trace over the test period)
fig, ax = plt.subplots()
ax.plot(test['t_center'], alerts_te, drawstyle='steps-post')
ax.set_title("Alerts (test)")
ax.set_xlabel("time")
ax.set_ylabel("alert")
plt.show()


## 9) Mini scoring loop (stream-friendly illustration)

In [None]:

# Pseudostream: iterate windows over the test set and emit events
events = []
for t, s, a in zip(test['t_center'], score_te, (score_te < thr).astype(int)):
    if a == 1:
        events.append({"t": str(t), "event": "ALERT", "score": float(s)})
len(events), events[:5]


## 10) Next steps

- Expand features (FFT band powers for accel/strain).
- Per-sensor models vs. global per-type models.
- Cost-aware thresholding (target ≤ 1 false alarm/day).
- Add drift monitoring (score median + MAD over time).
- Integrate with Streamlit dashboard (plot raw signals + score + alerts).
