# ComplexBridge – Feature Consumer & Model Trainer
*Generated on 2025-10-15 16:40:07*

This notebook **consumes precomputed features from `ARTIFACT_DIR`** and trains per-**span × sensor_type** models:

- **Strain gauge** → Isolation Forest (tabular window features)  
- **Accelerometer** → 1D CNN Autoencoder on **feature sequences**  
- **Temperature** → SARIMAX (optional; fit on raw temperature streams read from `DATA_PATH/raw.csv`, only when `ENABLE_TEMP = True`)

> Expected feature files in `ARTIFACT_DIR`: `feats_strain.(parquet|csv)`, `feats_accel.(parquet|csv)`, `feats_temp.(parquet|csv)`.


## 1) Setup & Paths

In [None]:

# %pip install pandas numpy scikit-learn tensorflow statsmodels matplotlib pyarrow fastparquet

import os, json, joblib, numpy as np, pandas as pd
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import average_precision_score, precision_recall_fscore_support

import tensorflow as tf
from tensorflow.keras import layers, models

import statsmodels.api as sm

# Paths (user-specified)
# DATA_PATH    : folder with raw data; the temperature cell reads DATA_PATH/raw.csv from it.
# ARTIFACT_DIR : folder holding the precomputed feature tables, the scalers and the trained models.
DATA_PATH = "/content/drive/MyDrive/ComplexBridge_work/Data/"
ARTIFACT_DIR = "/content/drive/MyDrive/ComplexBridge_work/Artifacts/"
# Make sure the artifact folder exists before anything is written into it.
os.makedirs(ARTIFACT_DIR, exist_ok=True)

print("DATA_PATH   :", DATA_PATH)
print("ARTIFACT_DIR:", ARTIFACT_DIR)


## 2) Feature Dictionaries & Constants

In [None]:

# Feature column names keyed by sensor type. The strain and accelerometer
# training cells select exactly these columns from the feature tables.
feature_dict = { 
    "strain_gauge":      ["f_mean","f_std","f_rms","f_p2p","f_slope","fft_low","fft_mid","fft_high"],
    "accelerometer_rms": ["f_rms","fft_centroid","fft_entropy","fft_dominant"],
    "temperature":       ["f_mean","f_slope","ctx_tl_mean"],
}

# Window length (in feature rows) of each sequence fed to the CNN autoencoder.
SEQ_LEN = 30  # number of feature-rows per CNN sequence

def save_json(path, obj):
    """Serialize `obj` as 2-space-indented JSON and write it to `path`.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(path, "w") as handle:
        # Serialize after opening so the file is truncated first, exactly
        # like streaming with json.dump would do.
        handle.write(json.dumps(obj, indent=2))

def model_path(span_id, sensor_type, name):
    """Return the path of artifact `name` for one span / sensor type.

    Artifacts live in ARTIFACT_DIR/<span_id>_<sensor_type>/; that folder is
    created on demand so callers can write to the returned path directly.
    """
    folder = os.path.join(ARTIFACT_DIR, "{}_{}".format(span_id, sensor_type))
    os.makedirs(folder, exist_ok=True)
    return os.path.join(folder, name)


## 3) Load Precomputed Feature Matrices from `ARTIFACT_DIR`

In [None]:

def _try_read(base):
    """Load a feature table stored as `base`.parquet or `base`.csv.

    Parquet is preferred when both exist. CSV files are read with the
    't_center' column parsed as dates. Returns an empty DataFrame when
    neither file is present.
    """
    readers = (
        (".parquet", pd.read_parquet),
        (".csv", lambda p: pd.read_csv(p, parse_dates=['t_center'])),
    )
    for suffix, reader in readers:
        candidate = base + suffix
        if os.path.exists(candidate):
            return reader(candidate)
    return pd.DataFrame()

# Load the three precomputed feature tables; a missing file yields an empty DataFrame.
feats_strain = _try_read(os.path.join(ARTIFACT_DIR, "feats_strain"))
feats_accel  = _try_read(os.path.join(ARTIFACT_DIR, "feats_accel"))
feats_temp   = _try_read(os.path.join(ARTIFACT_DIR, "feats_temp"))

# Quick sanity print: shape and the first few column names of each table.
for name, F in [("strain", feats_strain), ("accel", feats_accel), ("temp", feats_temp)]:
    print(name, F.shape, list(F.columns)[:12])

# ensure datetime
# is_datetime64_any_dtype accepts both naive and timezone-aware datetime
# columns; np.issubdtype cannot interpret pandas' tz-aware dtype and raises
# TypeError on it.
for F in [feats_strain, feats_accel, feats_temp]:
    if not F.empty and not pd.api.types.is_datetime64_any_dtype(F['t_center']):
        F['t_center'] = pd.to_datetime(F['t_center'], errors='coerce')


## 4) Load or Fit Per-Type Scalers

In [None]:

# Registry of fitted scalers, keyed by sensor type; read by the training cells.
scalers: Dict[str, RobustScaler] = {}

def load_or_fit_scaler(sensor_type: str, F: pd.DataFrame) -> RobustScaler:
    """Return the RobustScaler for `sensor_type`, loading it from disk if cached.

    If ARTIFACT_DIR/scaler_<sensor_type>.joblib exists it is loaded as-is.
    Otherwise a new scaler is fit on `F` and persisted to that path.

    The scaler is fit on the columns listed in feature_dict[sensor_type] (in
    that order) whenever all of them are present in `F`, because those are the
    columns the training cells later pass to `transform`. Fitting on a wider
    column set would make `transform` reject the narrower input. When the
    sensor type is unknown or some listed columns are missing, every
    non-metadata column of `F` is used instead.
    """
    path = os.path.join(ARTIFACT_DIR, f"scaler_{sensor_type}.joblib")
    if os.path.exists(path):
        # NOTE(review): a cached scaler is trusted to match the current feature columns.
        return joblib.load(path)

    meta_cols = ('t_center', 'sensor_id', 'sensor_type', 'span_id', 'label_rule')
    wanted = feature_dict.get(sensor_type)
    if wanted and all(c in F.columns for c in wanted):
        feat_cols = list(wanted)
    else:
        feat_cols = [c for c in F.columns if c not in meta_cols]

    sc = RobustScaler().fit(F[feat_cols])
    joblib.dump(sc, path)
    return sc

# Register one scaler per sensor type whose feature table is non-empty.
for _stype, _table in (
    ("strain_gauge", feats_strain),
    ("accelerometer_rms", feats_accel),
    ("temperature", feats_temp),
):
    if _table.empty:
        continue
    scalers[_stype] = load_or_fit_scaler(_stype, _table)

print("Loaded scalers for:", list(scalers.keys()))


## 5) Train per-span Models – **Strain gauge → Isolation Forest**

In [None]:

import numpy as np

def train_isoforest_span(span_id: str, F: pd.DataFrame):
    """Fit and persist an Isolation Forest for the strain gauges of one span.

    Rows of `F` matching `span_id` and sensor type "strain_gauge" are sorted
    by 't_center', scaled with the registered strain scaler and used to fit
    the forest. The alert threshold is the 0.5% quantile of the training
    decision scores. The model and the threshold are written under the
    span's artifact folder.

    Returns a dict with the training scores and the threshold, or None when
    there is nothing to train on.
    """
    s_type = "strain_gauge"
    if F.empty:
        print(f"[{span_id}][{s_type}] no features")
        return None

    # Metadata columns + model features (+ optional rule label), limited to
    # the columns that actually exist in F.
    wanted = ['t_center', 'sensor_id', 'sensor_type', 'span_id', *feature_dict[s_type]]
    if 'label_rule' in F.columns:
        wanted.append('label_rule')
    present = [col for col in wanted if col in F.columns]

    row_mask = (F['span_id'] == span_id) & (F['sensor_type'] == s_type)
    G = F[row_mask][present].sort_values('t_center').reset_index(drop=True)
    if G.empty:
        print(f"[{span_id}][{s_type}] no rows after filtering")
        return None

    feat_cols = feature_dict[s_type]
    scaler = scalers[s_type]
    scaled = scaler.transform(G[feat_cols])

    forest = IsolationForest(
        n_estimators=300,
        contamination="auto",
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(scaled)
    scores = forest.decision_function(scaled)
    # Lower decision scores are more anomalous; flag the lowest 0.5%.
    thr = float(np.quantile(scores, 0.005))

    joblib.dump(forest, model_path(span_id, s_type, "isoforest.pkl"))
    threshold_info = {"score_threshold": thr, "note": "score<thr => alert"}
    save_json(model_path(span_id, s_type, "threshold.json"), threshold_info)
    print(f"[{span_id}][{s_type}] trained rows={len(G)}, thr={thr:.5f}")
    return {"scores": scores, "thr": thr}

# Train one Isolation Forest per span present in the strain feature table.
if not feats_strain.empty:
    spans = []
    if 'span_id' in feats_strain.columns:
        spans = feats_strain['span_id'].dropna().unique().tolist()
    for span in spans:
        # A failure on one span is reported and must not stop the others.
        try:
            train_isoforest_span(span, feats_strain)
        except Exception as err:
            print(f"[{span}][strain_gauge] ERROR:", err)


## 6) Train per-span Models – **Accelerometer → 1D CNN Autoencoder (feature sequences)**

In [None]:

def build_feature_sequences(G: pd.DataFrame, seq_len: int = SEQ_LEN):
    """Build overlapping feature windows per sensor and stack them.

    For each 'sensor_id' group the rows are sorted by 't_center' and every
    run of `seq_len` consecutive rows (stride 1) becomes one sequence, so
    windows never mix rows from different sensors.

    Parameters
    ----------
    G : DataFrame with 'sensor_id', 't_center' and the accelerometer feature
        columns listed in feature_dict["accelerometer_rms"].
    seq_len : number of feature rows per sequence.

    Returns
    -------
    float32 array of shape (n_seq, seq_len, feat_dim), or None when no sensor
    has at least `seq_len` rows.
    """
    feat_cols = feature_dict["accelerometer_rms"]
    X_seq = []
    for _, g in G.groupby('sensor_id'):
        X = g.sort_values('t_center')[feat_cols].to_numpy().astype('float32')
        # "+ 1" keeps the final full window (the one ending on the last row);
        # a sensor with exactly seq_len rows therefore yields one sequence.
        n_windows = len(X) - seq_len + 1
        X_seq.extend(X[i:i + seq_len] for i in range(n_windows))
    if not X_seq:
        return None
    return np.stack(X_seq)  # (n_seq, seq_len, feat_dim)

def build_cnn_ae(seq_len: int, feat_dim: int, latent: int = 32):
    """Build and compile a 1D convolutional autoencoder over feature sequences.

    The encoder halves the time axis twice (two MaxPool1D(2) stages) and the
    decoder doubles it twice, so the output length only equals the input
    length when the pooled length is a multiple of 4. To support any
    `seq_len` (e.g. 30), the input is zero-padded at the end up to the next
    multiple of 4 and the reconstruction is cropped back to `seq_len`.
    When `seq_len` is already a multiple of 4 no extra layers are added.

    Parameters
    ----------
    seq_len : number of time steps per input sequence.
    feat_dim : number of features per time step.
    latent : number of filters in the bottleneck convolution.

    Returns
    -------
    A Keras model mapping (seq_len, feat_dim) -> (seq_len, feat_dim),
    compiled with the Adam optimizer and MSE loss.
    """
    # Steps to append so the padded length is divisible by 2 * 2.
    pad = (-seq_len) % 4

    inp = layers.Input(shape=(seq_len, feat_dim))
    x = inp
    if pad:
        x = layers.ZeroPadding1D(padding=(0, pad))(x)
    x = layers.Conv1D(64, 5, padding="same", activation="relu")(x)
    x = layers.MaxPool1D(2)(x)
    x = layers.Conv1D(128, 3, padding="same", activation="relu")(x)
    x = layers.MaxPool1D(2)(x)
    x = layers.Conv1D(latent, 3, padding="same", activation="relu")(x)
    x = layers.UpSampling1D(2)(x)
    x = layers.Conv1D(128, 3, padding="same", activation="relu")(x)
    x = layers.UpSampling1D(2)(x)
    out = layers.Conv1D(feat_dim, 3, padding="same")(x)
    if pad:
        # Drop the padded tail so the output matches the input shape exactly.
        out = layers.Cropping1D(cropping=(0, pad))(out)
    model = models.Model(inp, out)
    model.compile(optimizer="adam", loss="mse")
    return model

def train_cnn_span(span_id: str, F: pd.DataFrame):
    """Fit and persist a CNN autoencoder for the accelerometers of one span.

    Rows of `F` matching `span_id` and sensor type "accelerometer_rms" are
    scaled with the registered accelerometer scaler, turned into per-sensor
    sequences of SEQ_LEN rows, and used to train the autoencoder (first 80%
    of the sequences for training, the rest for validation). The alert
    threshold is the 99.5% quantile of the per-sequence reconstruction MSE
    over all sequences. Model and threshold are written under the span's
    artifact folder.

    Returns a dict with the threshold and the number of sequences, or None
    when there is nothing to train on.
    """
    s_type = "accelerometer_rms"
    if F.empty:
        print(f"[{span_id}][{s_type}] no features")
        return None

    keep = ['t_center','sensor_id','sensor_type','span_id'] + feature_dict[s_type]
    if 'label_rule' in F.columns: keep += ['label_rule']
    keep = [c for c in keep if c in F.columns]
    G = F[(F['span_id']==span_id) & (F['sensor_type']==s_type)][keep].sort_values('t_center')
    if G.empty:
        print(f"[{span_id}][{s_type}] no rows after filtering")
        return None

    sc = scalers[s_type]
    feat_cols = feature_dict[s_type]
    G[feat_cols] = sc.transform(G[feat_cols])

    X_seq = build_feature_sequences(G, seq_len=SEQ_LEN)
    if X_seq is None:
        print(f"[{span_id}][{s_type}] insufficient sequences")
        return None

    n = len(X_seq)
    n_tr = max(int(n*0.8), 1)
    # Plain slices: when n_tr == n the validation slice is simply empty.
    # (A conditional expression inside the tuple assignment would bind only
    # to the second element and could turn Xva into a tuple.)
    Xtr, Xva = X_seq[:n_tr], X_seq[n_tr:]
    val_data = (Xva, Xva) if len(Xva) > 0 else None

    model = build_cnn_ae(seq_len=SEQ_LEN, feat_dim=X_seq.shape[2], latent=32)
    model.fit(Xtr, Xtr, validation_data=val_data, epochs=20, batch_size=128, verbose=1)

    # Per-sequence reconstruction error: mean squared error over time and features.
    recon = model.predict(X_seq, verbose=0)
    err = np.mean((X_seq - recon)**2, axis=(1,2))
    thr = float(np.quantile(err, 0.995))

    model.save(model_path(span_id, s_type, "cnn_ae_features.h5"))
    save_json(model_path(span_id, s_type, "threshold.json"),
              {"recon_err_threshold": thr, "note": "err>thr => alert (on feature sequences)"})
    print(f"[{span_id}][{s_type}] trained seq={len(X_seq)}, thr={thr:.6f}")
    return {"thr": thr, "n_seq": len(X_seq)}

# Train one CNN autoencoder per span present in the accelerometer feature table.
if not feats_accel.empty:
    spans = []
    if 'span_id' in feats_accel.columns:
        spans = feats_accel['span_id'].dropna().unique().tolist()
    for span in spans:
        # A failure on one span is reported and must not stop the others.
        try:
            train_cnn_span(span, feats_accel)
        except Exception as err:
            print(f"[{span}][accelerometer_rms] ERROR:", err)


## 7) Train per-span Models – **Temperature → SARIMAX (raw streams)**

In [None]:

def fit_best_sarimax(y, orders=((1,0,0),(1,1,0),(2,0,1),(2,1,1))):
    """Fit SARIMAX for each candidate (p, d, q) order and return the best fit.

    "Best" means lowest AIC. Candidates whose fit raises, or whose AIC is not
    a finite number, are skipped so that a NaN AIC can never be selected
    (comparisons against NaN are always False and would otherwise lock in the
    first such result).

    Parameters
    ----------
    y : time series to model.
    orders : iterable of (p, d, q) tuples to try.

    Returns
    -------
    The fitted results object with the lowest finite AIC, or None when no
    candidate could be fit.
    """
    best_aic = None
    best_res = None
    for (p, d, q) in orders:
        try:
            m = sm.tsa.SARIMAX(y, order=(p,d,q), enforce_stationarity=False, enforce_invertibility=False)
            r = m.fit(disp=False)
        except Exception as exc:
            # Best-effort search: report the failed candidate and move on.
            print(f"SARIMAX order {(p, d, q)} skipped:", exc)
            continue
        if not np.isfinite(r.aic):
            continue
        if best_aic is None or r.aic < best_aic:
            best_aic, best_res = r.aic, r
    return best_res

# Optional: enable temperature training if you provide DATA_PATH/raw.csv with temperature streams
ENABLE_TEMP = False

def train_temp_span_raw(span_id: str):
    """Fit and persist one SARIMAX model per temperature sensor of a span.

    Reads DATA_PATH/raw.csv (columns used: 'timestamp', 'span_id',
    'sensor_type', 'sensor_id', 'value'), keeps the temperature rows of
    `span_id`, and for each sensor builds a 1-minute series, fits the best
    SARIMAX order and saves the model. An index.json summarising AIC and
    series length per sensor is written to the span's temperature folder.

    Returns the summary dict keyed by sensor id (as string), or None when the
    raw file is missing or has no temperature rows for the span.
    """
    raw_csv = os.path.join(DATA_PATH, "raw.csv")
    if not os.path.exists(raw_csv):
        print(f"[{span_id}][temperature] raw.csv not found -> skipping")
        return None
    df_raw = pd.read_csv(raw_csv, parse_dates=['timestamp'])
    df_raw = df_raw.sort_values('timestamp').set_index('timestamp')
    df_span = df_raw[(df_raw['span_id']==span_id) & (df_raw['sensor_type']=="temperature")].copy()
    if df_span.empty:
        print(f"[{span_id}][temperature] no raw rows")
        return None

    out = {}
    for sid, g in df_span.groupby('sensor_id'):
        # Resample onto a regular 1-minute grid by averaging each minute.
        # Unlike asfreq, this tolerates duplicate timestamps and samples that
        # do not fall exactly on the grid; gaps are then interpolated.
        y = g['value'].resample('1min').mean().interpolate()
        model = fit_best_sarimax(y)
        if model is None:
            continue
        model.save(model_path(span_id, "temperature", f"sarimax_{sid}.pkl"))
        # str() keeps the index JSON-serialisable when sensor ids are numeric.
        out[str(sid)] = {"aic": float(model.aic), "n": int(len(y))}
        print(f"[{span_id}][temperature][{sid}] AIC={model.aic:.1f}, n={len(y)}")
    save_json(model_path(span_id, "temperature", "index.json"), out)
    return out

# When enabled, train temperature models for every span seen in any feature table.
if ENABLE_TEMP:
    spans = set()
    for _table in (feats_temp, feats_strain, feats_accel):
        if 'span_id' in _table.columns:
            spans.update(_table['span_id'].dropna().unique().tolist())
    for span in spans:
        # A failure on one span is reported and must not stop the others.
        try:
            train_temp_span_raw(span)
        except Exception as err:
            print(f"[{span}][temperature] ERROR:", err)


## 8) Summary & Next Steps


- Loads **precomputed features** from `ARTIFACT_DIR` and trains **per-span** models:
  - Strain → Isolation Forest
  - Accelerometer → 1D CNN Autoencoder (feature sequences)
  - Temperature → SARIMAX (optional; requires raw temperature in `DATA_PATH/raw.csv`)
- Saves artifacts under: `ARTIFACT_DIR/{SPAN}_{sensor_type}/...`

**Next:** scoring notebook that loads these artifacts and produces anomaly scores for new windows.
