<a href="https://colab.research.google.com/github/TipsyPanda/ComplexBridges/blob/main/ClaudePlayground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IPMB – Multi-Model Ensemble Anomaly Detection
# Extended notebook with sensor-type-specific models

import os
import json
import numpy as np
import pandas as pd
from datetime import timedelta
from typing import Optional, Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# ML imports
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import precision_recall_fscore_support, average_precision_score
import matplotlib.pyplot as plt

# Deep learning imports (TensorFlow/Keras for LSTM Autoencoder & 1D CNN)
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, Model
    KERAS_AVAILABLE = True
except ImportError:
    KERAS_AVAILABLE = False
    print("Warning: TensorFlow not available. LSTM/CNN models will be skipped.")

# Time series imports for ARIMA
try:
    from statsmodels.tsa.arima.model import ARIMA
    from scipy.signal import medfilt
    STATSMODELS_AVAILABLE = True
except ImportError:
    STATSMODELS_AVAILABLE = False
    print("Warning: statsmodels not available. ARIMA models will be skipped.")

import joblib

In [None]:

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Folder on the mounted drive that holds the input dataset; the training cell
# reads "synthetic_bridge_data.csv" from here.
DATA_PATH = "/content/drive/MyDrive/ComplexBridge_work/Data/"
# Folder the training cell writes "ensemble_detector.pkl" into.
ARTIFACT_DIR = "/content/drive/MyDrive/ComplexBridge_work/Artifacts/"

# Default size (inches) for every matplotlib figure created after this point.
plt.rcParams["figure.figsize"] = (12, 4)

In [None]:
# ==============================================================================
# DATA LOADING
# ==============================================================================
# Mount Google Drive at /content/drive/ so the DATA_PATH / ARTIFACT_DIR folders
# are reachable from the Colab runtime.
# NOTE(review): the recorded output of this cell is
# "ValueError: Mountpoint must not already contain files" -- presumably the
# mount point was already populated; confirm whether a remount is needed.
from google.colab import drive
drive.mount('/content/drive/')

def load_df_with_datetimeindex(path: str, time_col: str = "timestamp"):
    """Load a CSV or pickle file and return it with a sorted DatetimeIndex.

    Resolution order for the time axis:

    1. the loaded frame already has a ``DatetimeIndex``;
    2. a column named ``time_col`` exists -- it is parsed, rows whose value
       cannot be parsed are dropped, and it becomes the index;
    3. the existing (non-numeric) index can be parsed as datetimes.

    Args:
        path: File to load. ``.pkl`` / ``.pickle`` are read with
            ``pd.read_pickle``; anything else is read as CSV.
        time_col: Name of the column that holds the timestamps.

    Returns:
        The DataFrame sorted by its DatetimeIndex.

    Raises:
        ValueError: If no datetime information can be found.
    """
    if path.lower().endswith((".pkl", ".pickle")):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(path)
    else:
        df = pd.read_csv(path)

    if isinstance(df.index, pd.DatetimeIndex):
        return df.sort_index()

    if time_col in df.columns:
        df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
        return df.dropna(subset=[time_col]).set_index(time_col).sort_index()

    message = f"No DatetimeIndex found. Provide '{time_col}' column."

    # A numeric index (e.g. the default RangeIndex produced by read_csv) would
    # be parsed "successfully" as nanoseconds since 1970, yielding a bogus
    # time axis instead of the intended error.
    if pd.api.types.is_numeric_dtype(df.index):
        raise ValueError(message)

    try:
        idx = pd.to_datetime(df.index, errors='raise')
    except Exception as exc:
        raise ValueError(message) from exc

    df.index = idx
    return df.sort_index()


ValueError: Mountpoint must not already contain files

In [None]:
# ==============================================================================
# FEATURE ENGINEERING
# ==============================================================================
def infer_base_freq(idx: pd.DatetimeIndex) -> str:
    """Return a pandas frequency string describing the sampling rate of *idx*.

    ``pd.infer_freq`` is tried first. When it cannot decide (irregular
    sampling) or cannot be used (fewer than three timestamps), the median
    spacing between consecutive timestamps is returned in milliseconds.

    Args:
        idx: Timestamps of the series.

    Returns:
        A frequency alias usable with ``DataFrame.resample``; ``"1s"`` when
        the index has fewer than two timestamps.
    """
    # pd.infer_freq raises ValueError for fewer than 3 timestamps, which used
    # to make the fallbacks below unreachable for very short series.
    if len(idx) >= 3:
        guessed = pd.infer_freq(idx)
        if guessed is not None:
            return guessed

    deltas = idx.to_series().diff().dropna().dt.total_seconds()
    if len(deltas) == 0:
        return "1s"

    # Round instead of truncating so e.g. 0.007 s does not become 6 ms, and
    # clamp to 1 ms so duplicate timestamps cannot yield a zero frequency.
    ms = max(int(round(float(np.median(deltas)) * 1000)), 1)
    # "ms" is the non-deprecated spelling of the former "L" alias.
    return f"{ms}ms"

# Columns that every output row of make_rolling_features carries; context
# ("ctx_*") and label columns are appended only when the input provides them.
_ROLLING_FEATURE_COLUMNS = [
    't_center',
    'f_mean', 'f_std', 'f_rms', 'f_p2p',
    'f_skew', 'f_kurt', 'f_slope', 'f_adiff_mean',
]


def _window_features(win: pd.DataFrame) -> dict:
    """Compute the statistical, context and label entries for one window."""
    v = win['value'].values
    n = len(v)
    dv = np.diff(v)

    feats = {
        'f_mean': float(np.mean(v)),
        'f_std': float(np.std(v, ddof=1)) if n > 1 else 0.0,
        'f_rms': float(np.sqrt(np.mean(v**2))),
        'f_p2p': float(np.max(v) - np.min(v)),
        'f_skew': float(pd.Series(v).skew()) if n > 2 else 0.0,
        'f_kurt': float(pd.Series(v).kurtosis()) if n > 3 else 0.0,
        # Slope of a least-squares line through the samples (per sample step).
        'f_slope': float(np.polyfit(np.arange(n), v, 1)[0]) if n > 1 else 0.0,
        'f_adiff_mean': float(np.mean(np.abs(dv))) if len(dv) else 0.0,
    }

    if 'traffic_load_proxy' in win.columns:
        tl = win['traffic_load_proxy'].values
        feats['ctx_tl_mean'] = float(np.mean(tl))
        feats['ctx_tl_std'] = float(np.std(tl, ddof=1)) if len(tl) > 1 else 0.0
    if 'rule_threshold' in win.columns:
        feats['ctx_rule_thr'] = float(np.median(win['rule_threshold']))

    # A window is labelled anomalous as soon as any sample in it is.
    if 'anomaly' in win.columns:
        feats['label_rule'] = int((win['anomaly'] == 1).any())

    return feats


def make_rolling_features(ts: pd.DataFrame, window: str = '30s', step: str = '15s'):
    """Create rolling-window features for the traditional ML models.

    The series is resampled (forward-filled) to its inferred base frequency,
    then windows of length ``window`` are slid forward by ``step``. Windows
    holding fewer than three samples are skipped.

    Args:
        ts: Frame with a DatetimeIndex and a ``value`` column. Optional
            ``traffic_load_proxy``, ``rule_threshold`` and ``anomaly`` columns
            add ``ctx_*`` / ``label_rule`` outputs.
        window: Window length, parseable by ``pd.to_timedelta``.
        step: Stride between window starts, parseable by ``pd.to_timedelta``.

    Returns:
        One row per window, sorted by ``t_center`` (the window midpoint). An
        empty frame with the base feature columns when no window qualifies.

    Raises:
        TypeError: If ``ts`` is not indexed by a DatetimeIndex.
        ValueError: If ``step`` is not positive.
    """
    if not isinstance(ts.index, pd.DatetimeIndex):
        raise TypeError("Expected DatetimeIndex")

    w = pd.to_timedelta(window)
    s = pd.to_timedelta(step)
    if s <= pd.Timedelta(0):
        # A zero/negative stride would never advance the window loop.
        raise ValueError("step must be a positive duration")

    # Fewer than three samples can never fill a window (minimum is 3 rows),
    # and the frequency cannot be inferred from them either.
    if len(ts) < 3:
        return pd.DataFrame(columns=_ROLLING_FEATURE_COLUMNS)

    ts = ts.sort_index()
    base_freq = infer_base_freq(ts.index)
    ts = ts.resample(base_freq).ffill()

    end = ts.index.max()
    cur = ts.index.min()
    out = []
    while cur + w <= end:
        win = ts.loc[cur:cur + w]
        if len(win) >= 3:
            out.append({'t_center': cur + w / 2, **_window_features(win)})
        cur += s

    if not out:
        # pd.DataFrame([]) has no 't_center' column, so sorting would raise.
        return pd.DataFrame(columns=_ROLLING_FEATURE_COLUMNS)

    return pd.DataFrame(out).sort_values('t_center').reset_index(drop=True)

def create_sequences(data: np.ndarray, seq_len: int = 50):
    """Slice *data* into overlapping windows of ``seq_len`` consecutive samples.

    Window ``i`` is ``data[i:i + seq_len]``; consecutive windows are shifted by
    one sample.

    Args:
        data: Array whose first axis is the sample axis.
        seq_len: Number of samples per window (must be >= 1).

    Returns:
        Array of shape ``(len(data) - seq_len + 1, seq_len, *data.shape[1:])``.
        When ``data`` is shorter than ``seq_len`` the result has zero windows
        but keeps the trailing dimensions, so callers can reshape it safely.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    arr = np.asarray(data)
    n_windows = len(arr) - seq_len + 1
    if n_windows <= 0:
        return np.empty((0, seq_len) + arr.shape[1:], dtype=arr.dtype)

    windows = np.lib.stride_tricks.sliding_window_view(arr, seq_len, axis=0)
    # sliding_window_view appends the window axis last; move it next to the
    # window-index axis and copy so the result is a writable, owned array.
    return np.moveaxis(windows, -1, 1).copy()

In [None]:
# ==============================================================================
# MODEL DEFINITIONS
# ==============================================================================
class LSTMAutoencoder:
    """LSTM Autoencoder for strain gauge and accelerometer data.

    The model is trained to reconstruct sliding windows of ``seq_len`` scaled
    samples; the mean absolute reconstruction error of a window is its
    anomaly score.
    """
    def __init__(self, seq_len: int = 50, latent_dim: int = 16):
        self.seq_len = seq_len
        self.latent_dim = latent_dim
        self.model = None
        self.scaler = RobustScaler()
        # 99.5th percentile of the training reconstruction errors, set by fit().
        self.threshold = None

    def build_model(self):
        """Build and compile the encoder/decoder network into ``self.model``.

        Raises:
            RuntimeError: If TensorFlow could not be imported.
        """
        if not KERAS_AVAILABLE:
            raise RuntimeError("TensorFlow not available")

        # Encoder
        encoder_input = layers.Input(shape=(self.seq_len, 1))
        x = layers.LSTM(64, return_sequences=True)(encoder_input)
        x = layers.LSTM(32, return_sequences=False)(x)
        encoded = layers.Dense(self.latent_dim)(x)

        # Decoder
        x = layers.RepeatVector(self.seq_len)(encoded)
        x = layers.LSTM(32, return_sequences=True)(x)
        x = layers.LSTM(64, return_sequences=True)(x)
        decoded = layers.TimeDistributed(layers.Dense(1))(x)

        self.model = Model(encoder_input, decoded)
        self.model.compile(optimizer='adam', loss='mse')

    def _to_windows(self, data_scaled: np.ndarray) -> np.ndarray:
        """Turn a scaled 1-D series into model input of shape (n, seq_len, 1)."""
        sequences = create_sequences(data_scaled, self.seq_len)
        return sequences.reshape(-1, self.seq_len, 1)

    def _reconstruction_errors(self, X: np.ndarray) -> np.ndarray:
        """Return the mean absolute reconstruction error of each window."""
        recon = self.model.predict(X, verbose=0)
        return np.mean(np.abs(X - recon), axis=(1, 2))

    def fit(self, data: np.ndarray, epochs: int = 50):
        """Fit the scaler and the autoencoder, then calibrate ``threshold``.

        Args:
            data: 1-D training series.
            epochs: Number of training epochs.

        Raises:
            ValueError: If ``data`` has fewer than ``seq_len`` samples, i.e.
                not even one training window can be formed.
        """
        data = np.asarray(data, dtype=float)
        if len(data) < self.seq_len:
            raise ValueError(
                f"Need at least {self.seq_len} samples to train, got {len(data)}"
            )

        data_scaled = self.scaler.fit_transform(data.reshape(-1, 1)).flatten()
        X = self._to_windows(data_scaled)

        if self.model is None:
            self.build_model()

        self.model.fit(X, X, epochs=epochs, batch_size=32, verbose=0)

        # Calculate reconstruction errors for threshold
        self.threshold = np.percentile(self._reconstruction_errors(X), 99.5)

    def score(self, data: np.ndarray) -> np.ndarray:
        """Return anomaly scores (higher = more anomalous).

        One score is produced per sliding window, so the result has
        ``len(data) - seq_len + 1`` entries (none when ``data`` is shorter
        than ``seq_len``).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError("LSTMAutoencoder.score() called before fit()")

        data = np.asarray(data, dtype=float)
        if len(data) < self.seq_len:
            return np.empty(0)

        data_scaled = self.scaler.transform(data.reshape(-1, 1)).flatten()
        return self._reconstruction_errors(self._to_windows(data_scaled))

class CNN1D:
    """1D CNN for accelerometer oscillation patterns.

    A convolutional autoencoder trained to reconstruct sliding windows of
    ``seq_len`` scaled samples; the mean absolute reconstruction error of a
    window is its anomaly score.
    """
    def __init__(self, seq_len: int = 100):
        self.seq_len = seq_len
        self.model = None
        self.scaler = RobustScaler()
        # 99.5th percentile of the training reconstruction errors, set by fit().
        self.threshold = None

    def build_model(self):
        """Build and compile the convolutional autoencoder into ``self.model``.

        Raises:
            RuntimeError: If TensorFlow could not be imported.
        """
        if not KERAS_AVAILABLE:
            raise RuntimeError("TensorFlow not available")

        input_layer = layers.Input(shape=(self.seq_len, 1))

        # Encoder
        x = layers.Conv1D(32, 3, activation='relu', padding='same')(input_layer)
        x = layers.MaxPooling1D(2, padding='same')(x)
        x = layers.Conv1D(16, 3, activation='relu', padding='same')(x)
        encoded = layers.MaxPooling1D(2, padding='same')(x)

        # Decoder
        x = layers.Conv1D(16, 3, activation='relu', padding='same')(encoded)
        x = layers.UpSampling1D(2)(x)
        x = layers.Conv1D(32, 3, activation='relu', padding='same')(x)
        x = layers.UpSampling1D(2)(x)
        decoded = layers.Conv1D(1, 3, activation='linear', padding='same')(x)

        # Each 'same'-padded pooling rounds the length up, so two poolings
        # followed by two x2 upsamplings overshoot seq_len unless it is a
        # multiple of 4 (e.g. 50 -> 25 -> 13 -> 26 -> 52). Trim the surplus so
        # the output can be compared with the input.
        pooled_len = -(-self.seq_len // 2)
        pooled_len = -(-pooled_len // 2)
        surplus = pooled_len * 4 - self.seq_len
        if surplus:
            decoded = layers.Cropping1D((0, surplus))(decoded)

        self.model = Model(input_layer, decoded)
        self.model.compile(optimizer='adam', loss='mse')

    def _to_windows(self, data_scaled: np.ndarray) -> np.ndarray:
        """Turn a scaled 1-D series into model input of shape (n, seq_len, 1)."""
        sequences = create_sequences(data_scaled, self.seq_len)
        return sequences.reshape(-1, self.seq_len, 1)

    def _reconstruction_errors(self, X: np.ndarray) -> np.ndarray:
        """Return the mean absolute reconstruction error of each window."""
        recon = self.model.predict(X, verbose=0)
        return np.mean(np.abs(X - recon), axis=(1, 2))

    def fit(self, data: np.ndarray, epochs: int = 50):
        """Fit the scaler and the autoencoder, then calibrate ``threshold``.

        Args:
            data: 1-D training series.
            epochs: Number of training epochs.

        Raises:
            ValueError: If ``data`` has fewer than ``seq_len`` samples, i.e.
                not even one training window can be formed.
        """
        data = np.asarray(data, dtype=float)
        if len(data) < self.seq_len:
            raise ValueError(
                f"Need at least {self.seq_len} samples to train, got {len(data)}"
            )

        data_scaled = self.scaler.fit_transform(data.reshape(-1, 1)).flatten()
        X = self._to_windows(data_scaled)

        if self.model is None:
            self.build_model()

        self.model.fit(X, X, epochs=epochs, batch_size=32, verbose=0)

        self.threshold = np.percentile(self._reconstruction_errors(X), 99.5)

    def score(self, data: np.ndarray) -> np.ndarray:
        """Return anomaly scores (higher = more anomalous).

        One score is produced per sliding window, so the result has
        ``len(data) - seq_len + 1`` entries (none when ``data`` is shorter
        than ``seq_len``).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError("CNN1D.score() called before fit()")

        data = np.asarray(data, dtype=float)
        if len(data) < self.seq_len:
            return np.empty(0)

        data_scaled = self.scaler.transform(data.reshape(-1, 1)).flatten()
        return self._reconstruction_errors(self._to_windows(data_scaled))

class ARIMADetector:
    """ARIMA-based detector for temperature (slow trends).

    Scores are absolute one-step-ahead prediction errors. When ARIMA is
    unavailable or fails, the absolute deviation from a 10-sample trailing
    moving average is used instead.
    """
    def __init__(self, order=(2, 1, 2)):
        self.order = order
        self.model = None
        # 99.5th percentile of the training scores, set by fit().
        self.threshold = None

    @staticmethod
    def _fallback_scores(data: np.ndarray) -> np.ndarray:
        """Absolute deviation of each sample from its trailing moving average."""
        ma = pd.Series(data).rolling(10, min_periods=1).mean().values
        return np.abs(data - ma)

    def fit(self, data: np.ndarray):
        """Fit the ARIMA model and calibrate ``threshold`` on its residuals.

        If statsmodels is missing or the fit fails, ``model`` stays ``None``
        and the threshold is calibrated on the moving-average fallback scores
        of the training data instead, so both scoring paths have a threshold
        derived from training data.
        """
        data = np.asarray(data, dtype=float)
        self.model = None

        if not STATSMODELS_AVAILABLE:
            print("ARIMA unavailable, using simple moving average")
        else:
            try:
                self.model = ARIMA(data, order=self.order).fit()
                residuals = np.abs(self.model.resid)
                self.threshold = np.percentile(residuals, 99.5)
                return
            except Exception as e:
                print(f"ARIMA fit failed: {e}, using fallback")
                self.model = None

        if len(data):
            self.threshold = float(np.percentile(self._fallback_scores(data), 99.5))

    def score(self, data: np.ndarray) -> np.ndarray:
        """Return one anomaly score per sample (higher = more anomalous).

        With a fitted ARIMA model, ``data`` is treated as the direct
        continuation of the training series and scored by its one-step-ahead
        prediction residuals -- the same quantity ``threshold`` was calibrated
        on. A multi-step forecast from the end of training would instead
        accumulate forecast drift over the whole scoring horizon.
        """
        data = np.asarray(data, dtype=float)
        if len(data) == 0:
            return np.empty(0)

        if self.model is None:
            # Fallback: simple moving average residuals
            return self._fallback_scores(data)

        try:
            # append() re-runs the filter with the fitted parameters over the
            # training series plus `data`; the tail holds the new residuals.
            extended = self.model.append(data, refit=False)
            return np.abs(np.asarray(extended.resid)[-len(data):])
        except Exception as e:
            print(f"ARIMA scoring failed: {e}, using fallback")
            return self._fallback_scores(data)

In [None]:
# ==============================================================================
# ENSEMBLE SYSTEM
# ==============================================================================
class EnsembleAnomalyDetector:
    """Ensemble detector with sensor-type-specific models.

    Model choice per sensor type:

    * ``strain_gauge``      -- Isolation Forest on rolling-window features
    * ``accelerometer_rms`` -- 1D CNN autoencoder (Isolation Forest if
      TensorFlow is unavailable)
    * ``temperature``       -- ARIMA residuals
    """
    def __init__(self):
        self.models = {}
        self.scalers = {}
        # Isolation Forest thresholds, in decision_function space
        # (lower = more anomalous).
        self.thresholds = {}
        # Feature columns each Isolation Forest was trained on.
        self.feature_cols = {}

    @staticmethod
    def _select_feature_cols(feats: pd.DataFrame, include_ctx: bool) -> list:
        """Pick the ``f_*`` (and optionally ``ctx_*``) columns of a feature frame."""
        prefixes = ('f_', 'ctx_') if include_ctx else ('f_',)
        return [c for c in feats.columns if c.startswith(prefixes)]

    def _fit_isolation_forest(self, sensor_type: str, data: pd.DataFrame, *,
                              window: str, step: str, include_ctx: bool,
                              n_estimators: int, contamination: float,
                              percentile: float, n_jobs=None) -> float:
        """Train scaler + Isolation Forest on rolling features and store them.

        Returns the decision_function threshold (the given percentile of the
        training scores).
        """
        feats = make_rolling_features(data, window=window, step=step)
        cols = self._select_feature_cols(feats, include_ctx)

        scaler = RobustScaler()
        X_scaled = scaler.fit_transform(feats[cols])

        model = IsolationForest(n_estimators=n_estimators, contamination=contamination,
                                random_state=42, n_jobs=n_jobs)
        model.fit(X_scaled)
        threshold = np.percentile(model.decision_function(X_scaled), percentile)

        self.models[sensor_type] = model
        self.scalers[sensor_type] = scaler
        self.thresholds[sensor_type] = threshold
        self.feature_cols[sensor_type] = cols
        return threshold

    def _score_isolation_forest(self, sensor_type: str, data: pd.DataFrame, *,
                                window: str, step: str, include_ctx: bool) -> np.ndarray:
        """Score rolling windows of *data*; higher = more anomalous."""
        feats = make_rolling_features(data, window=window, step=step)
        # Use the columns seen at training time so the scaler/model receive
        # the same features; older pickles without the attribute re-infer them.
        cols = getattr(self, 'feature_cols', {}).get(sensor_type)
        if not cols:
            cols = self._select_feature_cols(feats, include_ctx)
        X_scaled = self.scalers[sensor_type].transform(feats[cols])
        # decision_function is lower for outliers; flip the sign.
        return -self.models[sensor_type].decision_function(X_scaled)

    def _forget_forest_state(self, sensor_type: str):
        """Drop Isolation Forest leftovers when a different model kind is stored."""
        self.scalers.pop(sensor_type, None)
        self.thresholds.pop(sensor_type, None)
        self.feature_cols.pop(sensor_type, None)

    def fit_sensor_type(self, sensor_type: str, data: pd.DataFrame):
        """Fit appropriate model for sensor type.

        Args:
            sensor_type: One of ``strain_gauge``, ``accelerometer_rms``,
                ``temperature``; other values train nothing and print a warning.
            data: Training frame with a DatetimeIndex and a ``value`` column.
        """
        print(f"\n{'='*60}")
        print(f"Training model for: {sensor_type}")
        print(f"{'='*60}")

        values = data['value'].values

        if sensor_type == 'strain_gauge':
            # Isolation Forest for explainability
            threshold = self._fit_isolation_forest(
                sensor_type, data, window='30s', step='15s', include_ctx=True,
                n_estimators=300, contamination=0.005, percentile=0.5, n_jobs=-1,
            )
            print(f"✓ Isolation Forest trained (threshold: {threshold:.4f})")

        elif sensor_type == 'accelerometer_rms':
            if KERAS_AVAILABLE:
                # 1D CNN for oscillations
                model = CNN1D(seq_len=100)
                model.fit(values, epochs=30)
                self.models[sensor_type] = model
                self._forget_forest_state(sensor_type)
                print(f"✓ 1D CNN trained (threshold: {model.threshold:.6f})")
            else:
                # Fallback to Isolation Forest
                self._fit_isolation_forest(
                    sensor_type, data, window='10s', step='5s', include_ctx=False,
                    n_estimators=200, contamination=0.01, percentile=1,
                )
                print(f"✓ Isolation Forest (fallback) trained")

        elif sensor_type == 'temperature':
            # ARIMA for slow trends
            model = ARIMADetector(order=(2, 1, 2))
            model.fit(values)
            self.models[sensor_type] = model
            self._forget_forest_state(sensor_type)
            print(f"✓ ARIMA trained")

        else:
            print(f"Warning: no model defined for sensor type '{sensor_type}', skipping")

    def score_sensor(self, sensor_type: str, data: pd.DataFrame) -> np.ndarray:
        """Score data for given sensor type (higher = more anomalous).

        The number of scores depends on the model: one per rolling window for
        Isolation Forest, one per sequence window for the CNN, one per sample
        for ARIMA. Unknown sensor types yield all-zero scores.
        """
        if sensor_type not in self.models:
            print(f"Warning: No model for {sensor_type}")
            return np.zeros(len(data))

        model = self.models[sensor_type]

        if sensor_type == 'strain_gauge':
            return self._score_isolation_forest(
                sensor_type, data, window='30s', step='15s', include_ctx=True)

        elif sensor_type == 'accelerometer_rms':
            if isinstance(model, CNN1D):
                return model.score(data['value'].values)
            return self._score_isolation_forest(
                sensor_type, data, window='10s', step='5s', include_ctx=False)

        elif sensor_type == 'temperature':
            return model.score(data['value'].values)

        return np.zeros(len(data))

    def detect_anomalies(self, sensor_type: str, data: pd.DataFrame) -> np.ndarray:
        """Detect anomalies with sensor-specific threshold.

        Returns a 0/1 array with one entry per score from ``score_sensor``.
        The threshold is, in order of preference: the Isolation Forest
        threshold stored at training time, the model's own ``threshold``
        attribute, or the 99.5th percentile of the scores being evaluated.
        """
        scores = np.asarray(self.score_sensor(sensor_type, data))
        if scores.size == 0:
            # np.percentile cannot handle an empty array.
            return np.zeros(0, dtype=int)

        if sensor_type in self.thresholds:
            # Stored thresholds live in decision_function space, scores are
            # negated decision values, hence the sign flip.
            return (scores > -self.thresholds[sensor_type]).astype(int)

        threshold = getattr(self.models.get(sensor_type), 'threshold', None)
        if threshold is None:  # explicit None check: 0.0 is a valid threshold
            threshold = np.percentile(scores, 99.5)
        return (scores > threshold).astype(int)

In [None]:
# ==============================================================================
# MAIN EXECUTION
# ==============================================================================
if __name__ == "__main__":
    # Train one model per sensor type on the first 70% of a representative
    # sensor's data, persist the ensemble, then evaluate on the remaining 30%.

    # Load data
    print("Loading data...")
    df = load_df_with_datetimeindex(os.path.join(DATA_PATH, "synthetic_bridge_data.csv"))
    print(f"Loaded {len(df)} rows")
    sensor_types = df['sensor_type'].unique()
    print(f"Sensor types: {sensor_types}")

    # Initialize ensemble
    ensemble = EnsembleAnomalyDetector()

    # Build the train/test split once per sensor type so training and
    # evaluation are guaranteed to use the same boundary.
    splits = {}
    for sensor_type in sensor_types:
        sensor_data = df[df['sensor_type'] == sensor_type]

        # Use first sensor as representative
        first_sensor = sensor_data['sensor_id'].iloc[0]
        single_sensor = sensor_data[sensor_data['sensor_id'] == first_sensor].copy()

        # The split index must come from the single-sensor frame: computing it
        # from all sensors of the type overshoots the subset and leaves the
        # test part empty or truncated whenever a type has several sensors.
        split_at = int(len(single_sensor) * 0.7)
        splits[sensor_type] = (single_sensor.iloc[:split_at], single_sensor.iloc[split_at:])

    # Train models for each sensor type
    for sensor_type, (train_data, _) in splits.items():
        ensemble.fit_sensor_type(sensor_type, train_data)

    # Save ensemble
    os.makedirs(ARTIFACT_DIR, exist_ok=True)
    joblib.dump(ensemble, os.path.join(ARTIFACT_DIR, "ensemble_detector.pkl"))
    print(f"\n✓ Ensemble saved to {ARTIFACT_DIR}")

    # Evaluate on test set
    print("\n" + "="*60)
    print("EVALUATION")
    print("="*60)

    results = {}
    for sensor_type, (_, test_data) in splits.items():
        if len(test_data) < 100:
            print(f"\n{sensor_type}: skipped (only {len(test_data)} test rows)")
            continue

        alerts = ensemble.detect_anomalies(sensor_type, test_data)

        if 'anomaly' not in test_data.columns:
            continue

        y_true = test_data['anomaly'].values
        if len(alerts) != len(y_true):
            # Window-based detectors emit one alert per window, which cannot
            # be compared with per-row labels directly.
            print(f"\n{sensor_type}: metrics skipped "
                  f"({len(alerts)} alerts vs {len(y_true)} labelled rows)")
            continue

        p, r, f1, _ = precision_recall_fscore_support(y_true, alerts, average='binary', zero_division=0)
        results[sensor_type] = {'precision': p, 'recall': r, 'f1': f1}
        print(f"\n{sensor_type}:")
        print(f"  Precision: {p:.3f}  Recall: {r:.3f}  F1: {f1:.3f}")

    print("\n" + "="*60)
    print("TRAINING COMPLETE")
    print("="*60)
    print(f"Models saved to: {ARTIFACT_DIR}")
    print("Artifacts:")
    print("  - ensemble_detector.pkl")
    print("\nNext steps:")
    print("  1. Deploy ensemble to production pipeline")
    print("  2. Monitor model drift with rolling statistics")
    print("  3. Implement score fusion for multi-sensor alerts")
    print("  4. Set up real-time Streamlit dashboard")

Loading data...


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/ComplexSystemDesign/Data/synthetic_bridge_data.csv'