# SWIM Research — Phase 2: Deep Learning Baselines (GPU)

**Builds on:** Phase 1 results (AUROC 0.814 ensemble baseline)

### What this notebook does:
1. Loads the Phase 1 unified research dataset (`unified_research_dataset.parquet`) and re-creates the train/val/test splits
2. Creates temporal sequences (7-day sliding windows) for LSTM
3. Trains LSTM baseline
4. Trains lightweight Transformer baseline
5. Trains XGBoost (proper gradient boosting baseline)
6. Runs **modality ablation** (in-situ only, satellite only, all) — fixes Phase 1 gap
7. Compares ALL models: GB, RF, Ensemble, XGBoost, LSTM, Transformer
8. Runs **modality dropout experiment** (RQ1 core experiment)
9. Exports everything as ZIP

### Requires:
- GPU (LSTM + Transformer training)
- Phase 1 results (`unified_research_dataset.parquet` under `results/data/`); the notebook stops with an error if they are missing, so run Phase 1 first

In [None]:
# ─── 0. Setup ───
import os, sys
IN_COLAB = 'google.colab' in sys.modules

# Install if needed
# !pip install -q torch pandas numpy matplotlib seaborn scikit-learn xgboost pyarrow

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pickle
import zipfile
import warnings
from pathlib import Path
from datetime import datetime
from collections import OrderedDict
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import (roc_auc_score, average_precision_score, brier_score_loss,
                             roc_curve, precision_recall_curve)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve

# XGBoost is optional: cells that need it check HAS_XGB and skip otherwise.
try:
    import xgboost as xgb
    HAS_XGB = True
    print('XGBoost available')
except ImportError:
    HAS_XGB = False
    print('XGBoost not installed, will skip. Install with: pip install xgboost')

# GPU check: `device` is the target for every model and tensor created below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')
if torch.cuda.is_available():
    print(f'  GPU: {torch.cuda.get_device_name(0)}')
    # The device-properties field is `total_memory` (bytes); `total_mem` does
    # not exist and raised AttributeError on GPU machines.
    print(f'  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

# Global plotting defaults for every figure in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 11

# Output layout: results_phase2/{figures,models}
RESULTS_DIR = Path('results_phase2')
RESULTS_DIR.mkdir(exist_ok=True)
(RESULTS_DIR / 'figures').mkdir(exist_ok=True)
(RESULTS_DIR / 'models').mkdir(exist_ok=True)

# Seed NumPy and PyTorch so masks, weight init and batch shuffling are repeatable.
np.random.seed(42)
torch.manual_seed(42)
print('Setup complete.')

---
## 1. Load Phase 1 Data

In [None]:
# Locate the Phase 1 output directory: prefer ./results/data, otherwise try
# ../results/data. There is no raw-data fallback: the cell aborts if the
# unified parquet is missing.
phase1_dir = Path('results/data') if Path('results/data').exists() else Path('../results/data')

if not (phase1_dir / 'unified_research_dataset.parquet').exists():
    print('Phase 1 outputs not found. Run Phase 1 notebook first.')
    print(f'Looked in: {phase1_dir.absolute()}')
    raise FileNotFoundError('Run Phase 1 first')

print('Loading Phase 1 outputs...')
research_df = pd.read_parquet(phase1_dir / 'unified_research_dataset.parquet')
print(f'Loaded: {len(research_df):,} rows, {len(research_df.columns)} cols')

# Quick sanity summary: rows per lake, covered date span, positive-class rate.
print(f'Lakes: {research_df["lake"].value_counts().to_dict()}')
print(f'Date range: {research_df["date"].min()} → {research_df["date"].max()}')
print(f'Bloom rate: {research_df["bloom_label"].mean():.3f}')

In [None]:
# ─── Define feature sets ───
# Columns grouped under the in-situ modality.
INSITU_FEATURES = [
    'chlorophyll_a', 'turbidity', 'dissolved_oxygen', 'ph',
    'temperature', 'conductivity', 'wind_speed', 'air_temperature', 'humidity',
]

# Columns grouped under the satellite modality.
SATELLITE_FEATURES = [
    'ndvi', 'surface_temperature', 'chlorophyll_index',
    'turbidity_index', 'cloud_coverage',
]

# Only columns actually present in the loaded dataframe are used; the order is
# in-situ first, then satellite, which later index-based slicing relies on.
ALL_FEATURES = [
    col for col in [*INSITU_FEATURES, *SATELLITE_FEATURES]
    if col in research_df.columns
]
TARGET = 'bloom_label'

print(f'Features ({len(ALL_FEATURES)}): {ALL_FEATURES}')

In [None]:
# ─── Stratified split (same as Phase 1 fallback) ───
X_all = research_df[ALL_FEATURES].values
y_all = research_df[TARGET].values

# Split row indices first so that preprocessing can be fit on training rows
# only. Fitting the imputer/scaler on the full dataset before splitting leaks
# validation/test statistics into training. The stratified split depends only
# on the labels, the sample count and random_state, so the 70/15/15 partition
# is the same one produced by splitting the feature matrix directly.
idx_all = np.arange(len(y_all))
idx_train, idx_temp = train_test_split(
    idx_all, test_size=0.3, random_state=42, stratify=y_all)
idx_val, idx_test = train_test_split(
    idx_temp, test_size=0.5, random_state=42, stratify=y_all[idx_temp])

imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
scaler.fit(imputer.fit_transform(X_all[idx_train]))

# Transform every row with the train-fitted preprocessing.
X_all_imp = imputer.transform(X_all)
X_all_scaled = scaler.transform(X_all_imp)

X_train, y_train = X_all_scaled[idx_train], y_all[idx_train]
X_temp, y_temp = X_all_scaled[idx_temp], y_all[idx_temp]
X_val, y_val = X_all_scaled[idx_val], y_all[idx_val]
X_test, y_test = X_all_scaled[idx_test], y_all[idx_test]

print(f'Train: {X_train.shape} | bloom={y_train.mean():.3f}')
print(f'Val:   {X_val.shape} | bloom={y_val.mean():.3f}')
print(f'Test:  {X_test.shape} | bloom={y_test.mean():.3f}')

---
## 2. Create Temporal Sequences for LSTM

In [None]:
# ─── Build sequences per lake (7-day sliding window) ───
SEQ_LEN = 7

def create_sequences(df, features, target, seq_len, scaler_fit=None, imputer_fit=None):
    """Create sliding-window sequences per lake.

    Rows are sorted by ``lake`` then ``date``. For every row ``i`` the sample
    is the ``seq_len`` rows immediately before it (features) paired with the
    target of row ``i`` itself. A sample is kept only when all ``seq_len + 1``
    rows belong to the same lake. Windows are built from consecutive rows;
    gaps between dates are not checked.

    Args:
        df: DataFrame with ``lake``, ``date``, the feature columns and the
            target column.
        features: list of feature column names.
        target: name of the label column.
        seq_len: number of past rows per sequence.
        scaler_fit: already-fitted scaler to reuse. When ``None`` a new
            imputer and StandardScaler are fit on ``df``.
        imputer_fit: already-fitted imputer to reuse together with
            ``scaler_fit``. If ``scaler_fit`` is given without it, a median
            imputer is fit on ``df`` as a fallback (previously this path
            called ``transform`` on an unfitted imputer and raised).

    Returns:
        Tuple ``(sequences, labels, scaler, imputer)``. ``sequences`` has
        shape ``(n_samples, seq_len, len(features))``, also when empty.
    """
    df_sorted = df.sort_values(['lake', 'date'])
    X_raw = df_sorted[features].values

    if scaler_fit is None:
        # Training path: fit preprocessing on this frame.
        imp = SimpleImputer(strategy='median')
        sc = StandardScaler()
        X_proc = sc.fit_transform(imp.fit_transform(X_raw))
    else:
        # Evaluation path: reuse the statistics fit on the training frame.
        sc = scaler_fit
        if imputer_fit is None:
            imp = SimpleImputer(strategy='median').fit(X_raw)
        else:
            imp = imputer_fit
        X_proc = sc.transform(imp.transform(X_raw))

    y_raw = df_sorted[target].values
    lake_ids = df_sorted['lake'].values

    sequences = []
    labels = []
    for i in range(seq_len, len(X_proc)):
        # Only create sequence if all days (inputs + target row) are from the same lake
        if len(set(lake_ids[i-seq_len:i+1])) == 1:
            sequences.append(X_proc[i-seq_len:i])
            labels.append(y_raw[i])

    if sequences:
        X_seq = np.array(sequences)
    else:
        # Keep a 3-D shape so downstream shape prints and slicing still work.
        X_seq = np.empty((0, seq_len, len(features)))

    return X_seq, np.array(labels), sc, imp

# Split data by time first, then create sequences
df_sorted = research_df.sort_values('date').reset_index(drop=True)
n = len(df_sorted)
train_df = df_sorted.iloc[:int(n*0.7)]
val_df = df_sorted.iloc[int(n*0.7):int(n*0.85)]
test_df = df_sorted.iloc[int(n*0.85):]

# Preprocessing is fit on the training frame and reused for val/test.
X_seq_train, y_seq_train, seq_scaler, seq_imputer = create_sequences(
    train_df, ALL_FEATURES, TARGET, SEQ_LEN)
X_seq_val, y_seq_val, _, _ = create_sequences(
    val_df, ALL_FEATURES, TARGET, SEQ_LEN,
    scaler_fit=seq_scaler, imputer_fit=seq_imputer)
X_seq_test, y_seq_test, _, _ = create_sequences(
    test_df, ALL_FEATURES, TARGET, SEQ_LEN,
    scaler_fit=seq_scaler, imputer_fit=seq_imputer)

print(f'Sequence length: {SEQ_LEN} days')
print(f'Train sequences: {X_seq_train.shape} | bloom={y_seq_train.mean():.3f}')
print(f'Val sequences:   {X_seq_val.shape} | bloom={y_seq_val.mean():.3f}')
print(f'Test sequences:  {X_seq_test.shape} | bloom={y_seq_test.mean():.3f}')

# Check if we have enough sequences with both classes
has_seq_both_train = len(np.unique(y_seq_train)) > 1
has_seq_both_test = len(np.unique(y_seq_test)) > 1
print(f'\nTrain has both classes: {has_seq_both_train}')
print(f'Test has both classes: {has_seq_both_test}')

if not has_seq_both_train or not has_seq_both_test:
    print('\n  Temporal sequences lack both classes. Using stratified approach for LSTM too.')
    # Create sequences from full data, then stratified split.
    # NOTE(review): windows built from the full frame overlap in time, so a
    # random split places near-duplicate windows in train and test, and the
    # preprocessing is fit on all rows. Treat metrics from this fallback as
    # optimistic.
    X_seq_all, y_seq_all, seq_scaler, seq_imputer = create_sequences(
        research_df, ALL_FEATURES, TARGET, SEQ_LEN)

    X_seq_train, X_seq_temp, y_seq_train, y_seq_temp = train_test_split(
        X_seq_all, y_seq_all, test_size=0.3, random_state=42, stratify=y_seq_all)
    X_seq_val, X_seq_test, y_seq_val, y_seq_test = train_test_split(
        X_seq_temp, y_seq_temp, test_size=0.5, random_state=42, stratify=y_seq_temp)

    print(f'  Stratified Train: {X_seq_train.shape} | bloom={y_seq_train.mean():.3f}')
    print(f'  Stratified Val:   {X_seq_val.shape} | bloom={y_seq_val.mean():.3f}')
    print(f'  Stratified Test:  {X_seq_test.shape} | bloom={y_seq_test.mean():.3f}')

---
## 3. LSTM Model

In [None]:
# ─── 3A: LSTM Architecture ───
class LSTMClassifier(nn.Module):
    """Stacked LSTM followed by a two-layer head that emits one logit per sequence.

    Input is a batch-first tensor ``(batch, time, input_dim)``; the output is
    a 1-D tensor of raw logits (no sigmoid applied).
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=2, dropout=0.3):
        super().__init__()
        # Inter-layer LSTM dropout is only requested when layers are stacked.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_dim, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        """Return one logit per sequence in ``x``."""
        sequence_out, _ = self.lstm(x)
        # Summarise the sequence by the top-layer output at the last timestep.
        final_step = sequence_out[:, -1, :]
        hidden = self.fc1(self.dropout(final_step))
        logits = self.fc2(self.relu(hidden))
        return logits.squeeze(-1)

# Report the LSTM configuration; hidden/layers/dropout are hard-coded here and
# match the LSTMClassifier constructor defaults.
print(f'LSTM: input_dim={len(ALL_FEATURES)}, hidden=64, layers=2, dropout=0.3')

In [None]:
# ─── 3B: Train LSTM ───
def train_torch_model(model, X_train, y_train, X_val, y_val,
                      epochs=100, batch_size=64, lr=0.001, patience=15):
    """Train a binary classifier that outputs logits, with early stopping.

    The loss is ``BCEWithLogitsLoss`` with a positive-class weight derived
    from the training label rate. After each epoch the full validation set is
    scored; the weights with the lowest validation loss are restored before
    returning.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to 1-D logits. Data is
            placed on the device the model's parameters live on.
        X_train, y_train: training inputs and 0/1 labels (array-like).
        X_val, y_val: validation inputs and labels (array-like).
        epochs: maximum number of epochs.
        batch_size: mini-batch size for the shuffled training loader.
        lr: Adam learning rate.
        patience: epochs without validation-loss improvement before stopping.

    Returns:
        ``(model, history)`` where ``history`` holds per-epoch ``train_loss``,
        ``val_loss`` and ``val_auroc`` lists.
    """
    # Follow the model's device instead of a notebook global so inputs and
    # weights can never end up on different devices.
    dev = next(model.parameters()).device

    train_ds = TensorDataset(
        torch.as_tensor(X_train, dtype=torch.float32, device=dev),
        torch.as_tensor(y_train, dtype=torch.float32, device=dev))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    X_val_t = torch.as_tensor(X_val, dtype=torch.float32, device=dev)
    y_val_t = torch.as_tensor(y_val, dtype=torch.float32, device=dev)

    # Class weight for imbalanced data: negatives / positives, with the
    # positive rate floored at 1% to avoid division by zero.
    pos_rate = float(np.mean(y_train))
    pos_weight = torch.tensor([(1 - pos_rate) / max(pos_rate, 0.01)],
                              dtype=torch.float32, device=dev)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

    best_val_loss = float('inf')
    best_state = None
    wait = 0
    history = {'train_loss': [], 'val_loss': [], 'val_auroc': []}

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_losses.append(loss.item())
        epoch_train_loss = np.mean(train_losses)

        model.eval()
        with torch.no_grad():
            val_logits = model(X_val_t)
            val_loss = criterion(val_logits, y_val_t).item()
            val_probs = torch.sigmoid(val_logits).cpu().numpy()
            try:
                val_auroc = roc_auc_score(y_val, val_probs)
            except ValueError:
                # AUROC is undefined (e.g. a single class in y_val); record chance level.
                val_auroc = 0.5

        scheduler.step(val_loss)
        history['train_loss'].append(epoch_train_loss)
        history['val_loss'].append(val_loss)
        history['val_auroc'].append(val_auroc)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Clone so later optimizer steps do not overwrite the snapshot.
            best_state = {k: v.clone() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1

        if (epoch + 1) % 10 == 0:
            print(f'  Epoch {epoch+1:3d} | train_loss={epoch_train_loss:.4f} | '
                  f'val_loss={val_loss:.4f} | val_auroc={val_auroc:.4f}')

        if wait >= patience:
            print(f'  Early stopping at epoch {epoch+1}')
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    else:
        # No epoch ran or the validation loss was never finite: there is no
        # snapshot to restore, so keep the current weights instead of crashing.
        print('  Warning: no best checkpoint recorded; returning current weights')
    return model, history

# Train LSTM
# The model is moved to `device` before training. train_torch_model returns
# the model with its best-validation-loss weights plus the per-epoch history.
print('Training LSTM...')
lstm_model = LSTMClassifier(input_dim=len(ALL_FEATURES)).to(device)
lstm_model, lstm_history = train_torch_model(
    lstm_model, X_seq_train, y_seq_train, X_seq_val, y_seq_val,
    epochs=100, batch_size=64, lr=0.001, patience=15)
print('LSTM training complete.')

---
## 4. Transformer Model

In [None]:
# ─── 4A: Lightweight Transformer ───
class TransformerClassifier(nn.Module):
    """Small Transformer encoder with a learned positional table and mean pooling.

    Input is a batch-first tensor ``(batch, time, input_dim)``; the output is
    a 1-D tensor of raw logits (no sigmoid applied).

    Args:
        input_dim: number of features per timestep.
        d_model: embedding width used inside the encoder.
        nhead: number of attention heads.
        num_layers: number of encoder layers.
        dropout: dropout rate for the encoder layers and the head.
        max_len: number of positions in the learned positional table, i.e.
            the longest sequence the model accepts (default 100, as before).
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2, dropout=0.3,
                 max_len=100):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        # Learned positional embeddings, small random init.
        self.pos_encoding = nn.Parameter(torch.randn(1, max_len, d_model) * 0.1)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=128,
            dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(d_model, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        """Return one logit per sequence in ``x``.

        Raises:
            ValueError: if the sequence is longer than the positional table.
        """
        seq_len = x.size(1)
        max_len = self.pos_encoding.size(1)
        if seq_len > max_len:
            # Fail with a clear message instead of an opaque broadcast error.
            raise ValueError(
                f'Sequence length {seq_len} exceeds the positional table size {max_len}')
        x = self.input_proj(x)
        x = x + self.pos_encoding[:, :seq_len, :]
        x = self.transformer(x)
        x = x.mean(dim=1)  # global average pooling over time
        x = self.dropout(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x.squeeze(-1)

# Report the Transformer configuration; the values are hard-coded here and
# match the TransformerClassifier constructor defaults.
print(f'Transformer: d_model=64, heads=4, layers=2')

In [None]:
# ─── 4B: Train Transformer ───
# Same training routine and sequence splits as the LSTM; only the learning
# rate differs (0.0005 here vs 0.001 for the LSTM).
print('Training Transformer...')
transformer_model = TransformerClassifier(input_dim=len(ALL_FEATURES)).to(device)
transformer_model, transformer_history = train_torch_model(
    transformer_model, X_seq_train, y_seq_train, X_seq_val, y_seq_val,
    epochs=100, batch_size=64, lr=0.0005, patience=15)
print('Transformer training complete.')

---
## 5. Tabular Baselines (XGBoost + Phase 1 models)

In [None]:
# ─── 5A: Train all tabular models ───
# Insertion order (GB, RF, then optional XGBoost) is the order used by every
# later loop over `tabular_models`.
tabular_models = OrderedDict([
    ('GradientBoosting', GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        min_samples_leaf=10,
        random_state=42,
    )),
    ('RandomForest', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
    )),
])

if HAS_XGB:
    # Only XGBoost receives an explicit class-imbalance weight
    # (negatives / positives, positive rate floored at 1%).
    tabular_models['XGBoost'] = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=(1 - y_train.mean()) / max(y_train.mean(), 0.01),
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
    )

# Fit each model on the flat training split and report wall-clock time.
for name, model in tabular_models.items():
    t0 = datetime.now()
    model.fit(X_train, y_train)
    elapsed = (datetime.now() - t0).total_seconds()
    print(f'  {name}: trained in {elapsed:.1f}s')

print('All tabular models trained.')

---
## 6. Unified Evaluation — ALL Models

In [None]:
# ─── 6A: Evaluation function ───
def evaluate(name, y_true, y_prob):
    """Score predicted probabilities against binary labels.

    Hard predictions use a fixed 0.5 threshold. AUROC and AUPRC are reported
    as NaN when ``y_true`` contains a single class.

    Returns:
        dict with keys ``model``, ``AUROC``, ``AUPRC``, ``Brier``,
        ``Accuracy``, ``TP``, ``FP``, ``FN``, ``TN``.
    """
    y_pred = (y_prob >= 0.5).astype(int)

    # Ranking metrics need both classes present in the ground truth.
    if len(np.unique(y_true)) > 1:
        auroc = roc_auc_score(y_true, y_prob)
        auprc = average_precision_score(y_true, y_prob)
    else:
        auroc = np.nan
        auprc = np.nan

    brier = brier_score_loss(y_true, y_prob)
    accuracy = (y_pred == y_true).mean()

    # Confusion-matrix cells from boolean masks.
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)
    true_pos, true_neg = (y_true == 1), (y_true == 0)

    return {
        'model': name,
        'AUROC': auroc,
        'AUPRC': auprc,
        'Brier': brier,
        'Accuracy': accuracy,
        'TP': int((pred_pos & true_pos).sum()),
        'FP': int((pred_pos & true_neg).sum()),
        'FN': int((pred_neg & true_pos).sum()),
        'TN': int((pred_neg & true_neg).sum()),
    }

def get_torch_probs(model, X):
    """Run ``model`` on ``X`` in eval mode and return sigmoid probabilities as a NumPy array.

    The whole input is sent to ``device`` and scored in a single forward pass
    with gradients disabled.
    """
    model.eval()
    inputs = torch.FloatTensor(X).to(device)
    with torch.no_grad():
        probabilities = torch.sigmoid(model(inputs))
    return probabilities.cpu().numpy()

# Collect all predictions
# Each entry is the metrics dict returned by evaluate().
# NOTE(review): tabular models are scored on the flat split (X_test/y_test)
# while LSTM/Transformer are scored on the sequence split (y_seq_test). The two
# test sets contain different samples, so the rows are not strictly comparable.
all_results = []

# Tabular models (on flat features)
# ens_prob accumulates the unweighted mean of the tabular probabilities.
ens_prob = np.zeros(len(y_test))
for name, model in tabular_models.items():
    prob = model.predict_proba(X_test)[:, 1]
    ens_prob += prob / len(tabular_models)
    all_results.append(evaluate(name, y_test, prob))

all_results.append(evaluate('Tabular Ensemble', y_test, ens_prob))

# LSTM (on sequences)
lstm_prob = get_torch_probs(lstm_model, X_seq_test)
all_results.append(evaluate('LSTM', y_seq_test, lstm_prob))

# Transformer (on sequences)
transformer_prob = get_torch_probs(transformer_model, X_seq_test)
all_results.append(evaluate('Transformer', y_seq_test, transformer_prob))

# DL Ensemble: simple average of the two sequence models' probabilities
dl_ens_prob = (lstm_prob + transformer_prob) / 2
all_results.append(evaluate('DL Ensemble (LSTM+Trans)', y_seq_test, dl_ens_prob))

# Display (one row per model, best AUROC first)
print('ALL MODELS — TEST SET RESULTS')
print('=' * 90)
results_df = pd.DataFrame(all_results)
display(results_df.sort_values('AUROC', ascending=False).reset_index(drop=True))

In [None]:
# ─── 6B: Training curves (LSTM & Transformer) ───
fig, axes = plt.subplots(1, 3, figsize=(20, 5))

# Panels 0 and 1: per-epoch losses and validation AUROC for both sequence models.
for hist, name, color in [(lstm_history, 'LSTM', 'steelblue'),
                           (transformer_history, 'Transformer', 'coral')]:
    axes[0].plot(hist['train_loss'], label=f'{name} train', color=color, linewidth=2)
    axes[0].plot(hist['val_loss'], '--', label=f'{name} val', color=color, linewidth=2)
    axes[1].plot(hist['val_auroc'], label=name, color=color, linewidth=2)

axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Training Loss', fontweight='bold')
axes[0].legend()

axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('AUROC')
axes[1].set_title('Validation AUROC', fontweight='bold')
axes[1].legend()

# Panel 2: model comparison bar chart (models without a defined AUROC are dropped).
plot_df = results_df.dropna(subset=['AUROC']).sort_values('AUROC', ascending=True)
# Deep-learning rows are coral, tabular rows steelblue.
colors = ['coral' if ('LSTM' in m or 'Trans' in m) else 'steelblue'
          for m in plot_df['model']]
axes[2].barh(plot_df['model'], plot_df['AUROC'], color=colors, edgecolor='white')
axes[2].set_xlabel('AUROC')
axes[2].set_title('Model Comparison (Test AUROC)', fontweight='bold')
axes[2].axvline(0.5, color='gray', linestyle=':', alpha=0.5)
# Annotate every bar with its AUROC value just right of the bar end.
for i, (_, row) in enumerate(plot_df.iterrows()):
    axes[2].text(row['AUROC'] + 0.005, i, f'{row["AUROC"]:.3f}', va='center', fontsize=9)

plt.suptitle('Phase 2: Deep Learning Training & Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR / 'figures' / 'fig1_training_curves.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ─── 6C: ROC + PR curves for all models ───
fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# Collect all (name, y_true, y_prob) triples: tabular models on the flat test
# split, sequence models on the sequence test split.
model_preds = [
    (name, y_test, model.predict_proba(X_test)[:, 1])
    for name, model in tabular_models.items()
]
model_preds += [
    ('LSTM', y_seq_test, lstm_prob),
    ('Transformer', y_seq_test, transformer_prob),
]

# Draw each model's ROC curve (left) and PR curve (right) in one pass.
for name, yt, yp in model_preds:
    if len(np.unique(yt)) <= 1:
        # Curves are undefined without both classes.
        continue
    # Sequence models are dashed, tabular models solid.
    ls = '--' if name in ['LSTM', 'Transformer'] else '-'

    fpr, tpr, _ = roc_curve(yt, yp)
    auc = roc_auc_score(yt, yp)
    axes[0].plot(fpr, tpr, ls, label=f'{name} ({auc:.3f})', linewidth=2)

    prec, rec, _ = precision_recall_curve(yt, yp)
    ap = average_precision_score(yt, yp)
    axes[1].plot(rec, prec, ls, label=f'{name} ({ap:.3f})', linewidth=2)

# ROC panel decoration (the diagonal is the chance line).
axes[0].plot([0, 1], [0, 1], 'k:', alpha=0.3)
axes[0].set_xlabel('False Positive Rate')
axes[0].set_ylabel('True Positive Rate')
axes[0].set_title('ROC Curves — All Models', fontweight='bold')
axes[0].legend(fontsize=9)

# PR panel decoration.
axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')
axes[1].set_title('PR Curves — All Models', fontweight='bold')
axes[1].legend(fontsize=9)

plt.suptitle('Phase 2: Full Model Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR / 'figures' / 'fig2_roc_pr_all_models.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 7. Modality Ablation (RQ1 Core Experiment)
What happens when we **remove** satellite OR in-situ features?

In [None]:
# ─── 7A: Ablation across all model types ───
insitu_avail = [feat for feat in INSITU_FEATURES if feat in ALL_FEATURES]
sat_avail = [feat for feat in SATELLITE_FEATURES if feat in ALL_FEATURES]

feature_configs = {
    'All Features': ALL_FEATURES,
    'In-Situ Only': insitu_avail,
    'Satellite Only': sat_avail,
}

ablation_results = []

for config_name, feats in feature_configs.items():
    if not feats:
        # Nothing to train on for this modality.
        continue

    # Column positions of this configuration inside the ALL_FEATURES-ordered arrays.
    feat_idx = [ALL_FEATURES.index(feat) for feat in feats]

    # Tabular models: retrain from scratch on the column subset.
    # NOTE(review): these ablation models use slightly different
    # hyperparameters than the main tabular models (no min_samples_leaf for
    # GB; no colsample_bytree / scale_pos_weight for XGBoost).
    X_tr_sub, X_te_sub = X_train[:, feat_idx], X_test[:, feat_idx]

    gb_abl = GradientBoostingClassifier(
        n_estimators=200, max_depth=5, learning_rate=0.1,
        subsample=0.8, random_state=42)
    gb_abl.fit(X_tr_sub, y_train)
    prob_gb = gb_abl.predict_proba(X_te_sub)[:, 1]
    m = evaluate(f'GB — {config_name}', y_test, prob_gb)
    m.update(config=config_name, model_type='GradientBoosting', n_features=len(feats))
    ablation_results.append(m)

    if HAS_XGB:
        xgb_abl = xgb.XGBClassifier(
            n_estimators=200, max_depth=6, learning_rate=0.1,
            subsample=0.8, random_state=42,
            use_label_encoder=False, eval_metric='logloss')
        xgb_abl.fit(X_tr_sub, y_train)
        prob_xgb = xgb_abl.predict_proba(X_te_sub)[:, 1]
        m2 = evaluate(f'XGB — {config_name}', y_test, prob_xgb)
        m2.update(config=config_name, model_type='XGBoost', n_features=len(feats))
        ablation_results.append(m2)

    # LSTM ablation: slice the feature axis of the sequence tensors.
    X_seq_tr_sub = X_seq_train[:, :, feat_idx]
    X_seq_val_sub = X_seq_val[:, :, feat_idx]
    X_seq_te_sub = X_seq_test[:, :, feat_idx]

    lstm_abl = LSTMClassifier(input_dim=len(feats)).to(device)
    lstm_abl, _ = train_torch_model(
        lstm_abl, X_seq_tr_sub, y_seq_train, X_seq_val_sub, y_seq_val,
        epochs=60, batch_size=64, lr=0.001, patience=10)
    prob_lstm = get_torch_probs(lstm_abl, X_seq_te_sub)
    m3 = evaluate(f'LSTM — {config_name}', y_seq_test, prob_lstm)
    m3.update(config=config_name, model_type='LSTM', n_features=len(feats))
    ablation_results.append(m3)

    print(f'  {config_name} ({len(feats)} feats): GB AUROC={m["AUROC"]:.3f} | LSTM AUROC={m3["AUROC"]:.3f}')

ablation_df = pd.DataFrame(ablation_results)
print('\nFull ablation results:')
display(ablation_df[['model', 'config', 'model_type', 'n_features', 'AUROC', 'AUPRC', 'Brier', 'Accuracy']])

In [None]:
# ─── 7B: Ablation visualization ───
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
config_order = ['All Features', 'In-Situ Only', 'Satellite Only']

# Left panel: grouped bars of AUROC by feature configuration and model type.
pivot = ablation_df.pivot_table(index='config', columns='model_type', values='AUROC')
pivot = pivot.reindex(config_order)
pivot.plot.bar(ax=axes[0], edgecolor='white', width=0.7)
axes[0].set_title('AUROC by Feature Set & Model Type', fontweight='bold')
axes[0].set_ylabel('AUROC')
axes[0].set_xlabel('Feature Configuration')
axes[0].tick_params(axis='x', rotation=0)
axes[0].axhline(0.5, color='gray', linestyle=':', alpha=0.5)
axes[0].legend(title='Model')

# Right panel: relative AUROC loss of each configuration versus the same
# model trained on all features (so 'All Features' is 0 by construction).
deg_data = []
for model_type, sub in ablation_df.groupby('model_type', sort=False):
    full_auroc = sub.loc[sub['config'] == 'All Features', 'AUROC'].values
    if len(full_auroc) == 0:
        # No reference run for this model type.
        continue
    full_auroc = full_auroc[0]
    deg_data.extend(
        {
            'model': model_type,
            'config': row['config'],
            'degradation': (full_auroc - row['AUROC']) / max(full_auroc, 0.01) * 100,
        }
        for _, row in sub.iterrows()
        if pd.notna(row['AUROC'])
    )

deg_df = pd.DataFrame(deg_data)
deg_pivot = deg_df.pivot_table(index='config', columns='model', values='degradation')
deg_pivot = deg_pivot.reindex(config_order)
deg_pivot.plot.bar(ax=axes[1], edgecolor='white', width=0.7)
axes[1].set_title('AUROC Degradation When Dropping Modality (%)', fontweight='bold')
axes[1].set_ylabel('% Degradation from All Features')
axes[1].set_xlabel('Feature Configuration')
axes[1].tick_params(axis='x', rotation=0)
axes[1].axhline(0, color='gray', linestyle='-', alpha=0.3)
axes[1].legend(title='Model')

plt.suptitle('RQ1: Modality Ablation — How Models Handle Missing Data Sources',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR / 'figures' / 'fig3_modality_ablation.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nKey insight for RQ1:')
print('Compare these degradation patterns to how SWIM agents handle the same dropout.')
print('If agents degrade less, that supports the agentic architecture thesis.')

---
## 8. Random Feature Dropout Experiment (RQ1)
Simulate real-world sensor failures: at test time, randomly zero out individual feature values (zero equals the feature mean after standardization)

In [None]:
# ─── 8A: Feature dropout at varying rates ───
dropout_rates = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
n_trials = 10  # average over random trials

dropout_results = []

for drop_rate in dropout_rates:
    for trial in range(n_trials):
        # Element-wise keep-mask: each value is kept with probability
        # 1 - drop_rate; dropped values become 0 in standardized space.
        mask = np.random.random(X_test.shape) > drop_rate
        X_test_dropped = X_test * mask  # zero out dropped features

        # Already-trained tabular models, all scored on the same corrupted matrix
        # (GradientBoosting, RandomForest, then XGBoost when available).
        for name, model in tabular_models.items():
            prob = model.predict_proba(X_test_dropped)[:, 1]
            metrics = evaluate(name, y_test, prob)
            dropout_results.append({'model': name, 'drop_rate': drop_rate,
                                    'trial': trial, 'AUROC': metrics['AUROC']})

        # LSTM dropout: an independent mask over the 3-D sequence tensor.
        seq_mask = np.random.random(X_seq_test.shape) > drop_rate
        X_seq_dropped = X_seq_test * seq_mask
        prob_lstm = get_torch_probs(lstm_model, X_seq_dropped)
        m_lstm = evaluate('LSTM', y_seq_test, prob_lstm)
        dropout_results.append({'model': 'LSTM', 'drop_rate': drop_rate,
                                'trial': trial, 'AUROC': m_lstm['AUROC']})

dropout_df = pd.DataFrame(dropout_results)
print(f'Dropout experiment: {len(dropout_df)} evaluations')
print(dropout_df.groupby(['model', 'drop_rate'])['AUROC'].mean().unstack().round(3))

In [None]:
# ─── 8B: Dropout degradation curves ───
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

for model_name in dropout_df['model'].unique():
    per_rate = dropout_df[dropout_df['model'] == model_name].groupby('drop_rate')['AUROC']
    means = per_rate.mean()
    stds = per_rate.std()

    # Left: mean AUROC per dropout rate with a ±1 std band across trials.
    axes[0].plot(means.index, means.values, 'o-', label=model_name, linewidth=2, markersize=6)
    axes[0].fill_between(means.index, means - stds, means + stds, alpha=0.15)

    # Right: percentage change relative to the lowest dropout rate (no dropout).
    baseline = means.iloc[0]  # no dropout
    relative = (means - baseline) / max(baseline, 0.01) * 100
    axes[1].plot(relative.index, relative.values, 'o-', label=model_name, linewidth=2, markersize=6)

axes[0].set_xlabel('Feature Dropout Rate')
axes[0].set_ylabel('AUROC')
axes[0].set_title('Model Robustness Under Feature Dropout', fontweight='bold')
axes[0].axhline(0.5, color='gray', linestyle=':', alpha=0.5, label='Random')
axes[0].legend()
axes[0].set_xlim(-0.02, 0.72)

axes[1].set_xlabel('Feature Dropout Rate')
axes[1].set_ylabel('AUROC Change (%)')
axes[1].set_title('Relative AUROC Degradation', fontweight='bold')
axes[1].axhline(0, color='gray', linestyle='-', alpha=0.3)
axes[1].legend()
axes[1].set_xlim(-0.02, 0.72)

plt.suptitle('RQ1: Feature Dropout Robustness (Monolithic Models)',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR / 'figures' / 'fig4_feature_dropout.png', dpi=150, bbox_inches='tight')
plt.show()

print('\nThis is the MONOLITHIC baseline for RQ1.')
print('In Phase 3, we compare these curves to the SWIM agentic architecture.')
print('If agents maintain higher AUROC under dropout → paper contribution.')

---
## 9. Save Everything & Export ZIP

In [None]:
# ─── 9A: Save all results ───
results_df.to_csv(RESULTS_DIR / 'all_models_test_results.csv', index=False)
ablation_df.to_csv(RESULTS_DIR / 'ablation_results.csv', index=False)
dropout_df.to_csv(RESULTS_DIR / 'dropout_experiment.csv', index=False)

# Save models: state_dicts for the torch models, pickles for the tabular ones.
torch.save(lstm_model.state_dict(), RESULTS_DIR / 'models' / 'lstm_model.pt')
torch.save(transformer_model.state_dict(), RESULTS_DIR / 'models' / 'transformer_model.pt')
for name, model in tabular_models.items():
    with open(RESULTS_DIR / 'models' / f'{name.lower()}.pkl', 'wb') as f:
        pickle.dump(model, f)

# Save histories (per-epoch train/val loss and validation AUROC)
with open(RESULTS_DIR / 'lstm_history.json', 'w') as f:
    json.dump(lstm_history, f)
with open(RESULTS_DIR / 'transformer_history.json', 'w') as f:
    json.dump(transformer_history, f)

# Hyperparameters recorded in the metadata; values mirror the model
# definitions earlier in the notebook.
model_configs = {
    'LSTM': {'hidden': 64, 'layers': 2, 'dropout': 0.3},
    'Transformer': {'d_model': 64, 'heads': 4, 'layers': 2, 'dropout': 0.3},
    'GradientBoosting': {'n_estimators': 200, 'max_depth': 5},
    'RandomForest': {'n_estimators': 200, 'max_depth': 10},
}
if HAS_XGB:
    # XGBoost is trained, evaluated and pickled above, so record it here too.
    model_configs['XGBoost'] = {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}

# Experiment metadata
meta = {
    'experiment': 'RQ1_Phase2_Deep_Learning_Baselines',
    'date': datetime.now().isoformat(),
    'device': str(device),
    'gpu': torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU',
    'dataset': {
        'total': len(research_df),
        'train_flat': len(y_train),
        'test_flat': len(y_test),
        'train_seq': len(y_seq_train),
        'test_seq': len(y_seq_test),
        'seq_len': SEQ_LEN,
        'n_features': len(ALL_FEATURES),
        'features': ALL_FEATURES,
    },
    'models': model_configs,
    'test_results': results_df.to_dict(orient='records'),
    'ablation_summary': ablation_df[['model', 'config', 'AUROC', 'AUPRC']].to_dict(orient='records'),
}

# default=str stringifies any value json cannot encode natively.
with open(RESULTS_DIR / 'experiment_metadata.json', 'w') as f:
    json.dump(meta, f, indent=2, default=str)

print('All results saved.')

In [None]:
# ─── 9B: Create ZIP ───
zip_path = 'swim_research_phase2_results.zip'

# Archive every file under RESULTS_DIR, stored with paths relative to the
# current working directory.
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
    for dirpath, _subdirs, filenames in os.walk(RESULTS_DIR):
        for filename in filenames:
            member_path = os.path.join(dirpath, filename)
            zf.write(member_path, os.path.relpath(member_path, '.'))

zip_size = os.path.getsize(zip_path) / (1024 * 1024)
print(f'\nResults ZIP: {zip_path} ({zip_size:.1f} MB)')
print('\nContents:')
# Re-open the archive to list each member with its uncompressed size.
with zipfile.ZipFile(zip_path, 'r') as zf:
    for info in zf.infolist():
        print(f'  {info.filename:<55s} {info.file_size/1024:>8.1f} KB')

if not IN_COLAB:
    print(f'\nDownload from: {os.path.abspath(zip_path)}')
else:
    # In Colab, trigger a browser download of the archive.
    from google.colab import files
    files.download(zip_path)

---
## Summary

### Models Trained:
| Model | Type | Data Format |
|-------|------|-------------|
| GradientBoosting | Tabular | Flat features |
| RandomForest | Tabular | Flat features |
| XGBoost | Tabular | Flat features |
| LSTM (2-layer) | Sequence | 7-day windows |
| Transformer | Sequence | 7-day windows |

### Experiments Run:
1. Full model comparison (AUROC, AUPRC, Brier, Accuracy)
2. Modality ablation (in-situ only, satellite only, all)
3. Feature dropout robustness (0-70% random dropout, 10 trials each)

### Next Steps:
- **Phase 3**: Run same data through SWIM agents, compare dropout curves
- **Phase 4**: Communication protocol experiments (RQ2)
- **Phase 5**: Conflict resolution experiments (RQ3)