# SWIM Research — Phase 3: Agentic vs Monolithic (RQ1/RQ2/RQ3)

**RQ1**: Does agent specialization improve robustness under sensor failure?
**RQ2**: Does inter-agent communication improve predictions?
**RQ3**: How should the orchestrator resolve agent conflicts?

### Experiments:
1. Train specialist agents (CALIBRO=in-situ, VISIOS=satellite)
2. Compare monolithic vs agentic under feature dropout
3. Selective modality failure (satellite down, in-situ down)
4. Communication protocols (independent, sharing, gated)
5. Conflict resolution strategies
6. Statistical significance tests

In [None]:
# ─── Setup + Data ───
import os, sys, json, pickle, zipfile, warnings, glob
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_auc_score, average_precision_score, brier_score_loss,
                             roc_curve, precision_recall_curve)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from scipy import stats
warnings.filterwarnings('ignore')
np.random.seed(42)
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    # The style name is not available in every matplotlib version; fall back
    # to a style that is.
    plt.style.use('ggplot')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 11

RESULTS_DIR = Path('results_phase3')
RESULTS_DIR.mkdir(exist_ok=True)
(RESULTS_DIR / 'figures').mkdir(exist_ok=True)

# Load data: take the first readable copy of the dataset. Candidates are
# sorted so the chosen file does not depend on filesystem enumeration order.
research_df = None
for p in sorted(glob.glob('/home/jovyan/**/unified_research_dataset.parquet', recursive=True)):
    try:
        research_df = pd.read_parquet(p)
    except Exception as exc:
        # Best-effort search: report why this candidate was rejected, then
        # try the next one instead of failing silently.
        print(f'Skipping unreadable {p}: {exc}')
        continue
    print(f'Loaded: {p}')
    break
if research_df is None:
    raise FileNotFoundError('unified_research_dataset.parquet not found')

INSITU_FEATURES = ['chlorophyll_a','turbidity','dissolved_oxygen','ph',
                   'temperature','conductivity','wind_speed','air_temperature','humidity']
SATELLITE_FEATURES = ['ndvi','surface_temperature','chlorophyll_index',
                      'turbidity_index','cloud_coverage']
# Only features actually present in the dataset are used; the *_IDX lists hold
# their column positions inside the feature matrix.
ALL_FEATURES = [f for f in INSITU_FEATURES + SATELLITE_FEATURES if f in research_df.columns]
INSITU_AVAIL = [f for f in INSITU_FEATURES if f in ALL_FEATURES]
SAT_AVAIL = [f for f in SATELLITE_FEATURES if f in ALL_FEATURES]
TARGET = 'bloom_label'
INSITU_IDX = [ALL_FEATURES.index(f) for f in INSITU_AVAIL]
SAT_IDX = [ALL_FEATURES.index(f) for f in SAT_AVAIL]

if not ALL_FEATURES:
    raise ValueError('None of the expected feature columns are present in the dataset')
if TARGET not in research_df.columns:
    raise KeyError(f'Target column {TARGET!r} is missing from the dataset')

X_all = research_df[ALL_FEATURES].values
y_all = research_df[TARGET].values

# Split the RAW matrix first (70/15/15, stratified) so that the imputer and
# scaler below can be fitted on the training rows only. Fitting them on the
# full dataset would leak validation/test statistics into preprocessing.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X_all, y_all, test_size=0.3, random_state=42, stratify=y_all)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

imp = SimpleImputer(strategy='median')
sc = StandardScaler()
X_train = sc.fit_transform(imp.fit_transform(X_train_raw))
X_val = sc.transform(imp.transform(X_val_raw))
X_test = sc.transform(imp.transform(X_test_raw))
# Kept for any later code that expects these names: the held-out 30% and the
# whole dataset, both transformed with the train-fitted preprocessing.
X_temp = sc.transform(imp.transform(X_temp_raw))
X_clean = sc.transform(imp.transform(X_all))

print(f'Dataset: {len(research_df):,} rows | {research_df["lake"].nunique()} lakes')
print(f'Features: {len(ALL_FEATURES)} ({len(INSITU_AVAIL)} in-situ + {len(SAT_AVAIL)} satellite)')
print(f'Train={len(y_train)} Val={len(y_val)} Test={len(y_test)}')
print(f'Bloom: train={y_train.mean():.3f} val={y_val.mean():.3f} test={y_test.mean():.3f}')
print('Setup complete.')

---
## 1. Agent & Orchestrator Architecture

In [None]:
# ─── Agent & Orchestrator Classes ───
class SpecialistAgent:
    """Gradient-boosting classifier that only looks at a subset of feature columns.

    Attributes:
        name: Label used in log output.
        feat_idx: Column positions (into the full feature matrix) this agent reads.
        feat_names: Names of those columns, looked up in ``all_names``.
        model: The underlying ``GradientBoostingClassifier``.
        val_auroc: Validation AUROC recorded by ``train``; 0.5 until then.
    """

    def __init__(self, name, feat_idx, all_names):
        self.name = name
        self.feat_idx = feat_idx
        self.feat_names = [all_names[col] for col in feat_idx]
        hyperparams = dict(
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.8,
            min_samples_leaf=10,
            random_state=42,
        )
        self.model = GradientBoostingClassifier(**hyperparams)
        self.val_auroc = 0.5

    def train(self, X_tr, y_tr, X_va, y_va):
        """Fit on the agent's columns of ``X_tr`` and record validation AUROC.

        ``val_auroc`` is left untouched when ``y_va`` contains a single class.
        """
        cols = self.feat_idx
        self.model.fit(X_tr[:, cols], y_tr)
        val_scores = self.model.predict_proba(X_va[:, cols])[:, 1]
        has_both_classes = len(np.unique(y_va)) > 1
        if has_both_classes:
            self.val_auroc = roc_auc_score(y_va, val_scores)
        print(f'  {self.name}: {len(self.feat_idx)} feats, val AUROC={self.val_auroc:.3f}')

    def predict(self, X):
        """Return ``(prob, conf)`` for the rows of ``X``.

        ``prob`` is the positive-class probability; ``conf`` is its distance
        from 0.5 rescaled to the range [0, 1].
        """
        class_probs = self.model.predict_proba(X[:, self.feat_idx])
        positive = class_probs[:, 1]
        return positive, np.abs(positive - 0.5) * 2

    def predict_prob(self, X):
        """Return only the positive-class probability from ``predict``."""
        positive, _ = self.predict(X)
        return positive


class Orchestrator:
    """Fuse the positive-class probabilities of several agents into one prediction.

    Each agent must expose ``predict(X) -> (prob, conf)``, ``predict_prob(X)``
    and a ``val_auroc`` attribute.
    """

    def __init__(self, agents):
        self.agents = agents
        # Meta-learner for the 'stacking' strategy; set by train_stacking().
        self.stack_model = None

    @staticmethod
    def _normalise(raw, eps=1e-8):
        """Turn non-negative per-agent scores (agents x samples) into weights.

        ``eps`` is added to every score before normalising, so the weights
        always sum to 1 per sample and fall back to a uniform average when all
        scores are zero (instead of producing an all-zero weight vector).
        """
        padded = raw + eps
        return padded / padded.sum(axis=0, keepdims=True)

    def predict(self, X, strategy='confidence_weighted'):
        """Return the fused probability for every row of ``X``.

        Args:
            X: Feature matrix passed unchanged to every agent.
            strategy: One of 'simple_average', 'weighted_static',
                'confidence_weighted', 'entropy_weighted', 'conflict_aware',
                'stacking'.

        Raises:
            ValueError: If ``strategy`` is unknown, or if 'stacking' is
                requested before ``train_stacking`` has been called.
        """
        preds, confs = [], []
        for a in self.agents:
            p, c = a.predict(X)
            preds.append(p)
            confs.append(c)
        preds = np.array(preds)   # shape: (n_agents, n_samples)
        confs = np.array(confs)
        eps = 1e-8

        if strategy == 'simple_average':
            return np.mean(preds, axis=0)
        elif strategy == 'weighted_static':
            # Fixed per-agent weights proportional to validation AUROC.
            w = np.array([a.val_auroc for a in self.agents])
            return np.average(preds, axis=0, weights=w/w.sum())
        elif strategy == 'confidence_weighted':
            # Per-sample weights proportional to each agent's confidence.
            w = self._normalise(confs, eps)
            return (preds * w).sum(axis=0)
        elif strategy == 'entropy_weighted':
            # Lower binary entropy -> larger weight.
            ent = -(preds*np.log(preds+eps) + (1-preds)*np.log(1-preds+eps))
            inv = 1.0/(ent+eps)
            w = inv / (inv.sum(axis=0, keepdims=True)+eps)
            return (preds * w).sum(axis=0)
        elif strategy == 'conflict_aware':
            # The more the agents disagree on a sample (larger std), the more
            # the confidences are sharpened before weighting.
            std = preds.std(axis=0)
            sharp = 1.0 + 5.0 * std
            sharpened = confs ** sharp[np.newaxis, :]
            w = self._normalise(sharpened, eps)
            return (preds * w).sum(axis=0)
        elif strategy == 'stacking':
            if self.stack_model is None:
                raise ValueError('Call train_stacking first')
            return self.stack_model.predict_proba(preds.T)[:, 1]
        # Previously an unrecognised name fell through and returned None.
        raise ValueError(f'Unknown strategy: {strategy!r}')

    def train_stacking(self, X_va, y_va):
        """Fit the logistic-regression meta-learner on the agents' outputs.

        The AUROC printed here is computed on the same data the meta-learner
        was fitted on, so it is an optimistic estimate.
        """
        feats = np.array([a.predict_prob(X_va) for a in self.agents]).T
        self.stack_model = LogisticRegression(random_state=42)
        self.stack_model.fit(feats, y_va)
        s = roc_auc_score(y_va, self.stack_model.predict_proba(feats)[:, 1])
        print(f'  Stacking meta-learner: val AUROC={s:.3f}')


def evaluate(name, y_true, y_prob):
    """Score probabilistic predictions and return the metrics as a dict.

    Args:
        name: Label stored under the 'model' key.
        y_true: Ground-truth binary labels.
        y_prob: Predicted positive-class probabilities.

    Returns:
        Dict with keys 'model', 'AUROC', 'AUPRC', 'Brier', 'Accuracy'.
        AUROC and AUPRC are NaN when ``y_true`` holds a single class;
        Accuracy uses a 0.5 decision threshold.
    """
    hard_labels = (y_prob >= 0.5).astype(int)
    if len(np.unique(y_true)) > 1:
        auroc = roc_auc_score(y_true, y_prob)
        auprc = average_precision_score(y_true, y_prob)
    else:
        # Ranking metrics are undefined without both classes present.
        auroc = auprc = np.nan
    metrics = {'model': name, 'AUROC': auroc, 'AUPRC': auprc}
    metrics['Brier'] = brier_score_loss(y_true, y_prob)
    metrics['Accuracy'] = (hard_labels == y_true).mean()
    return metrics


print('Classes defined.')

---
## 2. Train Monolithic Baselines + Specialist Agents

In [None]:
# ─── Train Everything ───
print('=== MONOLITHIC BASELINES ===')
# Two single-model baselines that see every feature at once.
mono_gb = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    min_samples_leaf=10,
    random_state=42,
)
mono_gb.fit(X_train, y_train)
mono_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)
mono_rf.fit(X_train, y_train)
# Test-set probabilities are computed once and reused below.
gb_test_prob = mono_gb.predict_proba(X_test)[:, 1]
rf_test_prob = mono_rf.predict_proba(X_test)[:, 1]
print(f'  Mono GB:  AUROC={roc_auc_score(y_test, gb_test_prob):.3f}')
print(f'  Mono RF:  AUROC={roc_auc_score(y_test, rf_test_prob):.3f}')

print('\n=== SPECIALIST AGENTS ===')
# One agent per modality, each restricted to its own feature columns.
calibro = SpecialistAgent('CALIBRO (In-Situ)', INSITU_IDX, ALL_FEATURES)
calibro.train(X_train, y_train, X_val, y_val)
visios = SpecialistAgent('VISIOS (Satellite)', SAT_IDX, ALL_FEATURES)
visios.train(X_train, y_train, X_val, y_val)

print('\n=== ORCHESTRATOR ===')
orch = Orchestrator([calibro, visios])
orch.train_stacking(X_val, y_val)

# ─── Clean comparison ───
print('\n' + '='*70)
print('CLEAN TEST SET: MONOLITHIC vs AGENTIC')
print('='*70)
mono_ens = (gb_test_prob + rf_test_prob) / 2
clean_results = [
    evaluate('Monolithic GB', y_test, gb_test_prob),
    evaluate('Monolithic RF', y_test, rf_test_prob),
    evaluate('Monolithic Ensemble', y_test, mono_ens),
]
fusion_strategies = ['simple_average', 'weighted_static', 'confidence_weighted',
                     'entropy_weighted', 'conflict_aware', 'stacking']
for strat in fusion_strategies:
    fused = orch.predict(X_test, strategy=strat)
    clean_results.append(evaluate(f'Agentic-{strat}', y_test, fused))
clean_df = pd.DataFrame(clean_results).sort_values('AUROC', ascending=False)
print(clean_df.to_string(index=False))

# Figure: horizontal AUROC bars, blue = monolithic, orange = agentic.
MONO_BLUE, AGENT_ORANGE = '#2196F3', '#FF5722'
fig, ax = plt.subplots(figsize=(12, 6))
colors = [MONO_BLUE if 'Mono' in label else AGENT_ORANGE for label in clean_df['model']]
bars = ax.barh(clean_df['model'], clean_df['AUROC'], color=colors, edgecolor='white')
ax.set_xlabel('AUROC')
ax.set_title('Clean Data: Monolithic vs Agentic', fontweight='bold', fontsize=13)
ax.axvline(0.5, color='gray', linestyle=':', alpha=0.5)
for bar, score in zip(bars, clean_df['AUROC']):
    # Value label just to the right of each bar, vertically centred on it.
    ax.text(score+0.003, bar.get_y()+bar.get_height()/2, f'{score:.3f}', va='center', fontsize=9)
legend_patches = [plt.Rectangle((0,0),1,1,fc=MONO_BLUE), plt.Rectangle((0,0),1,1,fc=AGENT_ORANGE)]
ax.legend(handles=legend_patches, labels=['Monolithic','Agentic'], loc='lower right')
plt.tight_layout()
plt.savefig(RESULTS_DIR/'figures'/'fig1_clean_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

---
## 3. RQ1: Feature Dropout — Monolithic vs Agentic (KEY EXPERIMENT)
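Simulate random sensor failures at varying rates: each feature value of the test set is independently dropped with the given probability and replaced by 0, which is the mean value of a standardized feature.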

In [None]:
# ─── RQ1: Feature Dropout ───
print('Running feature dropout experiment (this is the key RQ1 result)...')
dropout_rates = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
n_trials = 20
strats_test = ['simple_average', 'confidence_weighted', 'conflict_aware']

# Dedicated, seeded generator: the masks (and therefore the results) no longer
# depend on how much of the global NumPy random stream earlier cells consumed,
# so re-running this cell on its own reproduces the same numbers.
dropout_rng = np.random.default_rng(42)


def _auroc_or_nan(y_true, y_score):
    """Return the AUROC, or NaN when roc_auc_score rejects the input.

    Only ValueError is treated as "metric undefined"; any other exception
    (e.g. a bug in a model or in the orchestrator) is allowed to surface
    instead of being recorded as NaN.
    """
    try:
        return roc_auc_score(y_true, y_score)
    except ValueError:
        return np.nan


dropout_results = []
for rate in dropout_rates:
    for trial in range(n_trials):
        # True = value kept, False = value dropped (multiplied to 0). The same
        # mask is shared by every model in a trial, so trials are paired.
        mask = dropout_rng.random(X_test.shape) > rate
        X_drop = X_test * mask
        # Monolithic
        trial_scores = [(name, mdl.predict_proba(X_drop)[:, 1])
                        for name, mdl in [('Monolithic GB', mono_gb), ('Monolithic RF', mono_rf)]]
        # Agentic
        trial_scores += [(f'Agentic ({strat})', orch.predict(X_drop, strategy=strat))
                         for strat in strats_test]
        for name, y_score in trial_scores:
            dropout_results.append({'model': name, 'rate': rate, 'trial': trial,
                                    'AUROC': _auroc_or_nan(y_test, y_score)})
    print(f'  Dropout {rate:.0%} done')

dropout_df = pd.DataFrame(dropout_results)

# Visualization
mc = {'Monolithic GB':'#2196F3', 'Monolithic RF':'#1976D2',
      'Agentic (simple_average)':'#FF9800',
      'Agentic (confidence_weighted)':'#FF5722',
      'Agentic (conflict_aware)':'#E91E63'}

fig, axes = plt.subplots(1, 2, figsize=(20, 7))
for mn in dropout_df['model'].unique():
    by_rate = dropout_df[dropout_df['model']==mn].groupby('rate')['AUROC']
    mu = by_rate.mean()
    sd = by_rate.std()
    c = mc.get(mn, 'gray')
    # Monolithic baselines: thin dashed lines; agentic variants: thick solid.
    ls = '--' if 'Mono' in mn else '-'
    lw = 1.5 if 'Mono' in mn else 2.5
    # Left panel: mean AUROC with a +/- 1 std band across trials.
    axes[0].plot(mu.index, mu.values, f'{ls}o', color=c, label=mn, linewidth=lw, markersize=5)
    axes[0].fill_between(mu.index, mu-sd, mu+sd, alpha=0.1, color=c)
    # Right panel: change relative to the lowest dropout rate (groupby sorts
    # by rate, so iloc[0] is the 0% baseline); the floor avoids dividing by ~0.
    bl = mu.iloc[0]
    rel = (mu - bl) / max(bl, 0.01) * 100
    axes[1].plot(rel.index, rel.values, f'{ls}o', color=c, label=mn, linewidth=lw, markersize=5)
axes[0].set_xlabel('Feature Dropout Rate', fontsize=12)
axes[0].set_ylabel('AUROC', fontsize=12)
axes[0].set_title('RQ1: Model Robustness Under Sensor Failure', fontweight='bold')
axes[0].axhline(0.5, color='gray', linestyle=':', alpha=0.5)
axes[0].legend(fontsize=9)

axes[1].set_xlabel('Feature Dropout Rate', fontsize=12)
axes[1].set_ylabel('AUROC Change (%)', fontsize=12)
axes[1].set_title('Relative Degradation', fontweight='bold')
axes[1].axhline(0, color='gray', linestyle='-', alpha=0.3)
axes[1].legend(fontsize=9)

plt.suptitle('RQ1: Agentic Specialization vs Monolithic Under Feature Dropout',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(RESULTS_DIR/'figures'/'fig2_dropout_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

summary = dropout_df.groupby(['model','rate'])['AUROC'].mean().unstack()
print('\nMean AUROC by dropout rate:')
print(summary.round(3).to_string())

---
## 4. Selective Modality Failure
What happens when an ENTIRE data source fails?

In [None]:
# ─── Selective Modality Failure ───
print('Testing complete modality failures...')
# A failed modality is simulated by overwriting all of its columns with 0.
X_sat_fail = X_test.copy()
X_sat_fail[:, SAT_IDX] = 0
X_insitu_fail = X_test.copy()
X_insitu_fail[:, INSITU_IDX] = 0

scenarios = {'Normal': X_test, 'Satellite Down': X_sat_fail, 'In-Situ Down': X_insitu_fail}
monolithic_models = [('Monolithic GB', mono_gb), ('Monolithic RF', mono_rf)]
agentic_strategies = ['simple_average', 'confidence_weighted', 'conflict_aware']

# One AUROC row per (scenario, model): monolithic models first, then the
# orchestrator strategies.
sel_results = []
for sname, X_sc in scenarios.items():
    for mname, mdl in monolithic_models:
        score = roc_auc_score(y_test, mdl.predict_proba(X_sc)[:, 1])
        sel_results.append({'scenario': sname, 'model': mname, 'AUROC': score})
    for strat in agentic_strategies:
        score = roc_auc_score(y_test, orch.predict(X_sc, strategy=strat))
        sel_results.append({'scenario': sname, 'model': f'Agentic ({strat})', 'AUROC': score})

sel_df = pd.DataFrame(sel_results)

fig, axes = plt.subplots(1, 2, figsize=(18, 6))
ax_abs, ax_deg = axes

# Left panel: absolute AUROC per scenario.
pivot = sel_df.pivot_table(index='scenario', columns='model', values='AUROC')
pivot = pivot.reindex(['Normal', 'Satellite Down', 'In-Situ Down'])
pivot.plot.bar(ax=ax_abs, edgecolor='white', width=0.8)
ax_abs.set_title('AUROC Under Modality Failure', fontweight='bold')
ax_abs.set_ylabel('AUROC')
ax_abs.tick_params(axis='x', rotation=0)
ax_abs.axhline(0.5, color='gray', linestyle=':', alpha=0.5)
ax_abs.legend(fontsize=7, ncol=2)

# Right panel: percentage drop of each model relative to its own 'Normal' AUROC.
normal = sel_df[sel_df['scenario'] == 'Normal'].set_index('model')['AUROC']
failures = sel_df[sel_df['scenario'] != 'Normal']
deg = [
    {'scenario': row.scenario, 'model': row.model,
     'deg': (normal[row.model] - row.AUROC) / normal[row.model] * 100}
    for row in failures.itertuples(index=False)
]
deg_df = pd.DataFrame(deg)
dp = deg_df.pivot_table(index='scenario', columns='model', values='deg')
dp.plot.bar(ax=ax_deg, edgecolor='white', width=0.8)
ax_deg.set_title('% AUROC Degradation From Normal', fontweight='bold')
ax_deg.set_ylabel('% Degradation')
ax_deg.tick_params(axis='x', rotation=0)
ax_deg.axhline(0, color='gray', linestyle='-', alpha=0.3)
ax_deg.legend(fontsize=7, ncol=2)

plt.suptitle('RQ1: Agentic Resilience Under Complete Modality Failure', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR/'figures'/'fig3_selective_failure.png', dpi=150, bbox_inches='tight')
plt.show()

print(sel_df.pivot_table(index='model', columns='scenario', values='AUROC').round(3).to_string())

---
## 5. RQ2: Inter-Agent Communication Protocols
- **P0**: Independent agents (no communication)
- **P1**: Prediction sharing (agents see each other's output)
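- **P2**: Confidence-gated sharing (an agent's prediction is passed on only when its confidence exceeds a threshold; otherwise the receiver sees a neutral 0.5)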

In [None]:
# ─── RQ2: Communication Protocols ───
print('Training communicated agents...')

# Imported here so this cell stays self-contained.
from sklearn.model_selection import cross_val_predict

# The "shared prediction" feature used to TRAIN the communicating agents must
# be out-of-fold: predictions an agent makes on rows it was itself fitted on
# are far more accurate than anything it produces on unseen data, so a model
# trained on them learns to over-trust the shared column. cross_val_predict
# refits a clone of each agent's model per fold and predicts only held-out rows.
cal_tr = cross_val_predict(calibro.model, X_train[:, INSITU_IDX], y_train,
                           cv=5, method='predict_proba')[:, 1]
vis_tr = cross_val_predict(visios.model, X_train[:, SAT_IDX], y_train,
                           cv=5, method='predict_proba')[:, 1]
# Validation-set predictions of the (already fitted) independent agents.
cal_va = calibro.predict_prob(X_val)
vis_va = visios.predict_prob(X_val)

# P1: Prediction sharing — each agent gets other's prediction as extra feature
X_tr_cal_comm = np.column_stack([X_train[:, INSITU_IDX], vis_tr])
X_tr_vis_comm = np.column_stack([X_train[:, SAT_IDX], cal_tr])

# Same hyper-parameters as the independent agents (including
# min_samples_leaf), so that P0 vs P1/P2 differ only in the shared feature.
cal_comm = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                                      subsample=0.8, min_samples_leaf=10, random_state=42)
cal_comm.fit(X_tr_cal_comm, y_train)
vis_comm = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                                      subsample=0.8, min_samples_leaf=10, random_state=42)
vis_comm.fit(X_tr_vis_comm, y_train)
print('  P1 agents trained')

# Test under dropout
dr_rq2 = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
n_t = 15
conf_thresh = 0.3
comm_results = []
# Dedicated, seeded generator so the cell gives the same masks on every run,
# independent of the global NumPy random state.
comm_rng = np.random.default_rng(43)

for rate in dr_rq2:
    for trial in range(n_t):
        # True = value kept, False = value dropped (multiplied to 0).
        mask = comm_rng.random(X_test.shape) > rate
        X_d = X_test * mask

        # P0: Independent
        p0 = orch.predict(X_d, strategy='confidence_weighted')

        # P1: Full sharing
        c1 = calibro.predict_prob(X_d)
        v1 = visios.predict_prob(X_d)
        Xc1 = np.column_stack([X_d[:, INSITU_IDX], v1])
        Xv1 = np.column_stack([X_d[:, SAT_IDX], c1])
        p1 = (cal_comm.predict_proba(Xc1)[:,1] + vis_comm.predict_proba(Xv1)[:,1]) / 2

        # P2: Confidence-gated sharing — a prediction is only passed on when
        # its confidence exceeds conf_thresh; otherwise the receiver gets 0.5.
        cc = np.abs(c1 - 0.5) * 2
        vc = np.abs(v1 - 0.5) * 2
        v_gate = np.where(vc > conf_thresh, v1, 0.5)
        c_gate = np.where(cc > conf_thresh, c1, 0.5)
        Xc2 = np.column_stack([X_d[:, INSITU_IDX], v_gate])
        Xv2 = np.column_stack([X_d[:, SAT_IDX], c_gate])
        p2 = (cal_comm.predict_proba(Xc2)[:,1] + vis_comm.predict_proba(Xv2)[:,1]) / 2

        # Monolithic ref
        pm = mono_gb.predict_proba(X_d)[:,1]

        for nm, scores in [('Monolithic GB', pm), ('P0: Independent', p0),
                           ('P1: Pred Sharing', p1), ('P2: Conf-Gated', p2)]:
            try:
                a = roc_auc_score(y_test, scores)
            except ValueError:
                # Metric undefined for this input; other errors must surface.
                a = np.nan
            comm_results.append({'protocol': nm, 'rate': rate, 'trial': trial, 'AUROC': a})
    print(f'  Dropout {rate:.0%} done')

comm_df = pd.DataFrame(comm_results)

pc = {'Monolithic GB':'#2196F3', 'P0: Independent':'#FF9800',
      'P1: Pred Sharing':'#FF5722', 'P2: Conf-Gated':'#E91E63'}

fig, axes = plt.subplots(1, 2, figsize=(18, 6))
# Left panel: mean AUROC per protocol with a +/- 1 std band across trials.
for pr in comm_df['protocol'].unique():
    by_rate = comm_df[comm_df['protocol']==pr].groupby('rate')['AUROC']
    mu = by_rate.mean()
    sd = by_rate.std()
    c = pc.get(pr, 'gray')
    ls = '--' if 'Mono' in pr else '-'
    lw = 1.5 if 'Mono' in pr else 2.5
    axes[0].plot(mu.index, mu.values, f'{ls}o', color=c, label=pr, linewidth=lw, markersize=6)
    axes[0].fill_between(mu.index, mu-sd, mu+sd, alpha=0.1, color=c)
axes[0].set_xlabel('Feature Dropout Rate')
axes[0].set_ylabel('AUROC')
axes[0].set_title('RQ2: Communication Protocols Under Dropout', fontweight='bold')
axes[0].legend()
axes[0].axhline(0.5, color='gray', linestyle=':', alpha=0.5)

# Right panel: snapshot of every protocol at the 40% dropout rate.
sub40 = comm_df[comm_df['rate']==0.4].groupby('protocol')['AUROC'].agg(['mean','std']).sort_values('mean')
bars = axes[1].barh(sub40.index, sub40['mean'], xerr=sub40['std'],
                    color=[pc.get(p,'gray') for p in sub40.index], edgecolor='white')
axes[1].set_xlabel('AUROC')
axes[1].set_title('At 40% Dropout: Protocol Comparison', fontweight='bold')
for b, (_, row) in zip(bars, sub40.iterrows()):
    axes[1].text(row['mean']+0.005, b.get_y()+b.get_height()/2, f'{row["mean"]:.3f}', va='center')
plt.suptitle('RQ2: Does Inter-Agent Communication Improve Robustness?', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR/'figures'/'fig4_communication.png', dpi=150, bbox_inches='tight')
plt.show()

print(comm_df.groupby(['protocol','rate'])['AUROC'].mean().unstack().round(3).to_string())

---
## 6. RQ3: Conflict Resolution Strategies
When agents disagree, which resolution strategy works best?

In [None]:
# ─── RQ3: Conflict Resolution ───
print('Conflict resolution experiment...')

def resolve(cal_p, vis_p, mask, strategy):
    """Fuse two agents' probabilities on the samples selected by ``mask``.

    Args:
        cal_p: Probabilities from the first (CALIBRO) agent.
        vis_p: Probabilities from the second (VISIOS) agent.
        mask: Boolean/index selector applied to both arrays before fusing.
        strategy: One of 'average', 'trust_calibro', 'trust_visios',
            'max_confidence', 'weighted_conf', 'entropy'.

    Returns:
        Array of fused probabilities, one per selected sample.

    Raises:
        ValueError: If ``strategy`` is not one of the names above
            (previously this silently returned ``None``).
    """
    cp, vp = cal_p[mask], vis_p[mask]
    if strategy == 'average':
        return (cp + vp) / 2
    if strategy == 'trust_calibro':
        return cp
    if strategy == 'trust_visios':
        return vp
    if strategy == 'max_confidence':
        # Take whichever agent is further from 0.5; ties go to vis_p.
        cc = np.abs(cp - 0.5)
        vc = np.abs(vp - 0.5)
        return np.where(cc > vc, cp, vp)
    if strategy == 'weighted_conf':
        # The small constant keeps the weights defined when both agents
        # predict exactly 0.5 (the result is then their plain average).
        cc = np.abs(cp - 0.5) * 2 + 1e-8
        vc = np.abs(vp - 0.5) * 2 + 1e-8
        return cp * (cc/(cc+vc)) + vp * (vc/(cc+vc))
    if strategy == 'entropy':
        # Weight each agent by the inverse of its binary entropy.
        eps = 1e-8
        ce = -(cp*np.log(cp+eps) + (1-cp)*np.log(1-cp+eps))
        ve = -(vp*np.log(vp+eps) + (1-vp)*np.log(1-vp+eps))
        cw, vw = 1/(ce+eps), 1/(ve+eps)
        return cp*(cw/(cw+vw)) + vp*(vw/(cw+vw))
    raise ValueError(f'Unknown conflict-resolution strategy: {strategy!r}')

strats_rq3 = ['average','trust_calibro','trust_visios','max_confidence','weighted_conf','entropy']
conf_results = []
# Dedicated, seeded generator so the cell gives the same masks on every run,
# independent of the global NumPy random state.
conflict_rng = np.random.default_rng(44)

for rate in [0.0, 0.2, 0.4, 0.6]:
    for trial in range(15):
        # True = value kept, False = value dropped (multiplied to 0).
        mask = conflict_rng.random(X_test.shape) > rate
        X_d = X_test * mask
        cp = calibro.predict_prob(X_d)
        vp = visios.predict_prob(X_d)
        # A conflict is a sample where the agents fall on opposite sides of 0.5.
        conflict = (cp >= 0.5).astype(int) != (vp >= 0.5).astype(int)
        n_conflicts = conflict.sum()
        if n_conflicts < 5:
            # Too few conflict samples for a meaningful score; skip the trial.
            continue
        yt = y_test[conflict]
        # AUROC needs both classes among the conflict samples; this does not
        # depend on the strategy, so it is checked once per trial.
        both = len(np.unique(yt)) > 1
        for s in strats_rq3:
            fused = resolve(cp, vp, conflict, s)
            a = roc_auc_score(yt, fused) if both else np.nan
            acc = ((fused >= 0.5).astype(int) == yt).mean()
            conf_results.append({'strategy': s, 'rate': rate, 'trial': trial,
                                 'AUROC': a, 'Accuracy': acc, 'n_conflicts': n_conflicts})
    print(f'  Dropout {rate:.0%} done')

conf_df = pd.DataFrame(conf_results)

# NOTE(review): this rebinds `sc`, which the setup cell uses for the fitted
# StandardScaler. The name is kept because the publication-figure cell reads
# the colour map through `sc`; renaming needs a coordinated change in both.
sc = {'average':'#9E9E9E','trust_calibro':'#2196F3','trust_visios':'#4CAF50',
      'max_confidence':'#FF9800','weighted_conf':'#FF5722','entropy':'#E91E63'}

fig, axes = plt.subplots(1, 2, figsize=(18, 6))
# Left panel: mean accuracy of each strategy on the conflict samples only.
for s in strats_rq3:
    sub = conf_df[conf_df['strategy']==s]
    mu = sub.groupby('rate')['Accuracy'].mean()
    axes[0].plot(mu.index, mu.values, 'o-', color=sc.get(s,'gray'), label=s, linewidth=2, markersize=6)
axes[0].set_xlabel('Feature Dropout Rate')
axes[0].set_ylabel('Accuracy on Conflict Samples')
axes[0].set_title('RQ3: Conflict Resolution Accuracy', fontweight='bold')
axes[0].legend(fontsize=9)
axes[0].axhline(0.5, color='gray', linestyle=':', alpha=0.5)

# Right panel: how many conflict samples there were on average per rate.
counts = conf_df.groupby('rate')['n_conflicts'].mean()
axes[1].bar(counts.index, counts.values, width=0.15, color='#FF5722', edgecolor='white')
axes[1].set_xlabel('Feature Dropout Rate')
axes[1].set_ylabel('Mean # Conflicts')
axes[1].set_title('Conflicts Increase Under Dropout', fontweight='bold')

plt.suptitle('RQ3: Orchestrator Conflict Resolution', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR/'figures'/'fig5_conflict_resolution.png', dpi=150, bbox_inches='tight')
plt.show()

print(conf_df.groupby(['strategy','rate'])['Accuracy'].mean().unstack().round(3).to_string())

---
## 7. Statistical Tests + Publication Figure

In [None]:
# ─── Statistical Significance ───
print('Statistical tests: Agentic (confidence_weighted) vs Monolithic GB\n')
mono_name = 'Monolithic GB'
agentic_name = 'Agentic (confidence_weighted)'
sig_rows = []
for rate in [0.2, 0.4, 0.6]:
    at_rate = dropout_df[dropout_df['rate'] == rate]
    mono_by_trial = at_rate.loc[at_rate['model'] == mono_name].set_index('trial')['AUROC']
    agentic_by_trial = at_rate.loc[at_rate['model'] == agentic_name].set_index('trial')['AUROC']
    # The Wilcoxon signed-rank test is a PAIRED test: align the two models on
    # the trial id (each trial shares one dropout mask) rather than relying on
    # row order, and drop trials where either AUROC is missing.
    paired = pd.concat([mono_by_trial, agentic_by_trial], axis=1,
                       keys=['mono', 'agentic'], join='inner').dropna()
    m = paired['mono'].to_numpy()
    a = paired['agentic'].to_numpy()
    n = len(paired)
    if n > 5:
        try:
            stat, pv = stats.wilcoxon(a, m)
        except ValueError:
            # Raised by some SciPy versions when every paired difference is
            # zero; record the test as undefined instead of aborting the cell.
            stat, pv = np.nan, np.nan
        sig_rows.append({'dropout': rate, 'mono': m.mean(), 'agentic': a.mean(),
                         'diff': a.mean()-m.mean(), 'pct': (a.mean()-m.mean())/m.mean()*100,
                         'p_value': pv, 'sig': pv < 0.05})
sig_df = pd.DataFrame(sig_rows)
print(sig_df.to_string(index=False))

# ─── 4-Panel Publication Figure ───
fig, axes = plt.subplots(2, 2, figsize=(20, 14))
ax_clean, ax_drop = axes[0]
ax_comm, ax_conf = axes[1]

# (a) Clean comparison: AUROC bars for a hand-picked subset of models.
panel_a_models = ['Monolithic GB', 'Monolithic RF',
                  'Agentic-confidence_weighted', 'Agentic-conflict_aware']
cs = clean_df.loc[clean_df['model'].isin(panel_a_models)].copy()
clr = []
for label in cs['model']:
    clr.append('#2196F3' if 'Mono' in label else '#FF5722')
ax_clean.barh(cs['model'], cs['AUROC'], color=clr, edgecolor='white')
ax_clean.set_xlabel('AUROC')
ax_clean.set_title('(a) Clean Data Performance', fontweight='bold')
for ypos, score in enumerate(cs['AUROC']):
    # Bars sit at integer y positions in row order, so the row number
    # doubles as the label's y coordinate.
    ax_clean.text(score+0.001, ypos, f'{score:.3f}', va='center')

# (b) Dropout curves: mean AUROC with a +/- 1 std band.
panel_b_models = ['Monolithic GB', 'Monolithic RF',
                  'Agentic (confidence_weighted)', 'Agentic (conflict_aware)']
for mn in panel_b_models:
    grouped = dropout_df[dropout_df['model'] == mn].groupby('rate')['AUROC']
    mu, sd = grouped.mean(), grouped.std()
    is_mono = 'Mono' in mn
    line_color = mc.get(mn, 'gray')
    marker_fmt = '--o' if is_mono else '-o'
    ax_drop.plot(mu.index, mu.values, marker_fmt, color=line_color, label=mn,
                 linewidth=1.5 if is_mono else 2.5, markersize=5)
    ax_drop.fill_between(mu.index, mu-sd, mu+sd, alpha=0.1, color=line_color)
ax_drop.set_xlabel('Feature Dropout Rate')
ax_drop.set_ylabel('AUROC')
ax_drop.set_title('(b) RQ1: Robustness Under Dropout', fontweight='bold')
ax_drop.legend(fontsize=8)
ax_drop.axhline(0.5, color='gray', linestyle=':', alpha=0.5)

# (c) Communication: mean AUROC per protocol.
for pr in ['Monolithic GB', 'P0: Independent', 'P1: Pred Sharing', 'P2: Conf-Gated']:
    mu = comm_df[comm_df['protocol'] == pr].groupby('rate')['AUROC'].mean()
    is_mono = 'Mono' in pr
    marker_fmt = '--o' if is_mono else '-o'
    ax_comm.plot(mu.index, mu.values, marker_fmt, color=pc.get(pr, 'gray'), label=pr,
                 linewidth=1.5 if is_mono else 2.5, markersize=6)
ax_comm.set_xlabel('Feature Dropout Rate')
ax_comm.set_ylabel('AUROC')
ax_comm.set_title('(c) RQ2: Communication Protocols', fontweight='bold')
ax_comm.legend(fontsize=8)

# (d) Conflict resolution: mean accuracy on conflict samples per strategy.
for s in ['average', 'max_confidence', 'weighted_conf', 'entropy']:
    mu = conf_df[conf_df['strategy'] == s].groupby('rate')['Accuracy'].mean()
    ax_conf.plot(mu.index, mu.values, 'o-', color=sc.get(s,'gray'), label=s, linewidth=2, markersize=6)
ax_conf.set_xlabel('Feature Dropout Rate')
ax_conf.set_ylabel('Accuracy on Conflicts')
ax_conf.set_title('(d) RQ3: Conflict Resolution', fontweight='bold')
ax_conf.legend(fontsize=8)

plt.suptitle('SWIM: Multi-Agent Architecture for Robust HAB Prediction',
             fontsize=16, fontweight='bold', y=1.01)
plt.tight_layout()
plt.savefig(RESULTS_DIR/'figures'/'fig6_publication_4panel.png', dpi=300, bbox_inches='tight')
plt.show()
print('Publication figure saved.')

---
## 8. Save & Export

In [None]:
# ─── Save Everything ───
# Result tables, one CSV per experiment.
csv_exports = [
    ('clean_comparison.csv', clean_df),
    ('dropout_experiment.csv', dropout_df),
    ('selective_failure.csv', sel_df),
    ('communication_experiment.csv', comm_df),
    ('conflict_resolution.csv', conf_df),
]
for csv_name, table in csv_exports:
    table.to_csv(RESULTS_DIR / csv_name, index=False)
# The significance table is only written when at least one test was run.
if len(sig_df) > 0:
    sig_df.to_csv(RESULTS_DIR / 'statistical_tests.csv', index=False)

# Run metadata: dataset summary, agent validation scores, clean-test results.
dataset_info = {'total': len(research_df), 'features': ALL_FEATURES,
                'insitu': INSITU_AVAIL, 'satellite': SAT_AVAIL}
agent_info = {'CALIBRO': {'feats': INSITU_AVAIL, 'val_auroc': calibro.val_auroc},
              'VISIOS': {'feats': SAT_AVAIL, 'val_auroc': visios.val_auroc}}
meta = {
    'experiment': 'Phase3_Agentic_vs_Monolithic',
    'date': datetime.now().isoformat(),
    'RQs': ['RQ1: specialization','RQ2: communication','RQ3: conflict resolution'],
    'dataset': dataset_info,
    'agents': agent_info,
    'clean_results': clean_df.to_dict(orient='records'),
}
# default=str stringifies anything json cannot serialise natively.
with open(RESULTS_DIR / 'experiment_metadata.json', 'w') as meta_file:
    json.dump(meta, meta_file, indent=2, default=str)

# Bundle the whole results directory (CSVs, JSON, figures) into one archive,
# with entry names relative to the current working directory.
zip_path = 'swim_research_phase3_results.zip'
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
    for folder, _subdirs, filenames in os.walk(RESULTS_DIR):
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            zf.write(full_path, os.path.relpath(full_path, '.'))

zs = os.path.getsize(zip_path) / (1024*1024)
print(f'\nResults ZIP: {zip_path} ({zs:.1f} MB)')
# List the archive contents with their uncompressed sizes.
with zipfile.ZipFile(zip_path, 'r') as zf:
    for entry in zf.infolist():
        print(f'  {entry.filename:<50s} {entry.file_size/1024:>8.1f} KB')
print(f'\nDownload: {os.path.abspath(zip_path)}')

---
## Summary

### Research Questions and Experiments:
| RQ | Question | Experiment |
|----|----------|------------|
| RQ1 | Agent specialization vs monolithic | Feature dropout robustness curves |
| RQ2 | Inter-agent communication | Independent vs sharing vs gated |
| RQ3 | Conflict resolution | Average vs confidence vs entropy |

### Next Steps:
- **Phase 4**: Scale to more lakes (104 planned), longer time series
- **Phase 5**: Integrate with real SWIM A2A agents
- **Paper**: Write up results with publication figures