# Phase 1: Statistical Baselines & Feature Distributions

**Question:** Can simple graph-level statistics already predict hallucinations? How much does the GNN add?

This notebook:
1. Extracts graph-level features from all 486 samples
2. Runs distribution analysis (violin plots, Mann-Whitney U, Cohen's d)
3. Fits logistic regression and random forest baselines (5-fold stratified CV)
4. Computes mutual information between features and labels
5. Plots baseline ROC curves so their AUCs can be compared against GNN performance

In [None]:
import sys
sys.path.insert(0, '../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from calamr_interp.utils.data_loading import load_dataset
from calamr_interp.utils.visualization import setup_style, violin_comparison, roc_curves, COLORS
from calamr_interp.phase1_baselines import (
    GraphFeatureExtractor,
    StatisticalBaseline,
    distribution_analysis,
)

# Apply the project's shared plotting style before any figure is drawn
# (presumably sets matplotlib/seaborn defaults; see calamr_interp.utils.visualization).
setup_style()
# Sanity message: reaching this line means every import above resolved.
print("Imports OK")

## 1. Load Data & Extract Features

In [None]:
# Load the full graph dataset (486 graphs are expected) and report the class balance.
dataset = load_dataset()
print(f"Loaded {len(dataset)} graphs")

# Each graph carries its label in `y`; label 1 is counted as hallucination, 0 as truth.
n_hallucination = sum(graph.y.item() for graph in dataset)
n_truth = sum(1 - graph.y.item() for graph in dataset)
print(f"Labels: {n_hallucination} hallucination, {n_truth} truth")

# Turn every graph into one row of graph-level features, plus the matching labels.
extractor = GraphFeatureExtractor()
features_df, labels = extractor.extract_batch(dataset)

n_graphs, n_features = features_df.shape
print(f"\nExtracted {n_features} features from {n_graphs} graphs")

# Bare last expression: render summary statistics for every feature column.
features_df.describe()

## 2. Distribution Analysis

In [None]:
# Run the per-feature statistical tests (presumably truth vs. hallucination groups).
# Later cells read the 'feature', 'mann_whitney_p', 'cohens_d' and 'mutual_info'
# columns of the result.
dist_results = distribution_analysis(features_df, labels)
# Bare last expression so the notebook renders the results table.
dist_results

In [None]:
# Split every feature column by label so each violin panel can contrast
# truth (0) against hallucination (1).
truth_mask = labels == 0
hallucination_mask = labels == 1

data_dict = {
    col: {
        0: features_df.loc[truth_mask, col].values,
        1: features_df.loc[hallucination_mask, col].values,
    }
    for col in features_df.columns
}

# One violin panel per feature, laid out four to a row.
violin_title = "Feature Distributions: Truth vs Hallucination"
fig = violin_comparison(data_dict, title=violin_title, ncols=4)
plt.show()

In [None]:
# Highlight features whose Mann-Whitney U p-value falls below the significance level.
# ALPHA is the single source of truth for the threshold: it drives both the filter
# and the printed message, so the two cannot drift apart when the level is tuned.
ALPHA = 0.05

# NOTE(review): these p-values are compared against ALPHA as-is; no multiple-comparison
# correction is applied in this cell, so with many features some hits may be
# false positives. Confirm whether distribution_analysis already corrects them.
significant = dist_results[dist_results['mann_whitney_p'] < ALPHA]

print(f"{len(significant)} features are statistically significant (p < {ALPHA}):")
# Plain-text table of the significant rows, without the DataFrame index.
summary_columns = ['feature', 'mann_whitney_p', 'cohens_d', 'mutual_info']
print(significant[summary_columns].to_string(index=False))

## 3. Baseline Classifiers

In [None]:
# Cross-validated baselines (5-fold stratified CV, per the notebook outline).
# X is the raw feature matrix; y is the label vector from feature extraction.
X, y = features_df.values, labels

# Fixed seed so the reported scores are reproducible across runs.
baseline = StatisticalBaseline(seed=42)
results = baseline.evaluate(X, y)

# (display label, key stem) pairs; each stem maps to test_<stem>_mean / test_<stem>_std.
metric_rows = (("F1:", "f1"), ("Accuracy:", "accuracy"), ("AUC:", "auc"))

for name, metrics in results.items():
    print(f"\n{name}:")
    for label, stem in metric_rows:
        mean = metrics[f"test_{stem}_mean"]
        std = metrics[f"test_{stem}_std"]
        # Labels are left-padded to 10 characters so the numbers line up in a column.
        print(f"  {label:<10}{mean:.3f} +/- {std:.3f}")

In [None]:
# Rank features by Random Forest importance and draw them as a horizontal bar chart.
feature_names = features_df.columns.tolist()
importance_df = baseline.feature_importance(X, y, feature_names)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    data=importance_df,
    x='importance',
    y='feature',
    color=COLORS['primary'],
    ax=ax,
)
ax.set(title='Random Forest Feature Importance', xlabel='Importance')
fig.tight_layout()
plt.show()

## 4. Mutual Information

In [None]:
# The distribution analysis already computed mutual information per feature;
# reuse that column here, ordered from most to least informative.
mi_df = (
    dist_results
    .loc[:, ['feature', 'mutual_info']]
    .sort_values(by='mutual_info', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(
    data=mi_df,
    x='mutual_info',
    y='feature',
    color=COLORS['secondary'],
    ax=ax,
)
ax.set(title='Mutual Information: Feature vs Label', xlabel='Mutual Information (nats)')
fig.tight_layout()
plt.show()

## 5. ROC Curve Comparison

In [None]:
# Collect the baseline classifiers' ROC curves and draw them on one figure.
curves = baseline.get_roc_curves(X, y)
fig = roc_curves(curves, title='Baseline ROC Curves')
plt.show()

# Closing reminder: the GNN's AUC is not computed here and must be compared by the reader.
takeaway_lines = (
    "\n--- Key Takeaway ---",
    "Compare these AUCs against the GNN's AUC to quantify",
    "the 'value of graph structure' beyond aggregate statistics.",
)
for line in takeaway_lines:
    print(line)