# Evolver Loop 2: Training Dynamics Analysis

## Goal
Analyze why ResNet50 with fine-tuning and test-time augmentation (TTA) achieved only a 2.4% improvement instead of the expected 30-40%.

## Key Questions
1. Are the training dynamics healthy (convergence, overfitting, stability)?
2. Is the optimization configuration appropriate (LR, schedule, batch size)?
3. Are there data loading or augmentation issues?
4. What hyperparameters need tuning?

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load session state to understand experiment history.
#
# The reference scores are defined once here so that the printed figures and
# the percentages derived from them cannot drift apart. Lower is treated as
# better, consistent with min() being used for the best CV score below.
BASELINE_SCORE = 0.0736  # exp_000
BEST_SCORE = 0.0718      # exp_006
GOLD_SCORE = 0.038820    # score reported as "gold" in the summary

# Explicit encoding: JSON is UTF-8 by specification, so do not depend on the
# platform's default locale encoding.
with open('/home/code/session_state.json', 'r', encoding='utf-8') as f:
    session_state = json.load(f)

experiments = session_state['experiments']
# default=nan keeps the summary printable when no experiments are recorded
# (a bare min() over an empty sequence raises ValueError).
best_cv_score = min((exp['score'] for exp in experiments), default=float('nan'))

print("=== EXPERIMENT SUMMARY ===")
print(f"Total experiments: {len(experiments)}")
print(f"Best CV score: {best_cv_score:.4f}")
print(f"Baseline (exp_000): {BASELINE_SCORE:.4f}")
print(f"Best (exp_006): {BEST_SCORE:.4f}")
# Relative improvement of the best run over the baseline.
print(f"Improvement: {(BASELINE_SCORE - BEST_SCORE) / BASELINE_SCORE * 100:.1f}%")
# Absolute and relative distance still remaining to the gold score.
print(f"Gap to gold: {BEST_SCORE - GOLD_SCORE:.4f}")
print(f"Improvement needed: {(BEST_SCORE - GOLD_SCORE) / BEST_SCORE * 100:.1f}%")

In [None]:
# Extract fold-level details from exp_006 (ResNet50 fine-tuning)
# Based on the notebook output we saw

# Schedule constants, named once so the table and the early-stopping check
# below always agree.
PHASE1_EPOCHS = 3  # Fixed at 3
# A fold that trained fewer epochs than this is counted as early stopped.
# NOTE(review): presumably the configured epoch budget - confirm against the
# training config.
MAX_EPOCHS = 8

epochs_trained = [4, 8, 4, 5, 7]  # When early stopping triggered

fold_results = {
    'fold': [1, 2, 3, 4, 5],
    'final_val_loss': [0.0742, 0.0679, 0.0705, 0.0735, 0.0728],
    'best_val_loss': [0.0650, 0.0681, 0.0673, 0.0621, 0.0700],  # From training logs
    'epochs_trained': epochs_trained,
    'phase1_epochs': [PHASE1_EPOCHS] * len(epochs_trained),
    # Derived (Total - phase1) instead of hand-typed, so it cannot fall out of
    # sync with epochs_trained.
    'phase2_epochs': [total - PHASE1_EPOCHS for total in epochs_trained],
}

df = pd.DataFrame(fold_results)
print("=== FOLD-LEVEL ANALYSIS ===")
print(df)
print()

print("=== KEY METRICS ===")
# Series.std() is the sample standard deviation (ddof=1).
print(f"Mean final loss: {df['final_val_loss'].mean():.4f} ± {df['final_val_loss'].std():.4f}")
print(f"Mean best loss: {df['best_val_loss'].mean():.4f} ± {df['best_val_loss'].std():.4f}")
print(f"Mean epochs trained: {df['epochs_trained'].mean():.1f}")
print()

# Calculate degradation from best to final
# NOTE(review): fold 2 has best_val_loss (0.0681) above final_val_loss (0.0679),
# so its degradation is negative and lowers the mean - confirm the two columns
# were measured the same way.
df['degradation'] = df['final_val_loss'] - df['best_val_loss']
print(f"Mean degradation from best to final: {df['degradation'].mean():.4f}")
print(f"Max degradation: {df['degradation'].max():.4f} (fold {df.loc[df['degradation'].idxmax(), 'fold']})")
print()

# Check if early stopping is too aggressive
# The mask is computed once and reused for both the count and the average.
early_stopped_mask = df['epochs_trained'] < MAX_EPOCHS
early_stopped_folds = int(early_stopped_mask.sum())
print(f"Folds that early stopped: {early_stopped_folds}/{len(df)}")
print(f"Average Phase 2 epochs when early stopped: {df.loc[early_stopped_mask, 'phase2_epochs'].mean():.1f}")