# Loop 9 Analysis: Understanding the exp_003 Gap and Next Steps

## Key Questions:
1. Why can't we reproduce exp_003's CV of 0.81951?
2. What's our true baseline CV?
3. What fundamentally different approaches should we try?

In [None]:
import pandas as pd
import numpy as np
import json

# Load the persisted session state. JSON text is UTF-8 by specification, so
# decode explicitly instead of relying on the platform's locale default
# (which can mis-decode non-ASCII characters on some systems).
with open('/home/code/session_state.json', 'r', encoding='utf-8') as f:
    state = json.load(f)

# List every recorded experiment: id, name (cut/padded to exactly 50 characters
# so the CV column lines up), and the score shown with 5 decimals.
print("=== EXPERIMENT HISTORY ===")
for exp in state['experiments']:
    print(f"{exp['id']}: {exp['name'][:50]:50s} | CV: {exp['score']:.5f}")

In [None]:
# Analyze submissions and the CV-LB relationship.
# NOTE: relies on `state` having been loaded earlier in the notebook.
from scipy import stats

print("\n=== SUBMISSION ANALYSIS ===")
submissions = state['submissions']
for sub in submissions:
    # Gap between the CV score and the LB score, both in absolute terms and
    # as a percentage of the CV score.
    gap = sub['cv_score'] - sub['lb_score']
    gap_pct = gap / sub['cv_score'] * 100
    print(f"{sub['experiment_id']}: CV={sub['cv_score']:.5f}, LB={sub['lb_score']:.5f}, Gap={gap:.5f} ({gap_pct:.2f}%)")

# Fit LB as a linear function of CV over all submissions.
cv_scores = [s['cv_score'] for s in submissions]
lb_scores = [s['lb_score'] for s in submissions]

slope, intercept, r_value, p_value, std_err = stats.linregress(cv_scores, lb_scores)
# r_value is the correlation coefficient; squaring it gives R-squared.
# The superscript two is written as an ASCII escape (\u00b2) so the label can
# no longer be corrupted into "RÂ²" when the file is re-saved or exported with
# the wrong encoding.
print(f"\nLinear model: LB = {slope:.3f} * CV + {intercept:.3f} (R\u00b2={r_value**2:.3f})")

# Invert the fitted line to find the CV at which the predicted LB equals the
# best LB observed so far.
best_lb = max(lb_scores)
required_cv = (best_lb - intercept) / slope
print(f"\nTo beat best LB ({best_lb:.5f}): Need CV > {required_cv:.5f}")

In [None]:
# Summary of the exp_003 reproducibility puzzle: the reported exp_003 CV next
# to the exp_008 multi-seed results, followed by candidate explanations.
# All figures are hard-coded notes; nothing is computed in this cell.
exp003_mystery_report = (
    "\n=== EXP_003 MYSTERY ===",
    "exp_003 CV: 0.81951",
    "exp_008 multi-seed results:",
    "  - Seed 42: CV = 0.81617",
    "  - Seed 123: CV = 0.81548",
    "  - Seed 456: CV = 0.81629",
    "  - Seed 789: CV = 0.81606",
    "  - Seed 1000: CV = 0.81433",
    "  - Ensemble: CV = 0.81698",
    "  - Mean: 0.81567, Std: 0.00072",
    "\nGap: 0.81951 - 0.81698 = 0.00253 (0.31%)",
    "\nPossible explanations:",
    "1. exp_003 used Optuna tuning - may have found a lucky configuration",
    "2. exp_003's CV estimate was optimistic (lucky fold splits)",
    "3. Something in the pipeline changed between exp_003 and exp_008",
    "4. Optuna's early stopping behavior differs from fixed iterations",
)

# One entry per output line; sep="\n" yields the same text as one print() each.
print(*exp003_mystery_report, sep="\n")

In [None]:
# Written-up conclusion about the realistic baseline, followed by a recap of
# the established facts. All numbers are hard-coded notes.
baseline_section = (
    "\n=== TRUE BASELINE ANALYSIS ===",
    "\nIf exp_003 was a lucky run, our true CV is ~0.817",
    "Based on CV-LB model: LB = 0.543 * 0.817 + 0.359 = 0.803",
    "This matches exp_006's LB of 0.8010 (CV=0.8171)",
    "\nConclusion: Our true baseline is likely ~0.817 CV, ~0.803 LB",
    "exp_003's LB of 0.8045 may have been lucky too",
)

known_facts_section = (
    "\n=== WHAT WE KNOW ===",
    "1. CatBoost with depth=8, lr=0.051, iterations=755 gives CV ~0.816-0.817",
    "2. Multi-seed ensemble gives CV ~0.817",
    "3. exp_003's CV of 0.81951 is NOT reproducible",
    "4. To beat exp_003's LB (0.8045), we need CV > 0.82086",
    "5. We're stuck at CV ~0.817, need +0.4% improvement",
)

# Emit both sections in order, one print per line of the report.
for report_line in baseline_section + known_facts_section:
    print(report_line)

In [None]:
# Catalogue of approaches not yet tried. Each entry pairs a heading (with its
# priority) with the bullet points printed beneath it.
untried_approaches = [
    (
        "\n1. TARGET ENCODING (HIGH PRIORITY)",
        [
            "   - Top kernels use target encoding",
            "   - We've only used label encoding",
            "   - Could capture category-target relationships",
            "   - Must use CV-based encoding to avoid leakage",
        ],
    ),
    (
        "\n2. 10-FOLD CV (MEDIUM PRIORITY)",
        [
            "   - Top kernels use 10-fold CV",
            "   - More stable estimates",
            "   - May slightly improve generalization",
        ],
    ),
    (
        "\n3. KNN IMPUTATION (MEDIUM PRIORITY)",
        [
            "   - Top kernels mention KNN imputation",
            "   - We're using mode/median imputation",
            "   - May capture more complex relationships",
        ],
    ),
    (
        "\n4. NEURAL NETWORK (LOW PRIORITY)",
        [
            "   - For ensemble diversity",
            "   - TabNet or simple MLP",
            "   - Different inductive bias than GBDTs",
        ],
    ),
    (
        "\n5. PSEUDO-LABELING (LOW PRIORITY)",
        [
            "   - Use high-confidence predictions on test",
            "   - Retrain with pseudo-labels",
            "   - Risky but could help",
        ],
    ),
]

print("\n=== UNEXPLORED APPROACHES ===")
for heading, bullets in untried_approaches:
    print(heading)
    for bullet in bullets:
        print(bullet)

In [None]:
# Notes on how exp_003 was produced (Optuna tuning) and the working
# hypothesis for why its CV cannot be reproduced, then the recommendation.
# The text is static; it is joined and written out in a single call.
exp003_analysis_text = "\n".join([
    "\n=== EXP_003 ANALYSIS ===",
    "\nexp_003 used Optuna tuning with 50 trials",
    "Best params found: depth=8, lr=0.051, iterations=755, l2_leaf_reg=3.52",
    "\nKey difference: Optuna uses early stopping during tuning",
    "This means the 'iterations=755' was found with early stopping",
    "When we re-run with fixed iterations=755, behavior may differ",
    "\nHypothesis: The Optuna run had different early stopping behavior",
    "that led to a lucky CV estimate. The true CV is ~0.817.",
    "\n=== RECOMMENDATION ===",
    "1. Accept that our true baseline is ~0.817 CV",
    "2. Try target encoding - fundamentally different approach",
    "3. If target encoding doesn't help, try 10-fold CV",
    "4. Focus on getting CV > 0.82 to beat exp_003's LB",
])

print(exp003_analysis_text)

In [None]:
# Quantify the CV improvement required to reach a target LB score.
# NOTE: relies on `slope` and `intercept` already being defined by the
# CV-LB linear fit earlier in the notebook.
print("\n=== IMPROVEMENT NEEDED ===")

target_lb = 0.8045  # exp_003's LB
current_cv = 0.817

# Solve target_lb = slope * CV + intercept for CV, then measure the distance
# from the current CV.
required_cv = (target_lb - intercept) / slope
improvement_needed = required_cv - current_cv

improvement_summary = (
    f"Current CV: {current_cv:.5f}",
    f"Target LB: {target_lb:.5f}",
    f"Required CV: {required_cv:.5f}",
    f"Improvement needed: {improvement_needed:.5f} ({improvement_needed/current_cv*100:.2f}%)",
    "\nThis is a significant improvement (~0.4%)",
    "Incremental changes won't get us there",
    "Need a fundamentally different approach",
)
for summary_line in improvement_summary:
    print(summary_line)

In [None]:
# Closing summary: a banner, then numbered recommendations, each followed by
# its supporting detail lines. All content is static text.
rule = "=" * 60
print("\n" + rule)
print("FINAL RECOMMENDATION")
print(rule)

recommendations = (
    (
        "\n1. DO NOT SUBMIT exp_008 (CV=0.81698 < exp_003's CV=0.81951)",
        (
            "   Based on CV-LB model, predicted LB ~0.802 (worse than 0.8045)",
        ),
    ),
    (
        "\n2. TRY TARGET ENCODING NEXT",
        (
            "   - Fundamentally different from label encoding",
            "   - Top kernels use it",
            "   - Could unlock new signal",
        ),
    ),
    (
        "\n3. IF TARGET ENCODING FAILS, TRY:",
        (
            "   - 10-fold CV (more stable estimates)",
            "   - KNN imputation (better missing value handling)",
            "   - Neural network for ensemble diversity",
        ),
    ),
    (
        "\n4. REALITY CHECK:",
        (
            "   - Target 0.9642 is IMPOSSIBLE (top LB ~0.8066)",
            "   - Our best LB 0.8045 is already top ~7%",
            "   - Gap to top: 0.0021 (0.26%)",
            "   - We're competitive, but need breakthrough for improvement",
        ),
    ),
)

for heading, details in recommendations:
    print(heading)
    for detail in details:
        print(detail)