# Strategic Analysis - Loop 11

## Key Decision: Should we submit exp_011?

exp_011 achieved CV 0.82032 - the BEST CV ever. But the evaluator notes:
- High fold variance (0.01408 std vs 0.00685 in exp_003)
- Fold scores span 4.4 percentage points (0.79862 to 0.84253)
- Predicted LB ~0.8042 (slightly below best 0.8045)

Let's analyze the CV-LB relationship to make an informed decision.

In [None]:
import numpy as np
import pandas as pd
from scipy import stats

# Historical CV-LB data: one record per submission, pairing the experiment's
# cross-validation score ('cv') with its leaderboard score ('lb').
submissions = [
    {'exp': 'exp_000', 'cv': 0.8067, 'lb': 0.7971},
    {'exp': 'exp_003', 'cv': 0.8195, 'lb': 0.8045},
    {'exp': 'exp_004', 'cv': 0.8193, 'lb': 0.8041},
    {'exp': 'exp_006', 'cv': 0.8171, 'lb': 0.8010},
]

df = pd.DataFrame(submissions)
print("Historical CV-LB relationship:")
print(df)
print(f"\nCorrelation: {df['cv'].corr(df['lb']):.4f}")

# Linear regression to predict LB from CV.
# NOTE: the fit uses only the four submissions above, so treat the
# coefficients and R^2 as a rough guide rather than a precise model.
slope, intercept, r_value, p_value, std_err = stats.linregress(df['cv'], df['lb'])
print(f"\nLinear model: LB = {slope:.4f} * CV + {intercept:.4f}")
# Plain ASCII label: the previous superscript-two character was mis-encoded
# and rendered as garbage in the output.
print(f"R^2 = {r_value**2:.4f}")

# Predict LB for exp_011
exp_011_cv = 0.82032
predicted_lb = slope * exp_011_cv + intercept

# Take the best LB from the table rather than hard-coding it, so the
# comparison stays correct when new submissions are appended above.
best_lb = df['lb'].max()

print(f"\nexp_011 CV: {exp_011_cv:.5f}")
print(f"Predicted LB: {predicted_lb:.5f}")
print(f"Best LB so far: {best_lb:.4f}")
print(f"Difference: {predicted_lb - best_lb:.5f}")

In [None]:
# CV-LB gap per submission: 'gap' is CV minus LB, and 'gap_pct' expresses
# that gap as a percentage of the CV score. Both columns are added to df.
df['gap'] = df['cv'].sub(df['lb'])
df['gap_pct'] = df['gap'].div(df['cv']).mul(100)

gap_columns = ['exp', 'cv', 'lb', 'gap', 'gap_pct']
mean_gap = df['gap'].mean()

print("CV-LB Gap Analysis:")
print(df[gap_columns])
print(f"\nMean gap: {mean_gap:.4f} ({df['gap_pct'].mean():.2f}%)")
print(f"Gap std: {df['gap'].std():.4f}")

# Alternative LB estimate for exp_011: subtract the mean historical gap
# from its CV score.
exp_011_cv = 0.82032
predicted_lb_gap = exp_011_cv - mean_gap
print(f"\nexp_011 CV: {exp_011_cv:.5f}")
print(f"Predicted LB (using mean gap): {predicted_lb_gap:.5f}")

In [None]:
# Key question: Is exp_011's CV improvement real or due to fold variance?
#
# exp_003 CV: 0.81951 (+/- 0.00685) - 5-fold
# exp_011 CV: 0.82032 (+/- 0.01408) - 10-fold
#
# The improvement is 0.00081 in absolute terms (about 0.10% relative to
# exp_003's CV), but exp_011's fold std is 0.01408 - much larger than the
# improvement itself.

# Name the two CV scores once so the arithmetic and the printed text
# cannot drift apart.
exp_003_cv = 0.81951
exp_011_cv = 0.82032
improvement = exp_011_cv - exp_003_cv
exp_011_std = 0.01408
exp_003_std = 0.00685

print("CV Improvement Analysis:")
print(f"exp_003 CV: {exp_003_cv:.5f} (+/- {exp_003_std:.5f})")
print(f"exp_011 CV: {exp_011_cv:.5f} (+/- {exp_011_std:.5f})")
# The percentage printed here is relative to exp_003's CV score.
print(f"Improvement: {improvement:.5f} ({improvement/exp_003_cv*100:.2f}%)")
print(f"\nIs improvement significant?")
print(f"Improvement ({improvement:.5f}) vs exp_011 std ({exp_011_std:.5f})")
print(f"Improvement is {improvement/exp_011_std:.2f}x the std - NOT statistically significant")

# But the key insight is that regularization IMPROVED CV, not decreased it.
# A held-out score that rises when the model is regularized more points to
# overfitting in the less-regularized model, so the message says OVERFITTING
# (it previously claimed the opposite).
print("\n=== KEY INSIGHT ===")
print(f"Regularization IMPROVED CV ({exp_011_cv:.5f} vs {exp_003_cv:.5f})")
print("This suggests we were OVERFITTING, not underfitting!")
print("The regularization may help with generalization even if CV improvement is small.")

In [None]:
# Submission decision for exp_011: print the arguments for and against,
# followed by the recommendation. Each entry is one output line; a leading
# "\n" in an entry produces the blank line before a section heading.
decision_lines = (
    "=== SUBMISSION DECISION MATRIX ===",
    "\nPros of submitting exp_011:",
    "1. Best CV ever (0.82032)",
    "2. Regularization may improve generalization",
    "3. 6 submissions remaining - worth testing",
    "4. Need to verify CV-LB relationship with regularized model",
    "\nCons of submitting exp_011:",
    "1. High fold variance (0.01408) - CV may be inflated",
    "2. Predicted LB (~0.8042) is below best (0.8045)",
    "3. CV improvement is not statistically significant",
    "\n=== RECOMMENDATION ===",
    "SUBMIT exp_011",
    "Reason: Even if CV improvement is within noise, the regularization",
    "approach is fundamentally different and may generalize better.",
    "We need LB feedback to calibrate our CV-LB model for regularized models.",
)
for decision_line in decision_lines:
    print(decision_line)

In [None]:
# Follow-up plan after the submission: print the candidate experiments and
# a closing reality check. Each entry is one output line; a leading "\n"
# in an entry produces the blank line before a numbered item or heading.
next_step_lines = (
    "=== NEXT EXPERIMENTS ===",
    "\n1. GroupKFold (Evaluator's top recommendation)",
    "   - May reduce fold variance by respecting group structure",
    "   - 77.3% are solo travelers, so effect may be limited",
    "   - Worth trying to get more stable CV estimates",
    "\n2. KNN Imputation (Top solution technique)",
    "   - Different data preprocessing approach",
    "   - May capture relationships between features",
    "   - Could provide orthogonal improvement",
    "\n3. Ensemble of regularized + non-regularized models",
    "   - Combine exp_003 (depth=8, less regularization) with exp_011 (depth=6, more regularization)",
    "   - Different regularization levels may capture different patterns",
    "\n=== REALITY CHECK ===",
    "Target 0.9642 is IMPOSSIBLE. Top LB is ~0.8066.",
    "Our best (0.8045) is already top ~7%.",
    "Focus on incremental improvements toward 0.81 LB, not 0.96.",
)
for next_step_line in next_step_lines:
    print(next_step_line)