# Loop 76 Analysis: CV-LB Relationship and Strategy

## Key Questions:
1. What is the exact relationship between the cross-validation (CV) score and the leaderboard (LB) score?
2. Can we reach the target (0.0347) with current approaches?
3. What fundamentally different approaches might work?

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

# Every submission for which both a CV score and a leaderboard (LB) score are
# recorded. The rows are written as compact tuples and expanded into dicts so
# that `submissions` remains a list of {'exp', 'cv', 'lb'} records.
submissions = [
    {'exp': exp_id, 'cv': cv_score, 'lb': lb_score}
    for exp_id, cv_score, lb_score in (
        ('exp_000', 0.0111, 0.0982),
        ('exp_001', 0.0123, 0.1065),
        ('exp_003', 0.0105, 0.0972),
        ('exp_005', 0.0104, 0.0969),
        ('exp_006', 0.0097, 0.0946),
        ('exp_007', 0.0093, 0.0932),
        ('exp_009', 0.0092, 0.0936),
        ('exp_012', 0.0090, 0.0913),
        ('exp_024', 0.0087, 0.0893),
        ('exp_026', 0.0085, 0.0887),
        ('exp_030', 0.0083, 0.0877),
        ('exp_035', 0.0098, 0.0970),
        ('exp_067', 0.0083, 0.0877),
    )
]

# One row per submission; column order is fixed explicitly.
df = pd.DataFrame(submissions, columns=['exp', 'cv', 'lb'])

# Emit the header, the full table and the row count in a single call.
print(
    'Submissions with verified LB scores:',
    df.to_string(index=False),
    f'\nTotal verified submissions: {len(df)}',
    sep='\n',
)

In [None]:
# Ordinary least-squares fit of leaderboard score against CV score:
#   LB = slope * CV + intercept
# The fifth value returned by linregress (std_err) is the standard error of the
# estimated slope, not of the intercept.
slope, intercept, r_value, p_value, std_err = stats.linregress(df['cv'], df['lb'])

print('=== CV-LB Relationship Analysis ===')
print(f'Linear fit: LB = {slope:.4f} * CV + {intercept:.4f}')
print(f'R² = {r_value**2:.4f}')
print(f'Standard error: {std_err:.4f}')
print('\nInterpretation:')
print(f'  - Slope: {slope:.2f}x (CV improvement translates to {slope:.2f}x LB improvement)')
print(f'  - Intercept: {intercept:.4f} (structural gap even at CV=0)')

# Target analysis
target = 0.0347
print('\n=== Target Analysis ===')
print(f'Target LB: {target}')
print(f'Intercept: {intercept:.4f}')
print(f'Gap: Intercept - Target = {intercept - target:.4f}')

# CV score that the fitted line maps onto the target LB. Computed once because
# both branches below report the same quantity.
required_cv = (target - intercept) / slope

if intercept > target:
    print(f'\n⚠️ CRITICAL: Intercept ({intercept:.4f}) > Target ({target})')
    print(f'   Standard CV optimization CANNOT reach the target!')
    # Derive the annotation from the computed value instead of asserting it:
    # the required CV is negative here only when the fitted slope is positive.
    if required_cv < 0:
        feasibility = 'IMPOSSIBLE - negative'
    else:
        feasibility = 'non-negative only because the fitted slope is not positive'
    print(f'   Required CV for target: {required_cv:.4f} ({feasibility})')
else:
    print(f'\n✓ Target is reachable with CV = {required_cv:.4f}')

In [None]:
# Project the best cross-validation score through the fitted line to see
# what leaderboard score it would correspond to.
best_cv = 0.0081  # Best CV from exp_049/050/053
predicted_lb = slope * best_cv + intercept

# Summary of the projection, emitted as one multi-line message.
print('\n'.join([
    '\n=== Prediction for Best CV ===',
    f'Best CV achieved: {best_cv}',
    f'Predicted LB: {predicted_lb:.4f}',
    f'Target: {target}',
    f'Gap: {predicted_lb - target:.4f}',
]))

# Invert the fit: for each leaderboard milestone, which CV would be required?
lb_milestones = (0.0700, 0.0600, 0.0500, 0.0400, 0.0347)
print('\n=== What CV is needed? ===')
for target_lb in lb_milestones:
    needed_cv = (target_lb - intercept) / slope
    print(f'  For LB {target_lb:.4f}: CV = {needed_cv:.4f}')

In [None]:
# Plot the CV-LB relationship
# Destination of the rendered figure. Defined once so the save call and the
# confirmation message below cannot drift apart.
plot_path = Path('/home/code/exploration/cv_lb_relationship.png')

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['cv'], df['lb'], s=100, alpha=0.7, label='Submissions')

# Fit line, drawn from CV=0 so the intercept is visible on the y-axis
cv_range = np.linspace(0, 0.015, 100)
lb_fit = slope * cv_range + intercept
ax.plot(cv_range, lb_fit, 'r--', label=f'Fit: LB = {slope:.2f}*CV + {intercept:.4f}')

# Target line
ax.axhline(y=target, color='g', linestyle=':', label=f'Target: {target}')

# Intercept
ax.axhline(y=intercept, color='orange', linestyle=':', alpha=0.5, label=f'Intercept: {intercept:.4f}')

ax.set_xlabel('CV Score (MSE)')
ax.set_ylabel('LB Score (MSE)')
ax.set_title('CV vs LB Relationship')
ax.legend()
ax.grid(True, alpha=0.3)

# savefig does not create missing directories, so make sure the target
# directory exists before writing.
plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(plot_path, dpi=100, bbox_inches='tight')
plt.show()
print(f'\nPlot saved to {plot_path}')

In [None]:
# Residual check: how far does each submission sit from the fitted CV-LB line,
# and does any submission deviate enough to count as an outlier?
# The three derived columns are added to df in place.
df['predicted_lb'] = slope * df['cv'] + intercept
df['residual'] = df['lb'] - df['predicted_lb']
df['residual_pct'] = df['residual'] / df['lb'] * 100

report_columns = ['exp', 'cv', 'lb', 'predicted_lb', 'residual', 'residual_pct']
residual_mean = df['residual'].mean()
residual_std = df['residual'].std()

print('=== Residual Analysis ===')
print(df[report_columns].to_string(index=False))
print(f'\nMean residual: {residual_mean:.4f}')
print(f'Std residual: {residual_std:.4f}')
print('\nAny outliers (|residual| > 2*std)?')

# A submission is flagged when its absolute residual exceeds two standard
# deviations of the residuals.
outliers = df.loc[df['residual'].abs() > 2 * residual_std]
if outliers.empty:
    print('  No significant outliers - all approaches follow the same CV-LB line')
else:
    print(outliers[['exp', 'cv', 'lb', 'residual']])

In [None]:
# Narrative summary of the fit: the report texts are built first as named
# strings, then printed under their section banners.

# Interpretation of the fitted intercept relative to the target score.
intercept_summary = f'''
The CV-LB relationship is:
  LB = {slope:.2f} * CV + {intercept:.4f}

The intercept ({intercept:.4f}) represents the STRUCTURAL DISTRIBUTION SHIFT
between training and test data. This is the error that exists even if we had
a perfect model (CV=0).

The target is {target}, which is BELOW the intercept ({intercept:.4f}).

This means:
1. Standard CV optimization CANNOT reach the target
2. All model types (MLP, LGBM, XGB, GP) fall on the same line
3. The intercept represents EXTRAPOLATION ERROR to unseen solvents

To reach the target, we MUST:
1. Find approaches that CHANGE the CV-LB relationship (reduce intercept)
2. Or find approaches that break the linear pattern

Possible strategies:
1. Extrapolation detection + conservative predictions
2. Uncertainty-weighted predictions (GP variance)
3. Chemical class-specific models
4. Physics-informed constraints
5. Study what top kernels do differently
'''

# Candidate strategies for lowering the intercept (static text, no interpolation).
intercept_strategies = '''
1. EXTRAPOLATION DETECTION:
   - Detect when test solvent is "far" from training distribution
   - Blend predictions toward population mean for outliers
   - This reduces error on extreme solvents (HFIP, Cyclohexane, Water)

2. UNCERTAINTY-WEIGHTED PREDICTIONS:
   - Use GP variance as uncertainty estimate
   - High uncertainty → conservative prediction (closer to mean)
   - This naturally handles extrapolation

3. CHEMICAL CLASS-SPECIFIC MODELS:
   - Train separate models for alcohols, ethers, polar aprotic, etc.
   - Models generalize better within chemical families
   - Reduces extrapolation error for solvents in known families

4. PHYSICS-INFORMED CONSTRAINTS:
   - Ensure predictions respect physical constraints
   - E.g., yields should sum to ~1 (probability normalization)
   - Arrhenius kinetics constraints

5. STUDY TOP KERNELS:
   - "best-work-here" normalizes predictions to probabilities
   - "mixall" uses GroupKFold(5) instead of Leave-One-Out
   - These might have different CV-LB characteristics
'''

print('=== KEY INSIGHT: THE INTERCEPT PROBLEM ===')
print(intercept_summary)

print('\n=== WHAT WOULD CHANGE THE INTERCEPT? ===')
print(intercept_strategies)