# Loop 98 Analysis: CV-LB Relationship and Path Forward

## Critical Situation:
- Best CV: 0.008092 (exp_050)
- Best LB: 0.0877 (exp_030/exp_067)
- Target: 0.0347
- Gap to target: 152.8%

## Key Question:
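The fitted CV-LB line (LB = slope * CV + intercept) has an intercept above the target, so even a CV of zero would not reach the target under this relationship. What could change the relationship itself?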

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Every experiment that has a leaderboard score, one row each:
# (experiment id, local CV score, leaderboard score).
submissions = [
    {'exp': run_id, 'cv': cv_score, 'lb': lb_score}
    for run_id, cv_score, lb_score in (
        ('exp_000', 0.0111, 0.0982),
        ('exp_001', 0.0123, 0.1065),
        ('exp_003', 0.0105, 0.0972),
        ('exp_005', 0.0104, 0.0969),
        ('exp_006', 0.0097, 0.0946),
        ('exp_007', 0.0093, 0.0932),
        ('exp_009', 0.0092, 0.0936),
        ('exp_012', 0.0090, 0.0913),
        ('exp_024', 0.0087, 0.0893),
        ('exp_026', 0.0085, 0.0887),
        ('exp_030', 0.0083, 0.0877),
        ('exp_035', 0.0098, 0.0970),
        ('exp_067', 0.0083, 0.0877),
    )
]

# Tabular view of the same records; columns follow the dict key order
# (exp, cv, lb).
df = pd.DataFrame(data=submissions)
print(f'Total submissions with LB: {df.shape[0]}')
print(df)

In [None]:
# Fit an ordinary least-squares line LB = slope * CV + intercept through the
# scored submissions held in `df`, then check whether the target leaderboard
# score is reachable under that line.
slope, intercept, r_value, p_value, std_err = stats.linregress(df['cv'], df['lb'])

# Leaderboard score we are trying to reach; stated once so every line below
# reports and uses the same number.
target_lb = 0.0347

print('\n=== CV-LB Relationship ===')
print(f'LB = {slope:.4f} * CV + {intercept:.4f}')
print(f'R² = {r_value**2:.4f}')
print(f'Intercept = {intercept:.4f}')
print(f'Target = {target_lb:.4f}')
print(f'\nIntercept > Target: {intercept > target_lb}')

# Solve target_lb = slope * CV + intercept for CV.
required_cv = (target_lb - intercept) / slope
print(f'\nRequired CV to hit target: {required_cv:.6f}')
if required_cv < 0:
    # A negative required CV means the fitted line never reaches the target
    # for any non-negative CV score.
    print('IMPOSSIBLE with current CV-LB relationship!')

In [None]:
# Plot the scored submissions in CV/LB space together with the fitted line
# and the target level, then write the figure to disk and display it.
plt.figure(figsize=(10, 6))
plt.scatter(df['cv'], df['lb'], label='Submissions', s=100, alpha=0.7)

# Fitted line, drawn from CV = 0 so the intercept is visible on the plot.
cv_range = np.linspace(0, 0.015, 100)
lb_pred = intercept + slope * cv_range
plt.plot(cv_range, lb_pred, 'r--', label=f'LB = {slope:.2f}*CV + {intercept:.4f}')

# Horizontal reference at the target leaderboard score.
plt.axhline(y=0.0347, label='Target (0.0347)', color='g', linestyle=':', linewidth=2)

# Star markers on the rows with the lowest CV and the lowest LB
# (idxmin picks the first such row when several rows tie).
best_cv_idx = df['cv'].idxmin()
best_lb_idx = df['lb'].idxmin()
best_cv_row = df.loc[best_cv_idx]
best_lb_row = df.loc[best_lb_idx]
plt.scatter(best_cv_row['cv'], best_cv_row['lb'], marker='*', s=200, c='blue',
            label=f'Best CV: {best_cv_row["exp"]}')
plt.scatter(best_lb_row['cv'], best_lb_row['lb'], marker='*', s=200, c='green',
            label=f'Best LB: {best_lb_row["exp"]}')

# Axis decoration.
plt.title('CV vs LB Relationship - All Submissions')
plt.xlabel('CV Score')
plt.ylabel('LB Score')
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()

# Save before show() so the written file contains the drawn figure.
plt.savefig('/home/code/exploration/cv_lb_relationship_loop98.png', dpi=150)
plt.show()

print('\nPlot saved to /home/code/exploration/cv_lb_relationship_loop98.png')

In [None]:
# Quantify how far the best leaderboard score is from the target.
# The best LB is read from the submissions table (`df`) rather than retyped,
# so the reported gap cannot drift from the recorded scores.
target_lb = 0.0347
best_lb = df['lb'].min()
lb_gap = best_lb - target_lb

print('=== Gap Analysis ===')
print(f'Best LB achieved: {best_lb:.4f}')
print(f'Target LB: {target_lb:.4f}')
# Relative gap is expressed as a percentage of the target score.
print(f'Gap: {lb_gap:.4f} ({lb_gap / target_lb * 100:.1f}%)')

# Candidate levers for shifting the intercept of the CV-LB relationship.
print('\n=== What would change the intercept? ===')
print('1. Different validation scheme (but we use official CV)')
print('2. Different feature engineering that generalizes better')
print('3. Approaches that are more conservative when extrapolating')
print('4. Fundamentally different model architecture')
print("5. Something the top competitors know that we don't")

In [None]:
# Compare the top two public leaderboard scores with our best submission.
first_place, second_place, our_best = 0.0347, 0.07074, 0.0877
gap_first_second = second_place - first_place
gap_second_us = our_best - second_place

print('=== Leaderboard Analysis ===')
print('1st place: 0.03470')
print('2nd place: 0.07074')
print('Our best: 0.0877')
# Each gap is reported in absolute terms and relative to the better score.
print(f'\nGap 1st to 2nd: {gap_first_second:.4f} ({gap_first_second / first_place * 100:.1f}%)')
print(f'Gap 2nd to us: {gap_second_us:.4f} ({gap_second_us / second_place * 100:.1f}%)')

# Interpretation of the gaps printed above.
print('\n'.join([
    '\n=== Key Insight ===',
    'The 1st place (0.0347) is MUCH better than 2nd (0.0707)',
    'This suggests 1st place found a fundamentally different approach',
    '2nd place is closer to our score - they likely used similar methods',
]))

In [None]:
# Ideas not yet tried that might change the CV-LB relationship.
# Each entry is (numbered heading, supporting bullet lines).
unexplored_ideas = [
    ('1. EXACT replication of top public kernels (not just inspired by)', [
        '   - We tried ens-model but got CV=0.0109, not their claimed score',
        '   - We tried best-work-here but non-linear features hurt',
    ]),
    ('2. Different target weighting', [
        '   - SM target is hardest (highest variance)',
        '   - Maybe weight SM 2-3x more in loss',
    ]),
    ('3. Per-solvent-class models', [
        '   - Alcohols, ethers, esters behave differently',
        '   - Use class-specific models',
    ]),
    ('4. Pseudo-labeling with test data', [
        '   - Use confident predictions to augment training',
        '   - Helps with distribution shift',
    ]),
    ('5. Simpler baseline models', [
        '   - Maybe complex models overfit to training distribution',
        '   - Try very simple models (linear, ridge) with good features',
    ]),
]

print('=== Unexplored Approaches ===')
for idea_heading, idea_notes in unexplored_ideas:
    # A blank line separates each idea from whatever was printed before it.
    print()
    print(idea_heading)
    for idea_note in idea_notes:
        print(idea_note)

In [None]:
# Best experiments ranked by CV. Each row is
# (experiment id, CV score, short description, LB score as text), where the
# LB text is 'pending' for experiments that have not been submitted.
print('=== Best Experiments by CV ===')
best_exps = [
    ('exp_049', 0.0081, 'CatBoost+XGBoost', 'pending'),
    ('exp_050', 0.0081, 'CatBoost+XGBoost fixed', 'pending'),
    ('exp_053', 0.0081, 'Exact template', 'pending'),
    ('exp_030', 0.0083, 'GP+MLP+LGBM', '0.0877'),
    ('exp_067', 0.0083, 'Sigmoid output', '0.0877'),
    ('exp_026', 0.0085, 'Weighted loss', '0.0887'),
]

for exp, cv, desc, lb in best_exps:
    print(f'{exp}: CV={cv:.4f}, LB={lb}, {desc}')

# Project the leaderboard score of the best not-yet-submitted experiments
# from the fitted line (`slope`, `intercept`) instead of quoting hand-typed
# coefficients, so the message always agrees with the current fit.
unscored = [(exp_id, cv_score) for exp_id, cv_score, _, lb_text in best_exps
            if lb_text == 'pending']
top_unscored_cv = min(cv_score for _, cv_score in unscored)
top_unscored_ids = [exp_id for exp_id, cv_score in unscored
                    if cv_score == top_unscored_cv]
projected_lb = slope * top_unscored_cv + intercept
# Best leaderboard score recorded in the submissions table (`df`).
best_lb = df['lb'].min()

print('\n=== Key Observation ===')
print(f"{'/'.join(top_unscored_ids)} have CV={top_unscored_cv:.4f} but no LB yet")
print(f'If the CV-LB relationship holds: LB = {slope:.2f}*{top_unscored_cv:.4f}'
      f' + {intercept:.4f} = {projected_lb:.4f}')
print(f'This would be similar to our best LB ({best_lb:.4f})')

In [None]:
# Experiments that have a CV score but no leaderboard score yet, as
# (experiment id, CV score) pairs.
print('=== Pending Submissions ===')
pending = [
    ('exp_049', 0.0081),
    ('exp_050', 0.0081),
    ('exp_052', 0.0109),
    ('exp_053', 0.0081),
    ('exp_054', 0.0085),
    ('exp_055', 0.0085),
    ('exp_057', 0.0093),
    ('exp_063', 0.0112),
    ('exp_064', 0.0092),
    ('exp_065', 0.0088),
]

# Project each pending experiment onto the fitted line (`slope`, `intercept`)
# and remember the projections so the summary below is computed, not typed.
print('Predicted LB using CV-LB relationship:')
predicted_lbs = {}
for exp, cv in pending:
    pred_lb = slope * cv + intercept
    predicted_lbs[exp] = pred_lb
    print(f'{exp}: CV={cv:.4f} -> Predicted LB={pred_lb:.4f}')

# Lowest projected LB and every experiment that attains it (equal CV scores
# give exactly equal projections, so ties are kept together).
best_pred_lb = min(predicted_lbs.values())
best_pred_exps = [exp_id for exp_id, value in predicted_lbs.items()
                  if value == best_pred_lb]
# Best leaderboard score recorded in the submissions table (`df`).
best_lb = df['lb'].min()

print(f"\nBest predicted: {'/'.join(best_pred_exps)} with LB~{best_pred_lb:.4f}")
if best_pred_lb < best_lb:
    print(f'This is only {best_lb - best_pred_lb:.4f} better than our best LB ({best_lb:.4f})')
else:
    print(f'This is no better than our best LB ({best_lb:.4f})')