# Loop 69 Analysis: CV-LB Relationship and Strategy Assessment

## Current Status
- 21 submissions made, 5 remaining
- Best LB: 0.0877 (exp_030)
- Best CV: 0.0081 (exp_049, exp_050, exp_053)
- Target: 0.0347
- Gap to target: 0.0530 (60.4% reduction needed)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Scored leaderboard history, kept as compact (experiment, CV, LB) rows.
_SCORED_RUNS = (
    ('exp_000', 0.0111, 0.0982),
    ('exp_001', 0.0123, 0.1065),
    ('exp_003', 0.0105, 0.0972),
    ('exp_005', 0.0104, 0.0969),
    ('exp_006', 0.0097, 0.0946),
    ('exp_007', 0.0093, 0.0932),
    ('exp_009', 0.0092, 0.0936),
    ('exp_012', 0.0090, 0.0913),
    ('exp_024', 0.0087, 0.0893),
    ('exp_026', 0.0085, 0.0887),
    ('exp_030', 0.0083, 0.0877),  # Best LB
)
_FIELDS = ('exp', 'cv', 'lb')

# Expand each row into the dict form ({'exp': ..., 'cv': ..., 'lb': ...}).
submissions = [dict(zip(_FIELDS, row)) for row in _SCORED_RUNS]

# One DataFrame row per scored submission, columns in _FIELDS order.
df = pd.DataFrame(list(_SCORED_RUNS), columns=list(_FIELDS))
print(f'Submissions with LB scores: {len(df)}')
print(df)

In [None]:
# Fit linear regression: LB = slope * CV + intercept
# The line is fitted on the scored submissions held in `df`.
X = df['cv'].values.reshape(-1, 1)  # single feature as an (n_samples, 1) column
y = df['lb'].values

reg = LinearRegression()
reg.fit(X, y)

slope = reg.coef_[0]
intercept = reg.intercept_
r2 = reg.score(X, y)  # R² evaluated on the same points used for fitting

# Target LB score, named once so the comparison below and the printed
# values cannot drift apart.
target = 0.0347

print(f'\nCV-LB Linear Relationship:')
print(f'LB = {slope:.4f} * CV + {intercept:.4f}')
print(f'R² = {r2:.4f}')
print(f'\nIntercept = {intercept:.4f}')
print(f'Target = {target}')
# Only raise the alarm when the fit actually supports it; the original
# printed the "Intercept > Target" claim unconditionally.
if intercept > target:
    print(f'\nCRITICAL: Intercept ({intercept:.4f}) > Target ({target})')
else:
    print(f'\nIntercept ({intercept:.4f}) <= Target ({target}): '
          f'target is reachable on the fitted line')
print(f'Even with CV=0, expected LB would be {intercept:.4f}')

In [None]:
# Invert the fitted line (LB = slope * CV + intercept) to get the CV score
# that would land exactly on the target LB.
target = 0.0347
required_cv = (target - intercept) / slope
print(f'\nRequired CV to hit target: {required_cv:.6f}')

# A negative solution is reported as impossible; otherwise show how far the
# best (lowest) CV so far would have to drop.
verdict = (
    'IMPOSSIBLE: Required CV is negative!'
    if required_cv < 0
    else f'Need to reduce CV from {df["cv"].min():.4f} to {required_cv:.4f}'
)
print(verdict)

In [None]:
# Visualize CV-LB relationship: scored submissions, the fitted line, the
# target LB level and the best submission so far, followed by a gap summary.
plt.figure(figsize=(10, 6))
plt.scatter(df['cv'], df['lb'], s=100, alpha=0.7, label='Submissions')

# Plot regression line; the range starts at CV=0 so the intercept is visible
cv_range = np.linspace(0, 0.015, 100)
lb_pred = slope * cv_range + intercept
plt.plot(cv_range, lb_pred, 'r--', label=f'LB = {slope:.2f}*CV + {intercept:.4f} (R²={r2:.3f})')

# Mark target
plt.axhline(y=target, color='g', linestyle=':', linewidth=2, label=f'Target: {target}')

# Mark best LB (the row with the lowest LB score)
best_idx = df['lb'].idxmin()
best_lb = df['lb'].min()  # computed once and reused for the marker and the summary
plt.scatter(df.loc[best_idx, 'cv'], df.loc[best_idx, 'lb'], s=200, c='red', marker='*',
            label=f'Best LB: {best_lb:.4f}')

plt.xlabel('CV Score (MSE)')
plt.ylabel('LB Score (MSE)')
# Take the count from the data so the title stays correct when rows are added
plt.title(f'CV vs LB Relationship - All {len(df)} Submissions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('/home/code/exploration/cv_lb_relationship_loop69.png', dpi=150)
plt.show()

# Absolute and relative distance between the best LB so far and the target
gap = best_lb - target
print(f'\nGap Analysis:')
print(f'Best LB: {best_lb:.4f}')
print(f'Target: {target}')
print(f'Gap: {gap:.4f}')
print(f'Reduction needed: {gap / best_lb * 100:.1f}%')

In [None]:
# Analyze pending submissions: experiments listed here with a CV score only.
# Each CV is projected through the fitted line to estimate its LB score.
pending = [
    {'exp': 'exp_049', 'cv': 0.0081},
    {'exp': 'exp_050', 'cv': 0.0081},
    {'exp': 'exp_052', 'cv': 0.0109},
    {'exp': 'exp_053', 'cv': 0.0081},
    {'exp': 'exp_054', 'cv': 0.0085},
    {'exp': 'exp_055', 'cv': 0.0085},
    {'exp': 'exp_057', 'cv': 0.0093},
    {'exp': 'exp_063', 'cv': 0.0112},
    {'exp': 'exp_064', 'cv': 0.0092},
]

print('\nPending Submissions - Predicted LB:')
for p in pending:
    pred_lb = slope * p['cv'] + intercept
    print(f"{p['exp']}: CV={p['cv']:.4f} -> Predicted LB={pred_lb:.4f}")

# Derive the best (lowest) pending CV from the list instead of hard-coding
# it, so the summary stays correct when `pending` is edited.
best_pending_cv = min(entry['cv'] for entry in pending)
print(f'\nBest pending CV: {best_pending_cv:.4f}')
print(f'Predicted LB for CV={best_pending_cv:.4f}: {slope * best_pending_cv + intercept:.4f}')

In [None]:
# Current experiment (exp_065): project its CV score through the fitted
# CV -> LB line and report the estimate.
exp_065_cv = 0.008702
pred_lb_065 = slope * exp_065_cv + intercept

# NOTE(review): the exp_030 figures in the last line are hard-coded; among the
# scored submissions exp_024 (CV=0.0087) has the closest CV - confirm which
# comparison is intended.
report_lines = (
    f'\nExp 065 (Clean Submission):',
    f'CV: {exp_065_cv:.6f}',
    f'Predicted LB: {pred_lb_065:.4f}',
    f'\nThis is similar to exp_030 (CV=0.0083, LB=0.0877)',
)
print(*report_lines, sep='\n')

In [None]:
# Key insight: The intercept problem
# The narrative below only holds when the fitted intercept lies above the
# target, so it is guarded by that check; the original printed it
# unconditionally.
if intercept > target:
    print('='*60)
    print('CRITICAL INSIGHT: THE INTERCEPT PROBLEM')
    print('='*60)
    print(f'\nThe CV-LB relationship has an intercept of {intercept:.4f}')
    print(f'This means even with perfect CV=0, expected LB = {intercept:.4f}')
    print(f'\nThe target is {target}, which is BELOW the intercept!')
    print(f'\nThis suggests STRUCTURAL DISTRIBUTION SHIFT:')
    print('- Test solvents are fundamentally different from training solvents')
    print('- Standard ML approaches cannot fix this extrapolation error')
    print('- Need distribution-shift-aware strategies')
    print('\nPossible approaches to REDUCE THE INTERCEPT:')
    print('1. Extrapolation detection + conservative predictions')
    print('2. Uncertainty-weighted predictions (GP variance)')
    print('3. Physics-informed constraints that generalize')
    print('4. Pseudo-labeling with test data')
    print('5. Study what top public kernels do differently')
else:
    # The fitted line reaches the target at a non-negative CV, so the
    # premise of the intercept problem does not apply.
    print(f'\nThe CV-LB relationship has an intercept of {intercept:.4f}')
    print(f'The target is {target}, which is NOT below the intercept; '
          f'the intercept problem does not apply to this fit.')