# Loop 10 LB Feedback Analysis

## Critical Finding: exp_009 ([16]) LB is WORSE than exp_007 ([32,16])!

| Experiment | CV Score | LB Score | CV-LB Gap |
|------------|----------|----------|----------|
| exp_007 ([32,16]) | 0.009262 | 0.0932 | -0.0839 |
| exp_009 ([16]) | 0.009192 | 0.0936 | -0.0844 |

**Despite better CV, [16] has WORSE LB!**

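This confirms the evaluator's concern about the increasing LB/CV ratio: using the CV scores in the table above, it rose from about 10.06x for exp_007 to about 10.18x for exp_009.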

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Complete submission history: one record per leaderboard submission, holding
# the experiment id, its CV score, its LB score, and the architecture label.
submissions = [
    dict(exp='exp_000', cv=0.0111, lb=0.0982, arch='[128,128,64]'),
    dict(exp='exp_001', cv=0.0123, lb=0.1065, arch='LightGBM'),
    dict(exp='exp_003', cv=0.0105, lb=0.0972, arch='[256,128,64]'),
    dict(exp='exp_005', cv=0.0104, lb=0.0969, arch='[256,128,64] 15-bag'),
    dict(exp='exp_006', cv=0.0097, lb=0.0946, arch='[64,32]'),
    dict(exp='exp_007', cv=0.0093, lb=0.0932, arch='[32,16]'),
    dict(exp='exp_009', cv=0.0092, lb=0.0936, arch='[16]'),  # newest submission
]

# Derived columns: `ratio` is LB divided by CV, `gap` is LB minus CV.
df = pd.DataFrame(submissions).assign(
    ratio=lambda frame: frame['lb'] / frame['cv'],
    gap=lambda frame: frame['lb'] - frame['cv'],
)

print('=== Complete Submission History ===')
print(df.to_string())

In [None]:
# Quantify how closely LB follows CV across all submissions in df, including
# the newest data point.
print('\n=== CV-LB Relationship Analysis (Updated) ===')
cv_lb_corr = df['cv'].corr(df['lb'])
print(f'\nCorrelation between CV and LB: {cv_lb_corr:.4f}')

# Least-squares line LB = slope * CV + intercept. The result fields are bound
# to the same five names a tuple-unpack would produce, because the plotting
# cell later reads slope, intercept and r_value.
fit = stats.linregress(df['cv'], df['lb'])
slope = fit.slope
intercept = fit.intercept
r_value = fit.rvalue
p_value = fit.pvalue
std_err = fit.stderr
print(f'\nLinear fit: LB = {slope:.4f} * CV + {intercept:.4f}')
print(f'R-squared: {r_value**2:.4f}')

# Residual = actual LB minus the LB the fitted line predicts from CV; a
# positive residual means the submission scored a higher LB than predicted.
df['predicted_lb'] = df['cv'] * slope + intercept
df['residual'] = df['lb'].sub(df['predicted_lb'])
print('\n=== Residual Analysis ===')
residual_columns = ['exp', 'arch', 'cv', 'lb', 'predicted_lb', 'residual']
print(df[residual_columns].to_string())

In [None]:
# Key insight: compare the [16] model with the [32,16] model.
#
# All numbers are read from df instead of being retyped as literals, so the
# printed comparison cannot drift from the submission table, and each
# conclusion is printed only when the computed values actually support it.
# Requires df to already carry the 'residual' column (actual LB minus the LB
# predicted by the linear fit).
by_arch = df.set_index('arch')
narrow = by_arch.loc['[16]']
wide = by_arch.loc['[32,16]']

print('\n=== KEY INSIGHT ===')
if narrow['residual'] > 0:
    # Positive residual: the actual LB is above the regression prediction.
    print('The [16] model has a POSITIVE residual, meaning it performed WORSE on LB than predicted.')
    print('This suggests the model is overfitting to the CV structure.')
else:
    print('The [16] model has a NON-POSITIVE residual, meaning it did not perform worse on LB than predicted.')

# Compare [32,16] vs [16]
wide_cv, wide_lb = wide['cv'], wide['lb']
narrow_cv, narrow_lb = narrow['cv'], narrow['lb']
print('\n=== [32,16] vs [16] Comparison ===')
print(f'[32,16]: CV {wide_cv:.4f}, LB {wide_lb:.4f}, Ratio {wide_lb / wide_cv:.2f}x')
print(f'[16]:    CV {narrow_cv:.4f}, LB {narrow_lb:.4f}, Ratio {narrow_lb / narrow_cv:.2f}x')

# Both changes are relative to [32,16], in percent. Lower is better for both
# scores, so a positive CV improvement together with a positive LB degradation
# is the "better CV, worse LB" pattern.
cv_improvement_pct = (wide_cv - narrow_cv) / wide_cv * 100
lb_degradation_pct = (narrow_lb - wide_lb) / wide_lb * 100
print(f'\nCV improvement: {cv_improvement_pct:.2f}%')
print(f'LB degradation: {lb_degradation_pct:.2f}%')
if cv_improvement_pct > 0 and lb_degradation_pct > 0:
    print('\n[16] is OVERFITTING to CV! The simpler model generalizes worse.')
else:
    print('\n[16] does not show the better-CV / worse-LB pattern on these numbers.')

In [None]:
# Plot the updated CV-LB relationship.
# Left panel: CV vs LB scatter with the fitted regression line.
# Right panel: LB/CV ratio per submission, in table order.
# Requires df, slope, intercept and r_value from the earlier cells.
plt.figure(figsize=(12, 6))

# --- Left panel: scatter plot ---
plt.subplot(1, 2, 1)
plt.scatter(df['cv'], df['lb'], s=100, c='blue', alpha=0.7)

# Highlight [16] as problematic. The coordinates are looked up in df so the
# marker always sits on the data point it is meant to flag.
flagged = df[df['arch'] == '[16]']
plt.scatter(flagged['cv'], flagged['lb'], s=200, c='red', marker='x', linewidths=3, label='[16] - OVERFITTING')

# Label every point with its architecture.
for row in df.itertuples(index=False):
    plt.annotate(row.arch, (row.cv, row.lb), textcoords='offset points', xytext=(5, 5), fontsize=8)

# Add regression line. The legend uses ASCII-only mathtext for R squared, so
# the label renders correctly regardless of the file's encoding.
x_line = np.linspace(0.008, 0.013, 100)
y_line = slope * x_line + intercept
plt.plot(x_line, y_line, 'r--', label=f'Linear fit ($R^2$={r_value**2:.3f})')

plt.xlabel('CV Score')
plt.ylabel('LB Score')
plt.title('CV vs LB: [16] is Overfitting!')
plt.legend()
plt.grid(True, alpha=0.3)

# --- Right panel: ratio plot ---
plt.subplot(1, 2, 2)
mean_ratio = df['ratio'].mean()
plt.plot(range(len(df)), df['ratio'], 'bo-', markersize=8)
plt.axhline(y=mean_ratio, color='r', linestyle='--', label=f'Mean: {mean_ratio:.2f}x')
# Annotate by position (enumerate) rather than by index label, because the
# line above is drawn against range(len(df)).
for position, row in enumerate(df.itertuples(index=False)):
    plt.annotate(row.arch, (position, row.ratio), textcoords='offset points', xytext=(0, 5), fontsize=8, rotation=45)
plt.xlabel('Submission Order')
plt.ylabel('LB/CV Ratio')
plt.title('CV-LB Ratio Over Time')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/home/code/exploration/loop10_lb_feedback.png', dpi=100, bbox_inches='tight')
plt.show()
print('Plot saved.')

In [None]:
# Strategic implications: print a banner followed by numbered sections, each
# a heading line and its indented bullet lines.
implications = {
    '\n1. BEST LB MODEL: exp_007 ([32,16]) with LB 0.0932': (
        '   - NOT exp_009 ([16]) despite better CV',
    ),
    '\n2. OVERFITTING CONFIRMED:': (
        '   - [16] has best CV (0.0092) but worse LB (0.0936)',
        '   - The simplification went TOO FAR',
        '   - [32,16] is the optimal architecture for generalization',
    ),
    '\n3. CV-LB RELATIONSHIP:': (
        '   - Correlation is still strong (0.97) but not perfect',
        '   - The [16] model is an outlier - overfitting to CV structure',
        '   - Cannot blindly trust CV improvements',
    ),
    '\n4. TARGET ASSESSMENT:': (
        '   - Target: 0.0333',
        '   - Best LB: 0.0932 (exp_007)',
        '   - Gap: 180% worse than target',
        '   - UNREACHABLE with current approach',
    ),
    '\n5. NEXT STEPS (No submissions remaining today):': (
        '   - Focus on fundamentally different approaches',
        '   - Consider: GNN, attention mechanisms, different features',
        '   - Ensemble diverse models (not just MLP variations)',
    ),
}

print('\n' + '='*60)
print('STRATEGIC IMPLICATIONS')
print('='*60)

# Each heading carries its own leading newline, which produces the blank line
# between sections.
for heading, bullets in implications.items():
    print(heading)
    for bullet in bullets:
        print(bullet)

In [None]:
# What could improve LB? Print a banner and five numbered idea lists. The
# lines are collected first and emitted with a single print; joining on '\n'
# yields the same output as printing each line separately.
improvement_lines = [
    '\n' + '='*60,
    'POTENTIAL IMPROVEMENTS FOR TOMORROW',
    '='*60,

    '\n1. ENSEMBLE DIVERSE MODELS:',
    '   - Combine [32,16] MLP + LightGBM + Ridge',
    '   - Different models capture different patterns',
    '   - May reduce variance on LB',

    '\n2. FEATURE ENGINEERING:',
    '   - Current: Spange (13) + DRFP (122) + Arrhenius (5) = 140 features',
    '   - Try: Different DRFP filtering, interaction features',
    '   - Try: Solvent-specific features (Hansen parameters, etc.)',

    '\n3. REGULARIZATION TUNING:',
    '   - [32,16] with more dropout/weight decay',
    '   - May improve generalization',

    '\n4. DIFFERENT CV SCHEME:',
    '   - Current: Leave-one-solvent-out',
    '   - Try: Stratified by solvent type',
    '   - May better match LB distribution',

    '\n5. ACCEPT LIMITATIONS:',
    '   - Target (0.0333) requires GNN/attention',
    '   - Best achievable with MLP: ~0.09 LB',
    '   - Focus on maximizing relative ranking',
]
print('\n'.join(improvement_lines))