# Loop 91 Analysis: Understanding the CV-LB Gap

**Critical Problem**: The fitted CV-LB relationship is LB = 4.29 × CV + 0.0528 (R² = 0.95), computed on the 12 submissions that remain after excluding the exp_073 outlier.

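The intercept (0.0528) is ABOVE the target (0.0347): even a perfect CV of 0 would map to a predicted LB of about 0.0528, so as long as this linear relationship holds, no amount of CV improvement alone can reach the target.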

**Key Questions**:
1. What makes test solvents different from training solvents?
2. Can we identify which solvents are "harder" to predict?
3. What approaches might change the CV-LB relationship?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Submission history, kept inline: one (experiment id, CV value, LB value)
# row per submission that has an LB result.
_history_rows = (
    ('exp_000', 0.0111, 0.0982),
    ('exp_001', 0.0123, 0.1065),
    ('exp_003', 0.0105, 0.0972),
    ('exp_005', 0.0104, 0.0969),
    ('exp_006', 0.0097, 0.0946),
    ('exp_007', 0.0093, 0.0932),
    ('exp_009', 0.0092, 0.0936),
    ('exp_012', 0.0090, 0.0913),
    ('exp_024', 0.0087, 0.0893),
    ('exp_026', 0.0085, 0.0887),
    ('exp_030', 0.0083, 0.0877),
    ('exp_035', 0.0098, 0.0970),
    ('exp_073', 0.0084, 0.1451),  # Outlier!
)

# Expand each row into a dict keyed by 'exp' / 'cv' / 'lb'.
submissions = [dict(zip(('exp', 'cv', 'lb'), row)) for row in _history_rows]

df = pd.DataFrame(submissions)
print(f'Total submissions with LB: {df.shape[0]}')
print(df)

In [None]:
# Analyze CV-LB relationship (excluding outlier exp_073)
TARGET_LB = 0.0347         # LB value the analysis is trying to reach
OUTLIER_LB_CUTOFF = 0.12   # rows with LB at or above this are left out of the fit

# With the current history only exp_073 (LB 0.1451) is at or above the cut-off.
df_clean = df[df['lb'] < OUTLIER_LB_CUTOFF]

# Least-squares line LB = slope * CV + intercept. slope, intercept and
# r_value are read again by the plotting and summary cells.
slope, intercept, r_value, p_value, std_err = stats.linregress(df_clean['cv'], df_clean['lb'])

print('\n=== CV-LB Relationship Analysis ===')
print(f'Linear fit: LB = {slope:.4f} × CV + {intercept:.4f}')
print(f'R² = {r_value**2:.4f}')
print(f'\nIntercept: {intercept:.4f}')
print(f'Target LB: {TARGET_LB}')

# Only report the intercept problem when the fitted numbers actually show it,
# so the message cannot contradict the values printed above.
if intercept > TARGET_LB:
    print(f'\nCRITICAL: Intercept ({intercept:.4f}) > Target ({TARGET_LB})')
    print(f'This means even with CV=0, expected LB would be {intercept:.4f}')
else:
    print(f'\nIntercept ({intercept:.4f}) <= Target ({TARGET_LB})')

# What CV would be needed to hit target? (solve TARGET_LB = slope * CV + intercept)
required_cv = (TARGET_LB - intercept) / slope
print(f'\nRequired CV to hit target: {required_cv:.6f}')
if required_cv < 0:
    print('IMPOSSIBLE: Would require negative CV!')

In [None]:
# Plot CV vs LB
fig, ax = plt.subplots(figsize=(10, 6))

# Submissions that went into the fit (the outlier is drawn separately below)
ax.scatter(df_clean['cv'], df_clean['lb'], s=100, alpha=0.7, label='Submissions')

# Rows at or above the LB cut-off, marked with a red cross
outlier = df[df['lb'] >= 0.12]
if not outlier.empty:
    ax.scatter(outlier['cv'], outlier['lb'], s=100, c='red', marker='x', label='Outlier (exp_073)')

# Fitted line evaluated from CV = 0 up to 0.015
cv_range = np.linspace(0, 0.015, 100)
lb_pred = slope * cv_range + intercept
ax.plot(cv_range, lb_pred, 'b--', label=f'LB = {slope:.2f}×CV + {intercept:.4f}')

# Horizontal reference lines: the target LB and the fitted intercept
ax.axhline(y=0.0347, color='g', linestyle=':', linewidth=2, label='Target LB = 0.0347')
ax.axhline(y=intercept, color='r', linestyle=':', linewidth=2, label=f'Intercept = {intercept:.4f}')

ax.set_xlabel('CV Score')
ax.set_ylabel('LB Score')
ax.set_title('CV vs LB Relationship - The Intercept Problem')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('/home/code/exploration/cv_lb_analysis_loop91.png', dpi=150)
plt.show()

print('\nThe gap between intercept and target is the STRUCTURAL DISTRIBUTION SHIFT')

In [None]:
# Load the two yield tables and list the solvent names each one contains
DATA_PATH = '/home/data'

df_single = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
df_full = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')

# Solvent-name columns, pulled out once so the prints below stay short
single_names = df_single["SOLVENT NAME"]
mixture_names_a = df_full["SOLVENT A NAME"]
mixture_names_b = df_full["SOLVENT B NAME"]

print('=== Single Solvent Data ===')
print(f'Samples: {len(df_single)}')
print(f'Unique solvents: {single_names.nunique()}')
print(f'\nSolvents: {sorted(single_names.unique())}')

print('\n=== Full Data (Mixtures) ===')
print(f'Samples: {len(df_full)}')
print(f'Unique solvent A: {mixture_names_a.nunique()}')
print(f'Unique solvent B: {mixture_names_b.nunique()}')
print(f'\nSolvent A: {sorted(mixture_names_a.unique())}')
print(f'Solvent B: {sorted(mixture_names_b.unique())}')

In [None]:
# Compare the solvent names in the yield tables with the Spange descriptor table
# The first CSV column is used as the index of the lookup table.
spange = pd.read_csv(f'{DATA_PATH}/spange_descriptors_lookup.csv', index_col=0)
spange_index_names = spange.index.tolist()
print('=== Spange Descriptors ===')
print(f'Shape: {spange.shape}')
print(f'\nSolvents in spange: {sorted(spange_index_names)}')

# Names seen in each solvent column, then their union across both tables
single_solvents = set(df_single['SOLVENT NAME'].unique())
full_solvents_a = set(df_full['SOLVENT A NAME'].unique())
full_solvents_b = set(df_full['SOLVENT B NAME'].unique())
all_solvents = single_solvents.union(full_solvents_a).union(full_solvents_b)
spange_solvents = set(spange_index_names)

# Names present on one side only
only_in_spange = spange_solvents - all_solvents
only_in_data = all_solvents - spange_solvents

print(f'\nSolvents in data: {len(all_solvents)}')
print(f'Solvents in spange: {len(spange_solvents)}')
print(f'Solvents in spange but not in data: {only_in_spange}')
print(f'Solvents in data but not in spange: {only_in_data}')

In [None]:
# Per-solvent mean and standard deviation of the three measured columns
print('=== Per-Solvent Yield Statistics ===')

# Named aggregation yields the flat P2_/P3_/SM_ column names directly;
# values are rounded to 4 decimals before any further arithmetic.
solvent_stats = df_single.groupby('SOLVENT NAME').agg(
    P2_mean=('Product 2', 'mean'),
    P2_std=('Product 2', 'std'),
    P3_mean=('Product 3', 'mean'),
    P3_std=('Product 3', 'std'),
    SM_mean=('SM', 'mean'),
    SM_std=('SM', 'std'),
).round(4)

# Root-sum-of-squares of the three (rounded) standard deviations
squared_std_sum = (
    solvent_stats['P2_std'].pow(2)
    + solvent_stats['P3_std'].pow(2)
    + solvent_stats['SM_std'].pow(2)
)
solvent_stats['total_std'] = np.sqrt(squared_std_sum)

ranked_by_std = solvent_stats.sort_values('total_std', ascending=False)
print(ranked_by_std)

print('\n=== Solvents with Highest Variability (Hardest to Predict) ===')
top_columns = ['P2_mean', 'P3_mean', 'SM_mean', 'total_std']
print(solvent_stats.nlargest(5, 'total_std')[top_columns])

In [None]:
# Key insight: The benchmark paper achieved MSE 0.0039
# Our best CV is 0.0081, best LB is 0.0877
# The gap is HUGE

# Fixed report text; entries starting with '\n' open a new section.
gap_report_lines = [
    '=== Performance Gap Analysis ===',
    'Benchmark paper MSE: 0.0039',
    'Our best CV: 0.0081 (2.1x worse)',
    'Our best LB: 0.0877 (22.5x worse than benchmark!)',
    'Target LB: 0.0347 (8.9x worse than benchmark)',
    '\n=== What the benchmark paper likely did differently ===',
    '1. Pre-training on large molecular datasets',
    '2. Graph Neural Networks with attention mechanisms',
    '3. Domain-specific constraints that generalize',
    '4. Different validation strategy (not leave-one-out)',
    '\n=== What we MUST try ===',
    '1. Pre-trained molecular embeddings (ChemBERTa, MolBERT)',
    '2. GNN with proper message passing',
    '3. Extrapolation detection + conservative predictions',
    '4. Pseudo-labeling with confident predictions',
]
print('\n'.join(gap_report_lines))

In [None]:
# Notes on the mixall kernel approach
# It uses GroupKFold instead of Leave-One-Out
# This might explain why it has good CV/LB correlation

# Fixed note text, printed one entry per line; a leading '\n' adds a blank line.
mixall_notes = (
    '=== Key Insight from mixall Kernel ===',
    'The mixall kernel uses GroupKFold (5-fold) instead of Leave-One-Out',
    'This means each fold has ~20% of solvents held out',
    '\nThis is DIFFERENT from the official evaluation which uses Leave-One-Out',
    '\nHowever, the kernel claims "good CV/LB" correlation',
    '\nPossible explanation:',
    '- GroupKFold with 5 folds is more robust than LOO',
    '- LOO has high variance due to single-solvent test sets',
    '- The official evaluation might use a different split',
    '\n=== Approaches to try ===',
    '1. Ensemble of MLP + XGBoost + RF + LightGBM (from mixall)',
    '2. Use GroupKFold for hyperparameter tuning, LOO for final CV',
    '3. Focus on reducing variance, not just bias',
)
for note in mixall_notes:
    print(note)

In [None]:
# Summary of findings, assembled once and printed line by line
banner = '=' * 70

summary_lines = [
    banner,
    'LOOP 91 ANALYSIS SUMMARY',
    banner,
    # Section 1 interpolates the fitted slope, intercept and R² from the regression cell
    '\n1. CV-LB RELATIONSHIP:',
    f'   LB = {slope:.2f} × CV + {intercept:.4f} (R² = {r_value**2:.4f})',
    f'   Intercept ({intercept:.4f}) > Target (0.0347)',
    '   IMPOSSIBLE to reach target by improving CV alone!',
    '\n2. DISTRIBUTION SHIFT:',
    '   - Test solvents are systematically "harder" than training',
    '   - The intercept represents structural extrapolation error',
    '   - All tabular models fall on the SAME CV-LB line',
    '\n3. WHAT MUST CHANGE:',
    '   - Need approaches that REDUCE THE INTERCEPT',
    '   - Not just improve CV (which moves along the line)',
    '   - Focus on extrapolation detection and conservative predictions',
    '\n4. PROMISING APPROACHES:',
    '   a) Ensemble from mixall kernel (MLP+XGB+RF+LGBM)',
    '   b) Extrapolation detection with conservative blending',
    '   c) Pre-trained molecular embeddings',
    '   d) Domain constraints (yields must be non-negative, sum ≤ 1)',
    '\n5. REMAINING SUBMISSIONS: 4',
    '   - Must be strategic about what to submit',
    '   - Only submit if approach is fundamentally different',
    banner,
]
print(*summary_lines, sep='\n')