# Loop 64 Analysis: Understanding the CV-LB Gap

## Key Insight
The CV-LB relationship is: LB = 4.21 × CV + 0.0535 (R² = 0.98)

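The intercept (0.0535) is HIGHER than the target (0.0347). Because the slope is positive, the fitted LB can never fall below the intercept for any CV ≥ 0. This means:
- Even with CV = 0, the predicted LB would be 0.0535 > target
- The current approach CANNOT reach the target by minimizing CV alone, as long as this linear relationship holds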

## Questions to Investigate
1. What is the distribution of errors across different solvents?
2. Are there specific solvents that contribute more to the error?
3. Is there a pattern in which solvents are harder to predict?
4. Can we identify features that correlate with prediction difficulty?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern, WhiteKernel, ConstantKernel
import warnings
warnings.filterwarnings('ignore')

# Root directory holding the CSV inputs read by this notebook.
DATA_PATH = '/home/data'

# Read the three input tables; the descriptor lookup uses its first column
# as the index.
df_single = pd.read_csv(DATA_PATH + '/catechol_single_solvent_yields.csv')
df_full = pd.read_csv(DATA_PATH + '/catechol_full_data_yields.csv')
spange_df = pd.read_csv(
    DATA_PATH + '/spange_descriptors_lookup.csv',
    index_col=0,
)

# Report the (rows, columns) shape of each table.
for label, frame in (
    ('Single solvent data', df_single),
    ('Full data', df_full),
    ('Spange descriptors', spange_df),
):
    print(f'{label}: {frame.shape}')

# List the distinct solvent names present in the single-solvent table.
solvent_names = df_single["SOLVENT NAME"]
print(f'\nSolvents in single: {solvent_names.nunique()}')
print(f'Unique solvents: {sorted(solvent_names.unique())}')

In [None]:
# Summary statistics of each target column over the whole single-solvent
# table, followed by the per-solvent mean of each target.
print('=== Target Distribution ===')
for col in ['Product 2', 'Product 3', 'SM']:
    values = df_single[col]
    summary = (
        f'mean={values.mean():.4f}, '
        f'std={values.std():.4f}, '
        f'min={values.min():.4f}, '
        f'max={values.max():.4f}'
    )
    print(f'{col}: {summary}')

print('\n=== Per-Solvent Target Means ===')
# One row per solvent name, one column per target.
by_solvent = df_single.groupby('SOLVENT NAME')
solvent_means = by_solvent[['Product 2', 'Product 3', 'SM']].mean()
print(solvent_means.round(4))

In [None]:
# Variance of each target within every solvent, then the five solvents with
# the largest variance for each target.
print('=== Per-Solvent Target Variance ===')
grouped_targets = df_single.groupby('SOLVENT NAME')[['Product 2', 'Product 3', 'SM']]
solvent_vars = grouped_targets.var()
print(solvent_vars.round(6))

print('\n=== Solvents with Highest Variance ===')
for col in ['Product 2', 'Product 3', 'SM']:
    noisiest = solvent_vars[col].nlargest(5)
    names = list(noisiest.index)
    print(f'{col}: {names}')

In [None]:
# Summarise the Spange descriptor table, then rank descriptors by the absolute
# correlation between each descriptor and each target's per-solvent mean.
TARGET_COLS = ['Product 2', 'Product 3', 'SM']

print('=== Spange Descriptors ===')
print(spange_df.describe().round(4))

print('\n=== Descriptor Correlations with Target Means ===')
# Index-aligned join of the per-solvent target means with the descriptors.
solvent_means_with_spange = solvent_means.join(spange_df)

# Restrict to numeric columns: DataFrame.corr() on pandas >= 2.0 raises if any
# column is non-numeric, whereas older versions silently dropped such columns.
numeric_view = solvent_means_with_spange.select_dtypes(include='number')

# The correlation matrix is the same for every target, so compute it once
# instead of once per loop iteration.
corr_matrix = numeric_view.corr()

for col in TARGET_COLS:
    # Correlation of this target with every descriptor (targets removed so a
    # target never ranks against itself or its sibling targets).
    correlations = corr_matrix[col].drop(TARGET_COLS)
    top_corr = correlations.abs().nlargest(5)
    print(f'\n{col}:')
    for desc in top_corr.index:
        # Rank by magnitude, but report the signed correlation.
        print(f'  {desc}: {correlations[desc]:.4f}')

In [None]:
# Analyze the CV-LB relationship more carefully: load the submission history,
# keep the submissions that have both scores, and fit LB as a linear function
# of CV.
import json
from scipy import stats

# Leaderboard score this analysis is trying to reach.
TARGET_LB = 0.0347

# Load submission history
with open('/home/code/session_state.json', encoding='utf-8') as f:
    state = json.load(f)

# `or []` also covers a state file where "submissions" is present but null.
submissions = state.get('submissions') or []

# Filter once so the three parallel lists are guaranteed to stay aligned.
# Entries whose cv_score or lb_score is missing or falsy (None, 0) are skipped.
scored = [s for s in submissions if s.get('cv_score') and s.get('lb_score')]
cvs = [s['cv_score'] for s in scored]
lbs = [s['lb_score'] for s in scored]
exp_ids = [s['experiment_id'] for s in scored]

print('=== CV-LB Relationship ===')
for exp_id, cv, lb in zip(exp_ids, cvs, lbs):
    # Guard the division; a non-positive CV is reported with ratio 0.
    ratio = lb / cv if cv > 0 else 0
    print(f'{exp_id}: CV={cv:.6f}, LB={lb:.5f}, LB/CV={ratio:.2f}')

# Linear regression
if len(cvs) < 2:
    # A line cannot be fitted through fewer than two points; fail with a
    # message that names the actual cause.
    raise ValueError(
        f'Need at least 2 submissions with both cv_score and lb_score '
        f'to fit the CV-LB line, found {len(cvs)}'
    )
slope, intercept, r_value, p_value, std_err = stats.linregress(cvs, lbs)
print(f'\nLinear fit: LB = {slope:.2f} * CV + {intercept:.4f}')
print(f'R² = {r_value**2:.4f}')
print(f'Intercept = {intercept:.4f}')
print(f'Target = {TARGET_LB}')
print(f'Gap = {intercept - TARGET_LB:.4f}')

In [None]:
# Scatter plot of LB against CV for every scored submission, with the fitted
# line and horizontal reference lines for the target and the fitted intercept.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(cvs, lbs, s=100, alpha=0.7)

# Label each point with its experiment id.
for exp_id, cv, lb in zip(exp_ids, cvs, lbs):
    ax.annotate(exp_id, (cv, lb), fontsize=8)

# Fitted line, drawn over the observed CV range only.
x_line = np.linspace(min(cvs), max(cvs), 100)
y_line = slope * x_line + intercept
fit_label = f'LB = {slope:.2f}*CV + {intercept:.4f}'
ax.plot(x_line, y_line, 'r--', label=fit_label)

# Horizontal reference lines: the target score and the fitted intercept.
ax.axhline(y=0.0347, color='g', linestyle=':', label='Target (0.0347)')
ax.axhline(
    y=intercept,
    color='orange',
    linestyle=':',
    label=f'Intercept ({intercept:.4f})',
)

ax.set_xlabel('CV Score')
ax.set_ylabel('LB Score')
ax.set_title('CV vs LB Relationship')
ax.legend()
ax.grid(True, alpha=0.3)

out_path = '/home/code/exploration/cv_lb_relationship.png'
fig.savefig(out_path, dpi=150, bbox_inches='tight')
plt.show()
print(f'Saved to {out_path}')

In [None]:
# Print an interpretation of the fitted intercept relative to the target.
# The report is assembled as a sequence of lines and emitted in order; an
# empty string produces a blank line.
intercept_report = (
    '=== Intercept Analysis ===',
    f'Intercept: {intercept:.4f}',
    'Target: 0.0347',
    f'Gap: {intercept - 0.0347:.4f}',
    '',
    'The intercept represents the "baseline" LB error that exists even with perfect CV.',
    'This suggests there is a SYSTEMATIC BIAS in how the model generalizes to the test set.',
    '',
    'Possible causes:',
    '1. The test set contains solvents that are fundamentally different from training solvents',
    '2. The model is overconfident on OOD samples',
    '3. There is a distribution shift between CV and LB evaluation',
    '',
    'To reach the target (0.0347), we need to either:',
    '1. Find an approach that reduces the intercept (changes the CV-LB relationship)',
    '2. Find an approach with a different CV-LB relationship',
)
for report_line in intercept_report:
    print(report_line)

In [None]:
# Residual of each submission from the fitted CV-LB line
# (observed LB minus the LB predicted from CV).
predicted_lbs = slope * np.array(cvs) + intercept
residuals = np.array(lbs) - predicted_lbs

print('=== Residuals from CV-LB Fit ===')
for exp_id, cv, lb, res in zip(exp_ids, cvs, lbs, residuals):
    print(f'{exp_id}: CV={cv:.6f}, LB={lb:.5f}, Residual={res:+.5f}')

# Aggregate statistics, plus the experiments at both extremes.
highest_idx = np.argmax(residuals)
lowest_idx = np.argmin(residuals)
print(f'\nMean residual: {residuals.mean():.6f}')
print(f'Std residual: {residuals.std():.6f}')
print(f'Max positive residual: {residuals.max():.6f} ({exp_ids[highest_idx]})')
print(f'Max negative residual: {residuals.min():.6f} ({exp_ids[lowest_idx]})')

In [None]:
# Flag experiments whose LB score sits far from the fitted CV-LB line.
print('=== Experiments with Unusual CV-LB Relationship ===')

# An experiment is flagged when its absolute residual exceeds 0.005
# (in the same units as the LB score).
outliers = [
    (exp_id, cv, lb, res)
    for exp_id, cv, lb, res in zip(exp_ids, cvs, lbs, residuals)
    if abs(res) > 0.005
]
for exp_id, cv, lb, res in outliers:
    print(f'{exp_id}: CV={cv:.6f}, LB={lb:.5f}, Residual={res:+.5f} (UNUSUAL)')

print('\nNote: Experiments with negative residuals performed BETTER on LB than expected from CV.')
print('These might have approaches that generalize better to OOD data.')

In [None]:
# Summary and recommendations.
# The conclusion is derived from the fitted intercept rather than a hard-coded
# value, so the printed text stays correct if the submission history (and
# therefore the fit) changes.
TARGET_LB = 0.0347  # leaderboard score this analysis is trying to reach

best_cv_idx = np.argmin(cvs)
best_lb_idx = np.argmin(lbs)
intercept_above_target = intercept > TARGET_LB

print('=== SUMMARY ===')
print(f'Best CV: {cvs[best_cv_idx]:.6f} ({exp_ids[best_cv_idx]})')
print(f'Best LB: {lbs[best_lb_idx]:.5f} ({exp_ids[best_lb_idx]})')
print(f'CV-LB relationship: LB = {slope:.2f} * CV + {intercept:.4f}')
relation = '>' if intercept_above_target else '<='
print(f'Intercept: {intercept:.4f} {relation} Target: {TARGET_LB}')
print()
print('=== KEY INSIGHT ===')
if intercept_above_target:
    print(f'The intercept ({intercept:.4f}) is HIGHER than the target ({TARGET_LB}).')
    print('This means the current approach CANNOT reach the target by minimizing CV alone.')
    print('We need an approach that CHANGES the CV-LB relationship itself.')
else:
    # The fit no longer supports the "cannot reach the target" conclusion.
    print(f'The intercept ({intercept:.4f}) is NOT higher than the target ({TARGET_LB}).')
    print('The linear fit does not rule out reaching the target by minimizing CV.')
print()
print('=== RECOMMENDATIONS ===')
print('1. Try approaches that specifically improve OOD generalization')
print('2. Try approaches that reduce model confidence on uncertain predictions')
print('3. Try approaches that leverage domain knowledge (solvent similarity, mass balance)')
print('4. Consider that the test set might have fundamentally different solvents')