# Loop 63 Analysis: Critical Assessment

## Key Questions:
1. What is the true CV-LB relationship?
2. What approaches haven't been tried?
3. Is there a fundamentally different approach that could work?

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Leaderboard submission history as (experiment id, CV score, LB score).
submissions = [
    ('exp_000', 0.0111, 0.0982),
    ('exp_001', 0.0123, 0.1065),
    ('exp_003', 0.0105, 0.0972),
    ('exp_005', 0.0104, 0.0969),
    ('exp_006', 0.0097, 0.0946),
    ('exp_007', 0.0093, 0.0932),
    ('exp_009', 0.0092, 0.0936),
    ('exp_012', 0.0090, 0.0913),
    ('exp_024', 0.0087, 0.0893),
    ('exp_026', 0.0085, 0.0887),
    ('exp_030', 0.0083, 0.0877),
    ('exp_041', 0.0090, 0.0932),
    ('exp_042', 0.0145, 0.1147),
]

# One row per submission; `df` is the table the later cells analyse.
df = pd.DataFrame.from_records(submissions, columns=['exp', 'cv', 'lb'])

print('Submission History:')
print(df.to_string(index=False))

# Lower is better for both scores, so "best" is the column minimum.
lowest_cv = df['cv'].min()
lowest_lb = df['lb'].min()
print(f'\nBest CV: {lowest_cv:.6f}')
print(f'Best LB: {lowest_lb:.6f}')
print('Target: 0.0347')

In [None]:
# Least-squares fit of LB score against CV score.
# slope / intercept stay as module-level names because the plotting cell reads them.
slope, intercept, r_value, p_value, std_err = stats.linregress(df['cv'], df['lb'])

# Invert the fitted line to find the CV score that would map onto the target LB.
required_cv = (0.0347 - intercept) / slope

report_lines = [
    f'CV-LB Relationship: LB = {slope:.2f} * CV + {intercept:.4f}',
    f'R² = {r_value**2:.4f}',
    '\nInterpretation:',
    f'- Intercept ({intercept:.4f}) > Target (0.0347): {intercept > 0.0347}',
    f'- Even with CV=0, predicted LB would be {intercept:.4f}',
    '\nRequired CV to hit target (using linear model):',
    f'- Required CV: {required_cv:.6f}',
]
# A negative required CV means the fitted line never reaches the target.
if required_cv < 0:
    report_lines.append('- IMPOSSIBLE: Required CV is negative!')

print('\n'.join(report_lines))

In [None]:
# Visual check: is the CV-LB relationship actually linear, or does the
# pattern change at lower CV values?

plot_path = '/home/code/exploration/cv_lb_relationship.png'
fit_label = f'Linear: LB = {slope:.2f}*CV + {intercept:.4f}'

fig, ax = plt.subplots(figsize=(10, 6))

# Observed submissions, the fitted line, and the target LB as a reference level.
ax.scatter(df['cv'], df['lb'], s=100, alpha=0.7)
ax.plot(df['cv'], slope * df['cv'] + intercept, 'r--', label=fit_label)
ax.axhline(y=0.0347, color='g', linestyle=':', label='Target LB = 0.0347')

ax.set_xlabel('CV Score')
ax.set_ylabel('LB Score')
ax.set_title('CV vs LB Relationship')
ax.legend()
ax.grid(True, alpha=0.3)

# Save before show() so the figure is written while it is still open.
fig.savefig(plot_path, dpi=100, bbox_inches='tight')
plt.show()
print(f'Saved to {plot_path}')

In [None]:
# Working hypothesis: the intercept is high because the LB test set differs
# fundamentally from the CV test set. CV is leave-one-solvent-out, whereas the
# LB might contain:
#   1. entirely different solvents
#   2. different temperature/time ranges
#   3. different mixture compositions

# Load both datasets to see what is actually known about the data.
DATA_PATH = '/home/data'
df_single = pd.read_csv(f'{DATA_PATH}/catechol_single_solvent_yields.csv')
df_full = pd.read_csv(f'{DATA_PATH}/catechol_full_data_yields.csv')

# Distinct solvents in the single-solvent set, and distinct (A, B) pairs
# ("ramps") in the full set.
n_single_solvents = df_single['SOLVENT NAME'].nunique()
solvent_pairs = df_full[['SOLVENT A NAME', 'SOLVENT B NAME']].drop_duplicates()

print('=== Data Overview ===')
print(f'Single solvent: {len(df_single)} samples, {n_single_solvents} solvents')
print(f'Full data: {len(df_full)} samples, {len(solvent_pairs)} ramps')
print(f'\nSingle solvent solvents: {sorted(df_single["SOLVENT NAME"].unique())}')
print(f'\nFull data solvent A: {sorted(df_full["SOLVENT A NAME"].unique())}')
print(f'Full data solvent B: {sorted(df_full["SOLVENT B NAME"].unique())}')

In [None]:
# Min/max of each process variable, per dataset.
print('=== Temperature/Time Ranges ===')

# (heading, dataframe, columns to summarise); SolventB% is only reported for the full data.
range_report = (
    ('Single solvent:', df_single, ('Temperature', 'Residence Time')),
    ('\nFull data:', df_full, ('Temperature', 'Residence Time', 'SolventB%')),
)
for heading, frame, columns in range_report:
    print(heading)
    for column in columns:
        lowest, highest = frame[column].min(), frame[column].max()
        print(f'  {column}: {lowest:.1f} - {highest:.1f}')

In [None]:
# Summary statistics of each target column, side by side for both datasets.
print('=== Target Distribution ===')
for col in ['Product 2', 'Product 3', 'SM']:
    print(f'\n{col}:')
    # Labels are padded so the two rows line up in the printed output.
    for label, frame in (('Single:', df_single), ('Full:  ', df_full)):
        values = frame[col]
        print(
            f'  {label} mean={values.mean():.4f}, std={values.std():.4f}, '
            f'range=[{values.min():.4f}, {values.max():.4f}]'
        )

In [None]:
# Inventory of approaches already attempted, as (category, details) pairs.
approaches_tried = [
    ('MLP architectures', 'Various depths [16] to [512,256,128,64], residual connections'),
    ('Ensemble methods', 'Bagging (3-15 models), weighted averaging, stacking'),
    ('Tree models', 'LightGBM, XGBoost, CatBoost, Random Forest'),
    ('Gaussian Process', 'GP alone and in ensemble'),
    ('GNN', 'Basic GNN and GAT - both failed'),
    ('Pre-trained', 'ChemBERTa embeddings - failed'),
    ('TabNet', 'Attention-based - failed'),
    ('Features', 'Spange, DRFP, ACS PCA, RDKit descriptors, fragprints'),
    ('Regularization', 'Dropout, weight decay, aggressive regularization'),
    ('Loss functions', 'MSE, Huber, Quantile'),
    ('Data augmentation', 'TTA for mixtures, Mixup - failed'),
    ('CV-LB gap reduction', 'GroupKFold, importance weighting - failed'),
    ('Physical constraints', 'Mass balance normalization - marginal'),
    ('Per-target optimization', 'Different weights per target'),
    ('Per-solvent-type models', 'Different models for different solvent types - failed'),
]

print('=== Approaches Tried ===')
# Each entry: blank line, category heading, then the indented details.
for category, summary in approaches_tried:
    print(f'\n{category}:\n  {summary}')

In [None]:
# Candidate approaches not yet tried that might change the CV-LB relationship
# itself, as (numbered title, description) pairs.
print('=== Potential Unexplored Approaches ===')
unexplored = [
    ('1. Solvent-specific calibration', 'Learn a calibration factor for each solvent type'),
    ('2. Uncertainty-based prediction adjustment', 'Use GP uncertainty to adjust predictions'),
    ('3. Nearest-neighbor blending', 'Blend with predictions from most similar training solvents'),
    ('4. Multi-fidelity learning', 'Use single solvent data to inform mixture predictions'),
    ('5. Domain adaptation', 'Explicitly model the distribution shift'),
    ('6. Bayesian model averaging', 'Weight models by their uncertainty'),
    ('7. Conformal prediction with coverage guarantee', 'Ensure predictions are well-calibrated'),
    ('8. Ensemble selection', 'Select best model per sample based on similarity'),
]

# Each entry: title, indented description, then a blank separator line.
for title, description in unexplored:
    print(f'{title}:\n  {description}\n')

In [None]:
# CV-LB gap analysis.
# Two different quantities are easy to confuse here:
#   * the per-submission ratio LB / CV (roughly 10x on this data), and
#   * the slope of the fitted line LB = slope * CV + intercept, which is much
#     smaller because the large intercept carries most of the LB score.
# The ratio is reported per submission; the closing summary line reports the
# fitted coefficients rather than a hard-coded "10 * CV + 0.05", which does not
# reproduce the observed scores (it would give LB ≈ 0.13 at CV = 0.0083).

print('=== CV-LB Gap Analysis ===')
lb_cv_ratio = df['lb'] / df['cv']
for exp, cv, lb, ratio in zip(df['exp'], df['cv'], df['lb'], lb_cv_ratio):
    print(f'{exp}: CV={cv:.4f}, LB={lb:.4f}, Ratio={ratio:.2f}x')

print(f'\nAverage ratio: {lb_cv_ratio.mean():.2f}x')
# Naive ratio-only extrapolation, kept for contrast with the observed best LB.
print(f'\nIf we achieve CV=0.008194 (best), expected LB = {0.008194 * 10:.4f}')
print('But actual best LB = 0.0877 (from exp_030 with CV=0.0083)')

# Refit locally so this cell does not depend on an earlier cell having run.
gap_fit = stats.linregress(df['cv'], df['lb'])
print(f'\nThe relationship is: LB ≈ {gap_fit.slope:.2f} * CV + {gap_fit.intercept:.4f}')

In [None]:
# Cross-check the least-squares fit with a second regression model (Huber loss).
from sklearn.linear_model import HuberRegressor

# scikit-learn expects a 2-D feature matrix, hence the single-column frame.
X = df[['cv']].to_numpy()
y = df['lb'].to_numpy()

huber = HuberRegressor().fit(X, y)
huber_slope = huber.coef_[0]
print(f'Huber Regression: LB = {huber_slope:.2f} * CV + {huber.intercept_:.4f}')

# Extrapolate to the best CV score and compare against the target LB.
best_cv = 0.008194
predicted_lb = huber.predict([[best_cv]])[0]
gap_to_target = predicted_lb - 0.0347
print(f'\nPredicted LB for best CV ({best_cv}): {predicted_lb:.4f}')
print('Target: 0.0347')
print(f'Gap to target: {gap_to_target:.4f}')

In [None]:
# CONCLUSION: reaching the target (0.0347) needs a fundamentally different approach.
#   Current best: CV=0.008194 -> LB=0.0877 (predicted ~0.087)
#   Target:       LB=0.0347
#   Gap:          0.053 (a 60% reduction is needed)

conclusion_lines = (
    '=== CRITICAL CONCLUSION ===',
    'Current best LB: 0.0877',
    'Target LB: 0.0347',
    # Relative reduction in LB needed to go from the current best to the target.
    f'Required improvement: {(0.0877 - 0.0347) / 0.0877 * 100:.1f}%',
    '\nThe CV-LB relationship has a high intercept (~0.05) that cannot be reduced',
    'by simply improving CV. We need an approach that:',
    '1. Changes the CV-LB relationship itself',
    '2. Reduces the intercept, not just the slope',
    '3. Improves extrapolation to unseen solvents',
    '\nPotential approaches:',
    '- Ensemble of diverse models with different CV-LB relationships',
    '- Post-hoc calibration based on solvent similarity',
    '- Domain adaptation techniques',
    '- Meta-learning for quick adaptation to new solvents',
)
for line in conclusion_lines:
    print(line)