# Loop 42 Analysis: ChemBERTa Results and Strategic Assessment

**Key Question:** What can we learn from the ChemBERTa experiment and what should we try next?

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Submission history: one (experiment id, local CV score, public LB score)
# tuple per submission recorded so far.
submissions = [
    ('exp_000', 0.011081, 0.098160),
    ('exp_001', 0.012297, 0.106490),
    ('exp_003', 0.010501, 0.097190),
    ('exp_005', 0.010430, 0.096910),
    ('exp_006', 0.009749, 0.094570),
    ('exp_007', 0.009262, 0.093160),
    ('exp_009', 0.009192, 0.093640),
    ('exp_012', 0.009004, 0.091340),
    ('exp_024', 0.008689, 0.089290),
    ('exp_026', 0.008465, 0.088750),
    ('exp_030', 0.008298, 0.087720),
    ('exp_035', 0.009825, 0.096960),
]

# Leaderboard score we are aiming for. Named once so the printed text and the
# required-CV arithmetic below cannot drift apart.
TARGET_LB = 0.0347

# Parallel arrays of CV and LB scores (experiment ids are not needed for the fit).
cv_scores = np.array([cv for _, cv, _ in submissions])
lb_scores = np.array([lb for _, _, lb in submissions])

# Least-squares line LB = slope * CV + intercept over all submissions.
slope, intercept, r_value, p_value, std_err = stats.linregress(cv_scores, lb_scores)
print(f'CV-LB Relationship: LB = {slope:.2f} * CV + {intercept:.4f}')
print(f'R-squared = {r_value**2:.4f}')
print(f'\nTarget LB: {TARGET_LB:.4f}')
print(f'Intercept: {intercept:.4f}')
print(f'\nTo reach target LB = {TARGET_LB:.4f}:')
# Invert the fitted line to get the CV score that would map onto the target.
required_cv = (TARGET_LB - intercept) / slope
print(f'Required CV = {required_cv:.6f}')
if required_cv < 0:
    # A negative value means the fitted intercept already exceeds the target.
    # Assumes the CV score is a non-negative error metric (the notebook
    # reports it as MSE elsewhere) - TODO confirm.
    print('  (negative: the target is not reachable on the fitted line)')

In [None]:
# CV-LB gap per submission: absolute difference (LB - CV) and multiplier
# (LB / CV), followed by the mean of each over all submissions.
print('=== CV-LB Gap Analysis ===')
print()
gaps = lb_scores - cv_scores
ratios = lb_scores / cv_scores
for (name, cv_val, lb_val), diff, mult in zip(submissions, gaps, ratios):
    print(f'{name}: CV={cv_val:.6f}, LB={lb_val:.6f}, Gap={diff:.4f}, Ratio={mult:.1f}x')

print(f'\nAverage gap: {gaps.mean():.4f}')
print(f'Average ratio: {ratios.mean():.1f}x')

In [None]:
# ChemBERTa experiment results (exp_041). All figures are literals typed into
# this cell; nothing here is computed.
# NOTE(review): the baseline line quotes exp_035 at CV = 0.008194, while the
# submission-history cell lists exp_035 with CV 0.009825 - confirm which
# experiment/score is the intended baseline.
chemberta_report = (
    '=== ChemBERTa Experiment Results (exp_041) ===',
    '',
    'Single fold (1,1,1,3,3,3-Hexafluoropropan-2-ol):',
    '  ChemBERTa only (768-dim): MSE = 0.061135',
    '  ChemBERTa PCA (20-dim): MSE = 0.058003',
    '  ChemBERTa + Spange: MSE = 0.042313',
    '  ChemBERTa PCA + Spange: MSE = 0.041895',
    '',
    '  Spange only: MSE = 0.034241',
    '  DRFP only: MSE = 0.059665',
    '  Spange + DRFP: MSE = 0.057212',
    '',
    'Full CV (ChemBERTa PCA + Spange):',
    '  Mean MSE: 0.010288 +/- 0.008427',
    '  Baseline (exp_035): CV = 0.008194',
    '',
    'CONCLUSION: ChemBERTa is 25.5% WORSE than baseline',
)
# One line per entry; empty entries produce the blank separator lines.
print('\n'.join(chemberta_report))

In [None]:
# Experiments that did not help. Each entry is
# (description, CV score, short explanation); two entries are flagged in their
# explanation as single-fold results rather than full CV.
# NOTE(review): 'Minimal features (exp_038)' carries the same CV (0.009825)
# that the submission-history cell records for exp_035 - confirm the ids.
print('=== Approaches That FAILED ===')
print()
failed_approaches = [
    ('DRFP with PCA (exp_002)', 0.016948, 'DRFP alone loses information'),
    ('Deep Residual MLP (exp_004)', 0.051912, 'Too complex for small data'),
    ('Minimal features (exp_038)', 0.009825, 'Need DRFP features'),
    ('GNN (exp_040)', 0.068767, 'Single fold only - very poor'),
    ('ChemBERTa (exp_041)', 0.010288, 'Pre-trained embeddings not helpful'),
    ('k-NN with Tanimoto', 0.072666, 'Single fold only - very poor'),
]

# Three lines per entry: headline, reason, blank separator.
for label, cv_mse, why in failed_approaches:
    print(f'{label}: CV = {cv_mse:.6f}', f'  Reason: {why}', '', sep='\n')

In [None]:
# Ingredients that have helped so far, as (name, rationale) pairs.
print('=== Approaches That WORKED ===')
print()
working_approaches = [
    ('Spange descriptors', 'Physicochemical properties relevant to solvation'),
    ('DRFP features (high-variance)', 'Molecular structure fingerprints'),
    ('Arrhenius kinetics', 'Physics-informed features'),
    ('GP component', 'Different inductive bias, helps with uncertainty'),
    ('MLP + LGBM ensemble', 'Complementary model types'),
    ('Weighted loss [1,1,2] for SM', 'SM is harder to predict'),
    ('ACS PCA features', 'Additional chemical descriptors'),
]

# Three lines per entry: name, rationale, blank separator.
for label, rationale in working_approaches:
    print(f'{label}', f'  Why: {rationale}', '', sep='\n')

In [None]:
# The critical insight: CV-LB intercept problem.
# `slope` and `intercept` come from the linregress fit in the first analysis
# cell. Every number printed here is derived from them, so the text cannot
# disagree with the fit (the previous hand-typed 0.0524 did not match the
# fitted intercept printed a few lines earlier).
target_lb = 0.0347

print('=== THE CRITICAL PROBLEM ===')
print()
print(f'CV-LB relationship: LB = {slope:.2f} * CV + {intercept:.4f}')
print(f'Target LB: {target_lb:.4f}')
print(f'Intercept: {intercept:.4f}')
print()
if intercept > target_lb:
    # The fitted line sits above the target even at CV = 0, so lowering CV
    # alone cannot reach the target under this linear relationship.
    print(f'PROBLEM: Intercept ({intercept:.4f}) > Target ({target_lb:.4f})')
    print(f'This means even with CV = 0, predicted LB would be {intercept:.4f}')
    print()
    print('IMPLICATION: We need to CHANGE the CV-LB relationship, not just improve CV')
    print()
    print('Possible causes of high intercept:')
    print('1. Systematic overfitting to training solvents')
    print('2. Mismatch between local CV and Kaggle evaluation')
    print('3. Distribution shift between train/test')
    print('4. Model miscalibration')
else:
    # Only reached if the submission history changes enough to move the fit.
    print(f'Intercept ({intercept:.4f}) <= Target ({target_lb:.4f})')
    print('The fitted line reaches the target at a non-negative CV')

In [None]:
# Candidate ideas for shifting the CV-LB relationship itself, as
# (name, description) pairs.
print('=== APPROACHES TO CHANGE CV-LB RELATIONSHIP ===')
print()
approaches = [
    ('Prediction calibration', 'Adjust predictions to reduce systematic bias'),
    ('Ensemble with constant offset', 'Add a learned offset to predictions'),
    ('Different CV scheme', 'Verify our CV matches Kaggle exactly'),
    ('Regularization tuning', 'Stronger regularization to prevent overfitting'),
    ('Feature selection', 'Remove features that cause overfitting'),
    ('Domain adaptation', 'Explicitly model train-test distribution shift'),
]

# Three lines per entry: name, indented description, blank separator.
for idea, summary in approaches:
    print(f'{idea}', f'  {summary}', '', sep='\n')

In [None]:
# Best experiment analysis (exp_030).
# The gap and both multipliers are derived from the two scores instead of being
# typed by hand. The previous "(9.6x)" was gap / CV, whereas the gap-analysis
# cell reports "Ratio" as LB / CV (10.6x for this experiment); both are now
# printed with explicit labels so the two cells cannot be misread against
# each other.
best_cv = 0.008298
best_lb = 0.087720
best_gap = best_lb - best_cv

print('=== BEST EXPERIMENT: exp_030 (GP+MLP+LGBM) ===')
print()
print(f'CV: {best_cv:.6f}')
print(f'LB: {best_lb:.6f}')
print(f'Gap: {best_gap:.4f} ({best_gap / best_cv:.1f}x CV; LB/CV = {best_lb / best_cv:.1f}x)')
print()
# Ensemble make-up as recorded for this experiment; the number in parentheses
# after each model name is presumably its ensemble weight (the next-steps cell
# refers to a GP weight of 0.2).
print('Components:')
print('  GP (0.2): Matern kernel on Spange + Arrhenius (18 features)')
print('  MLP (0.5): [32,16] with weighted loss on full features (145 features)')
print('  LGBM (0.3): Full features (145 features)')
print()
print('Key insight: GP provides different inductive bias')
print('GP may have better uncertainty calibration for OOD solvents')

In [None]:
# Recommended next steps, grouped by priority. Each entry pairs a heading with
# its bullet lines; a blank line is printed before every heading.
# NOTE(review): Platt scaling is normally described as a calibration method for
# classifier scores; confirm it is the intended technique for these
# regression predictions.
next_steps = [
    ('Priority 1: Prediction Calibration', [
        '  - Try Platt scaling or isotonic regression on predictions',
        '  - May reduce the intercept in CV-LB relationship',
    ]),
    ('Priority 2: Verify CV Scheme', [
        '  - Read template notebook carefully',
        '  - Ensure we leave out full experiments for mixtures',
        '  - Check weighting between single-solvent and mixture data',
    ]),
    ('Priority 3: Stronger Regularization', [
        '  - Increase dropout, weight decay',
        '  - Reduce model complexity further',
        '  - May reduce overfitting to training solvents',
    ]),
    ('Priority 4: GP-Heavy Ensemble', [
        '  - Increase GP weight (currently 0.2)',
        '  - GP may generalize better to unseen solvents',
        '  - Try pure GP model',
    ]),
]

print('=== RECOMMENDED NEXT STEPS ===')
for heading, bullets in next_steps:
    print()
    print(heading)
    for bullet in bullets:
        print(bullet)

In [None]:
# Final summary.
# `intercept` comes from the linregress fit in the first analysis cell; it is
# formatted here rather than typed by hand so the summary always matches the
# fit (the previous literal 0.0524 did not). The best-score/target multiplier
# is likewise derived from the two numbers it compares.
target_lb = 0.0347
best_cv, best_lb = 0.008298, 0.087720  # exp_030

print('=== LOOP 42 SUMMARY ===')
print()
print('ChemBERTa experiment (exp_041):')
print('  - CV = 0.010288 (25.5% WORSE than baseline)')
print('  - Pre-trained molecular embeddings do NOT help')
print('  - Domain-specific Spange descriptors remain superior')
print('  - Correctly decided NOT to submit')
print()
print('Current best:')
print(f'  - exp_030: CV = {best_cv:.6f}, LB = {best_lb:.6f}')
print(f'  - Target: {target_lb:.4f} ({best_lb / target_lb:.2f}x gap)')
print()
print('Remaining submissions: 4')
print()
print('Critical insight:')
if intercept > target_lb:
    # Fitted line is above the target even at CV = 0.
    print(f'  - CV-LB intercept ({intercept:.4f}) > Target ({target_lb:.4f})')
    print('  - Need to change the relationship, not just improve CV')
    print('  - Focus on calibration, regularization, or different approach')
else:
    # Only reached if the submission history changes enough to move the fit.
    print(f'  - CV-LB intercept ({intercept:.4f}) <= Target ({target_lb:.4f})')
    print('  - Target is reachable on the fitted line by improving CV')