# Loop 80 Analysis: CV-LB Relationship and Strategy Assessment

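**Goal**: Analyze the relationship between cross-validation (CV) and leaderboard (LB) scores across all submissions, and identify strategies that could break the linear CV-LB pattern (in particular, lower its intercept).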

**Key Questions**:
1. What is the exact CV-LB relationship?
2. Are there any outliers that deviate from the line?
3. What approaches might change the intercept?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Every submission that has both a CV score and an LB score, recorded as
# (experiment id, CV, LB) rows.
_scored_runs = [
    ('exp_000', 0.0111, 0.0982),
    ('exp_001', 0.0123, 0.1065),
    ('exp_003', 0.0105, 0.0972),
    ('exp_005', 0.0104, 0.0969),
    ('exp_006', 0.0097, 0.0946),
    ('exp_007', 0.0093, 0.0932),
    ('exp_009', 0.0092, 0.0936),
    ('exp_012', 0.0090, 0.0913),
    ('exp_024', 0.0087, 0.0893),
    ('exp_026', 0.0085, 0.0887),
    ('exp_030', 0.0083, 0.0877),
    ('exp_035', 0.0098, 0.0970),
]

# Expand the rows into the list-of-dicts form; the key order fixes the
# DataFrame column order (exp, cv, lb).
submissions = [
    {'exp': exp_id, 'cv': cv_score, 'lb': lb_score}
    for exp_id, cv_score, lb_score in _scored_runs
]

df = pd.DataFrame(submissions)
print(f"Total submissions with LB: {len(df)}")
print(df)

In [None]:
# Fit a least-squares line through the submissions: LB = slope * CV + intercept.
# Besides slope and intercept, linregress returns the correlation coefficient
# (r_value), the p-value of the fit, and the standard error of the estimated
# slope (std_err).
slope, intercept, r_value, p_value, std_err = stats.linregress(df['cv'], df['lb'])

print("\n=== CV-LB Relationship ===")
print(f"Linear fit: LB = {slope:.4f} * CV + {intercept:.6f}")
# The superscript two is written as an escape so the label cannot be
# mis-decoded again; the previous literal had been garbled by an encoding
# round-trip and printed two junk characters after the "R".
print(f"R\u00b2 = {r_value**2:.4f}")
# std_err is the standard error of the slope, not of the residuals.
print(f"Standard error: {std_err:.4f}")

# Residual = actual LB minus the LB the fitted line predicts for that CV.
# Both columns are stored on df so later cells can reuse them.
df['predicted_lb'] = slope * df['cv'] + intercept
df['residual'] = df['lb'] - df['predicted_lb']
print("\nResiduals (LB - predicted):")
print(df[['exp', 'cv', 'lb', 'predicted_lb', 'residual']].to_string())

In [None]:
# Compare the fitted intercept against the LB score we want to reach.
target = 0.0347

# When the intercept is above the target, the fitted line predicts an LB
# above the target even at CV = 0.
intercept_exceeds_target = intercept > target

print("\n=== Target Analysis ===")
print(f"Target LB: {target}")
print(f"Intercept: {intercept:.6f}")
print(f"Intercept vs Target: {intercept:.6f} > {target} = {intercept_exceeds_target}")

if not intercept_exceeds_target:
    # Solve target = slope * CV + intercept for CV, then report how far the
    # best CV so far is from that value, as a percentage of it.
    required_cv = (target - intercept) / slope
    best_cv = df['cv'].min()
    gap_pct = (best_cv - required_cv) / required_cv * 100
    print(f"Required CV to reach target: {required_cv:.6f}")
    print(f"Best CV so far: {best_cv:.6f}")
    print(f"Gap: {gap_pct:.1f}%")
else:
    print(f"\n*** CRITICAL: Intercept ({intercept:.6f}) > Target ({target}) ***")
    print(f"Even with CV=0, predicted LB would be {intercept:.6f}")
    print("This means the target is UNREACHABLE with current approaches!")

In [None]:
# Rank submissions by how far their LB sits from the fitted line.
print("\n=== Outlier Analysis ===")

# Ascending sort puts the smallest (most negative) residual in the first row.
df_sorted = df.sort_values('residual')
report_columns = ['exp', 'cv', 'lb', 'predicted_lb', 'residual']
print("Submissions sorted by residual (negative = better than predicted):")
print(df_sorted[report_columns].to_string())

# Detail the top-ranked row, one labelled line per numeric field.
best_row = df_sorted.iloc[0]
print(f"\nBest outlier (most negative residual): {best_row['exp']}")
for label, column in (
    ('CV', 'cv'),
    ('LB', 'lb'),
    ('Predicted LB', 'predicted_lb'),
    ('Residual', 'residual'),
):
    print(f"  {label}: {best_row[column]:.6f}")

In [None]:
# What would it take to reach the target?
# The best-LB row is looked up in df, so the experiment id printed next to
# the score always matches the data instead of being typed in by hand.
best_lb_row = df['lb'].idxmin()
best_lb = df.loc[best_lb_row, 'lb']
best_lb_exp = df.loc[best_lb_row, 'exp']
best_cv = df['cv'].min()

print("\n=== Path to Target ===")
print(f"Target: {target}")
print(f"Best LB: {best_lb:.6f} ({best_lb_exp})")
print(f"Gap: {(best_lb - target) / target * 100:.1f}%")

# Hypothetical line parameters for the what-if scenarios. They are defined
# once so the headings and all three calculations use the same numbers.
new_intercept = 0.02
new_slope = 2.0

# Each scenario solves target = slope * CV + intercept for CV under the
# stated hypothetical parameters.
print(f"\nScenario 1: Reduce intercept to {new_intercept} (keep slope)")
required_cv_1 = (target - new_intercept) / slope
print(f"  Required CV: {required_cv_1:.6f}")
print(f"  Achievable? Best CV is {best_cv:.6f}")

print(f"\nScenario 2: Reduce slope to {new_slope} (keep intercept)")
required_cv_2 = (target - intercept) / new_slope
print(f"  Required CV: {required_cv_2:.6f}")
# A non-positive required CV is reported as not achievable.
print(f"  Achievable? {required_cv_2 > 0}")

print(f"\nScenario 3: Both intercept={new_intercept} and slope={new_slope}")
required_cv_3 = (target - new_intercept) / new_slope
print(f"  Required CV: {required_cv_3:.6f}")
print(f"  Achievable? Best CV is {best_cv:.6f}")

In [None]:
# Experiments that have a CV score but no LB score yet; the fitted line is
# used to predict where each would land on the leaderboard.
pending = [
    {'exp': 'exp_049', 'cv': 0.0081},
    {'exp': 'exp_050', 'cv': 0.0081},
    {'exp': 'exp_052', 'cv': 0.0109},
    {'exp': 'exp_053', 'cv': 0.0081},
    {'exp': 'exp_054', 'cv': 0.0085},
    {'exp': 'exp_055', 'cv': 0.0085},
    {'exp': 'exp_057', 'cv': 0.0093},
    {'exp': 'exp_063', 'cv': 0.0112},
]

print("\n=== Pending Submissions ===")
for p in pending:
    predicted_lb = slope * p['cv'] + intercept
    print(f"{p['exp']}: CV={p['cv']:.4f} -> Predicted LB={predicted_lb:.4f}")

# Take the best (lowest) pending CV from the list itself, so the summary stays
# correct when entries are added or removed.
best_pending_cv = min(run['cv'] for run in pending)
best_pending_lb = slope * best_pending_cv + intercept

print(f"\nBest pending CV: {best_pending_cv:.4f}")
print(f"Predicted LB for CV={best_pending_cv:.4f}: {best_pending_lb:.4f}")
# NOTE(review): the experiment id and LB in this sentence are typed-in
# literals, not read from the submissions table - update by hand if the
# best LB changes.
print("This would be slightly better than exp_030 (LB=0.0877) if on same line")

In [None]:
# Catalogue of the modelling / validation approaches tried so far, printed
# as a 1-based numbered list followed by the takeaway.
print("\n=== Approaches Tried (80 experiments) ===")
approaches = [
    "MLP with various architectures",
    "LightGBM",
    "XGBoost",
    "CatBoost",
    "Random Forest",
    "Ridge Regression",
    "Kernel Ridge",
    "Gaussian Process",
    "GNN (Graph Neural Network)",
    "ChemBERTa embeddings",
    "Ensembles (MLP+LGBM+GP, etc.)",
    "Feature engineering (Spange, DRFP, ACS-PCA, Fragprints)",
    "Arrhenius kinetics features",
    "TTA (Test Time Augmentation)",
    "Data augmentation for mixtures",
    "Output normalization",
    "Extrapolation detection",
    "Similarity weighting",
    "Per-target models",
    "GroupKFold(5) validation",
]

for i, approach in enumerate(approaches, start=1):
    # print's default single-space separator yields "<n>. <approach>".
    print(f"{i}.", approach)

# Closing takeaway, emitted as one two-line message.
takeaway = (
    "\nAll approaches fall on the SAME CV-LB line!",
    "This indicates STRUCTURAL distribution shift, not model inadequacy.",
)
print("\n".join(takeaway))

In [None]:
# Ideas listed as not yet tried that might shift the CV-LB line itself.
# The numbering is part of each string.
untried = [
    "1. DOMAIN CONSTRAINTS: Force predictions to sum to 1 (yield constraint)",
    "2. PSEUDO-LABELING: Use confident test predictions to augment training",
    "3. ADVERSARIAL VALIDATION: Identify and weight samples similar to test",
    "4. SOLVENT SIMILARITY FEATURES: Add features measuring distance to training solvents",
    "5. CONSERVATIVE PREDICTIONS: Blend toward mean when extrapolating",
    "6. MULTI-TASK LEARNING: Share representations across targets",
    "7. TEMPERATURE SCALING: Calibrate predictions post-hoc",
    "8. MIXTURE-OF-EXPERTS: Different models for different solvent types",
]

print("\n=== Potential Approaches to Change CV-LB Relationship ===")
for approach in untried:
    print(approach)

# NOTE(review): the 0.053 in the message below is a typed-in literal, not the
# fitted intercept variable - re-check it whenever the regression inputs change.
key_insight = (
    "\n*** KEY INSIGHT ***",
    "The intercept (0.053) represents EXTRAPOLATION ERROR.",
    "To reduce it, we need approaches that:",
    "  1. Better generalize to unseen solvents",
    "  2. Make conservative predictions when uncertain",
    "  3. Exploit domain constraints (yields sum to 1)",
)
print("\n".join(key_insight))