# Loop 114 Analysis: CV-LB Relationship and Strategy Assessment

## Goal
Analyze the CV-LB relationship across all 24 submissions (only the 14 with a recorded LB score are tabulated below) and determine the best path forward with only 3 submissions remaining.

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Submissions that have both a CV score and an LB score, one (exp, cv, lb) row each.
_SUBMISSION_FIELDS = ('exp', 'cv', 'lb')
_submission_rows = [
    ('exp_000', 0.0111, 0.0982),
    ('exp_001', 0.0123, 0.1065),
    ('exp_003', 0.0105, 0.0972),
    ('exp_005', 0.0104, 0.0969),
    ('exp_006', 0.0097, 0.0946),
    ('exp_007', 0.0093, 0.0932),
    ('exp_009', 0.0092, 0.0936),
    ('exp_012', 0.0090, 0.0913),
    ('exp_024', 0.0087, 0.0893),
    ('exp_026', 0.0085, 0.0887),
    ('exp_030', 0.0083, 0.0877),  # Best LB
    ('exp_035', 0.0098, 0.0970),
    ('exp_073', 0.0084, 0.1451),  # OUTLIER - model mismatch?
    ('exp_111', 0.0129, 0.1063),
]

# Expand the rows into records so the frame's columns come out as exp, cv, lb.
submissions = [dict(zip(_SUBMISSION_FIELDS, row)) for row in _submission_rows]
df = pd.DataFrame(submissions)

print(f"Total submissions with LB: {df.shape[0]}")
print(df)

Total submissions with LB: 14
        exp      cv      lb
0   exp_000  0.0111  0.0982
1   exp_001  0.0123  0.1065
2   exp_003  0.0105  0.0972
3   exp_005  0.0104  0.0969
4   exp_006  0.0097  0.0946
5   exp_007  0.0093  0.0932
6   exp_009  0.0092  0.0936
7   exp_012  0.0090  0.0913
8   exp_024  0.0087  0.0893
9   exp_026  0.0085  0.0887
10  exp_030  0.0083  0.0877
11  exp_035  0.0098  0.0970
12  exp_073  0.0084  0.1451
13  exp_111  0.0129  0.1063


In [2]:
# Exclude outlier exp_073 (likely model class mismatch)
df_valid = df[df['exp'] != 'exp_073'].copy()
print(f"Valid submissions (excluding exp_073): {len(df_valid)}")

# Fit linear regression LB = slope * CV + intercept on the remaining submissions
slope, intercept, r_value, p_value, std_err = stats.linregress(df_valid['cv'], df_valid['lb'])

print(f"\n=== CV-LB Relationship ===")
print(f"Linear fit: LB = {slope:.4f} × CV + {intercept:.4f}")
print(f"R² = {r_value**2:.4f}")
# std_err is linregress's standard error of the estimated slope (not of the residuals)
print(f"Standard error: {std_err:.4f}")

# Target analysis
target = 0.0347
# CV at which the fitted line predicts LB == target; computed once and reused by both branches
required_cv = (target - intercept) / slope
print(f"\n=== Target Analysis ===")
print(f"Target LB: {target}")
print(f"Intercept: {intercept:.4f}")
print(f"Intercept > Target? {intercept > target}")

if intercept < target:
    print(f"Required CV to hit target: {required_cv:.6f}")
else:
    print(f"CRITICAL: Intercept ({intercept:.4f}) > Target ({target})!")
    print(f"Target is MATHEMATICALLY UNREACHABLE with approaches on this line!")
    print(f"Required CV would be: {required_cv:.6f} (NEGATIVE = IMPOSSIBLE)")

# Best achieved
# best_lb stays the global LB minimum (later cells read it as "current best LB").
# The residual check below must compare like with like, so the LB is also looked up
# on the same row that holds the best CV instead of assuming both minima coincide.
best_cv_idx = df_valid['cv'].idxmin()
best_cv = df_valid.loc[best_cv_idx, 'cv']
best_cv_exp = df_valid.loc[best_cv_idx, 'exp']
best_cv_lb = df_valid.loc[best_cv_idx, 'lb']
best_lb = df_valid['lb'].min()
print(f"\n=== Best Achieved ===")
print(f"Best CV: {best_cv:.4f}")
print(f"Best LB: {best_lb:.4f}")
print(f"Gap to target: {best_lb - target:.4f} ({(best_lb - target) / target * 100:.1f}%)")

# Expected LB for best CV, compared against the LB that same experiment actually scored
expected_lb = slope * best_cv + intercept
print(f"\nExpected LB for best CV ({best_cv:.4f}): {expected_lb:.4f}")
print(f"Actual LB of best-CV experiment ({best_cv_exp}): {best_cv_lb:.4f}")
print(f"Difference: {best_cv_lb - expected_lb:.4f}")

Valid submissions (excluding exp_073): 13

=== CV-LB Relationship ===
Linear fit: LB = 4.0895 × CV + 0.0546
R² = 0.9607
Standard error: 0.2494

=== Target Analysis ===
Target LB: 0.0347
Intercept: 0.0546
Intercept > Target? True
CRITICAL: Intercept (0.0546) > Target (0.0347)!
Target is MATHEMATICALLY UNREACHABLE with approaches on this line!
Required CV would be: -0.004872 (NEGATIVE = IMPOSSIBLE)

=== Best Achieved ===
Best CV: 0.0083
Best LB: 0.0877
Gap to target: 0.0530 (152.7%)

Expected LB for best CV (0.0083): 0.0886
Actual best LB: 0.0877
Difference: -0.0009


In [3]:
# exp_112 (pseudo-labeling) is not in the submissions table, so project its CV
# through the fitted CV-LB line to see where it would land.
exp_112_cv = 0.009566
exp_112_expected_lb = slope * exp_112_cv + intercept

# Half-width of the band around the projection used for the three readings below.
_on_line_band = 0.005
_lb_if_better = exp_112_expected_lb - _on_line_band
_lb_if_worse = exp_112_expected_lb + _on_line_band

_exp_112_report = [
    "=== exp_112 (Pseudo-Labeling) Analysis ===",
    f"CV: {exp_112_cv:.6f}",
    f"Expected LB from line: {exp_112_expected_lb:.4f}",
    f"\nIf LB ≈ {exp_112_expected_lb:.4f}: Pseudo-labeling is ON THE LINE (no improvement)",
    f"If LB < {_lb_if_better:.4f}: Pseudo-labeling CHANGED the relationship (promising!)",
    f"If LB > {_lb_if_worse:.4f}: Something is WRONG",
]
print("\n".join(_exp_112_report))

# Distance from the current best LB to the target, absolute and relative to best LB.
_lb_gap = best_lb - target
_lb_gap_pct = _lb_gap / best_lb * 100

_needed_report = [
    "\n=== What We Need ===",
    f"Target LB: {target}",
    f"Current best LB: {best_lb:.4f}",
    f"Improvement needed: {_lb_gap:.4f} ({_lb_gap_pct:.1f}%)",
    "\nThis is a MASSIVE improvement - unlikely with incremental changes.",
    "We need to BREAK THE CV-LB LINE, not just improve CV.",
]
print("\n".join(_needed_report))

=== exp_112 (Pseudo-Labeling) Analysis ===
CV: 0.009566
Expected LB from line: 0.0937

If LB ≈ 0.0937: Pseudo-labeling is ON THE LINE (no improvement)
If LB < 0.0887: Pseudo-labeling CHANGED the relationship (promising!)
If LB > 0.0987: Something is WRONG

=== What We Need ===
Target LB: 0.0347
Current best LB: 0.0877
Improvement needed: 0.0530 (60.4%)

This is a MASSIVE improvement - unlikely with incremental changes.
We need to BREAK THE CV-LB LINE, not just improve CV.


In [4]:
# Catalogue of the approaches tried so far, with the outcome notes recorded for them.
approaches = [
    "MLP with Arrhenius kinetics",
    "LightGBM",
    "DRFP + PCA",
    "Combined Spange + DRFP",
    "Deep Residual MLP (FAILED)",
    "Large Ensemble (15 models)",
    "Simpler Model [64, 32]",
    "CatBoost + XGBoost ensemble",
    "GNN (CV=0.026 - 3x worse)",
    "ChemBERTa (CV=0.028 - 3.5x worse)",
    "Chemical Similarity blending (exp_111)",
    "Pseudo-labeling (exp_112)",
]

# Print the catalogue as a 1-based numbered list.
print("=== Approaches Tried (from session_state) ===")
_numbered_approaches = [f"{rank}. {label}" for rank, label in enumerate(approaches, start=1)]
print("\n".join(_numbered_approaches))

# NOTE(review): the first insight line names GP and Ridge, which are not in the
# `approaches` list above; confirm whether the list or the sentence is incomplete.
_key_insight_lines = (
    "\n=== Key Insight ===",
    "ALL tabular approaches (MLP, LGBM, XGB, CatBoost, GP, Ridge) fall on the SAME CV-LB line.",
    "GNN and ChemBERTa have WORSE CV (3x worse) - not promising.",
    "Chemical similarity blending (exp_111) was ON THE LINE.",
    "Pseudo-labeling (exp_112) is likely ON THE LINE too.",
    "\nThe problem is STRUCTURAL - test solvents are fundamentally different from training.",
)
for _insight_line in _key_insight_lines:
    print(_insight_line)

=== Approaches Tried (from session_state) ===
1. MLP with Arrhenius kinetics
2. LightGBM
3. DRFP + PCA
4. Combined Spange + DRFP
5. Deep Residual MLP (FAILED)
6. Large Ensemble (15 models)
7. Simpler Model [64, 32]
8. CatBoost + XGBoost ensemble
9. GNN (CV=0.026 - 3x worse)
10. ChemBERTa (CV=0.028 - 3.5x worse)
11. Chemical Similarity blending (exp_111)
12. Pseudo-labeling (exp_112)

=== Key Insight ===
ALL tabular approaches (MLP, LGBM, XGB, CatBoost, GP, Ridge) fall on the SAME CV-LB line.
GNN and ChemBERTa have WORSE CV (3x worse) - not promising.
Chemical similarity blending (exp_111) was ON THE LINE.
Pseudo-labeling (exp_112) is likely ON THE LINE too.

The problem is STRUCTURAL - test solvents are fundamentally different from training.


In [5]:
# Remaining options and the submission-budget decision, emitted as one fixed report.
# Every entry is printed on its own line; a leading "\n" inserts the blank separator line.
_remaining_options_report = (
    "=== Remaining Options ===",
    "\n1. SUBMIT exp_112 to confirm pseudo-labeling doesn't help",
    "   - Expected LB: ~0.094 (on the line)",
    # NOTE(review): exp_112 is called pseudo-labeling everywhere else; "label smoothing"
    # on the next line may be a slip. Confirm before relying on the wording.
    "   - If on line: confirms label smoothing doesn't help",
    "   - Uses 1 of 3 remaining submissions",
    "\n2. Try DIRECT CALIBRATION",
    "   - Apply a calibration factor to predictions",
    # NOTE(review): 0.0877 is the best LB in the submissions table, whereas the line is
    # labelled "expected_lb"; confirm which quantity the factor is meant to use.
    "   - calibration_factor = target / expected_lb = 0.0347 / 0.0877 = 0.396",
    "   - This is a heuristic but might help",
    "\n3. Try UNCERTAINTY-WEIGHTED PREDICTIONS",
    "   - Train multiple models with different seeds",
    "   - Weight predictions by inverse variance",
    "   - More confident = higher weight",
    "\n4. Try CONSERVATIVE PREDICTIONS for dissimilar solvents",
    "   - Compute Tanimoto similarity to training solvents",
    "   - If low similarity, blend toward training mean",
    "\n5. REPLICATE ens-model kernel exactly",
    "   - The public kernel achieves good LB",
    "   - Uses CatBoost + XGBoost with specific weights",
    "   - Single: 7:6, Full: 1:2",
    "\n=== CRITICAL DECISION ===",
    "With only 3 submissions remaining, we need to be strategic.",
    "The target (0.0347) is 60% below our best LB (0.0877).",
    "This is a HUGE gap that requires a FUNDAMENTAL change, not optimization.",
)
for _report_line in _remaining_options_report:
    print(_report_line)

=== Remaining Options ===

1. SUBMIT exp_112 to confirm pseudo-labeling doesn't help
   - Expected LB: ~0.094 (on the line)
   - If on line: confirms label smoothing doesn't help
   - Uses 1 of 3 remaining submissions

2. Try DIRECT CALIBRATION
   - Apply a calibration factor to predictions
   - calibration_factor = target / expected_lb = 0.0347 / 0.0877 = 0.396
   - This is a heuristic but might help

3. Try UNCERTAINTY-WEIGHTED PREDICTIONS
   - Train multiple models with different seeds
   - Weight predictions by inverse variance
   - More confident = higher weight

4. Try CONSERVATIVE PREDICTIONS for dissimilar solvents
   - Compute Tanimoto similarity to training solvents
   - If low similarity, blend toward training mean

5. REPLICATE ens-model kernel exactly
   - The public kernel achieves good LB
   - Uses CatBoost + XGBoost with specific weights
   - Single: 7:6, Full: 1:2

=== CRITICAL DECISION ===
With only 3 submissions remaining, we need to be strategic.
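The target (0.0347) is 60% below our best LB (0.0877).
This is a HUGE gap that requires a FUNDAMENTAL change, not optimization.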

In [6]:
# Final recommendation and immediate actions, emitted as one fixed report.
# Every entry is printed on its own line; a leading "\n" inserts the blank separator line.
_final_recommendation_report = (
    "=== FINAL RECOMMENDATION ===",
    "\n1. DO NOT submit exp_112 (pseudo-labeling)",
    # NOTE(review): the best CV among the tabulated submissions is 0.0083 (exp_030);
    # the 0.0081 quoted on the next line may come from an unsubmitted experiment. Confirm.
    "   - It's likely on the line (CV=0.0096 is WORSE than best CV=0.0081)",
    "   - Would waste a submission",
    "\n2. INSTEAD, try a FUNDAMENTALLY DIFFERENT approach:",
    "   a) Domain-adversarial training",
    "   b) Conformal prediction for uncertainty",
    "   c) Physics-informed constraints (Arrhenius, mass balance)",
    "   d) Scaffold-based splitting for better CV-LB alignment",
    "\n3. The benchmark paper achieved MSE 0.0039",
    "   - They used GNN with attention mechanisms",
    "   - Our GNN attempts had CV=0.026 (3x worse)",
    "   - But they might have had model class mismatches!",
    "\n4. CHECK: Did GNN/ChemBERTa experiments have correct submission cells?",
    "   - If submission cells used different model class, LB would be wrong",
    "   - This could explain why GNN/ChemBERTa didn't help",
    "\n=== IMMEDIATE ACTION ===",
    "1. Verify GNN/ChemBERTa submission cell model classes",
    "2. If they were wrong, FIX and re-run",
    "3. If they were correct, try domain-adversarial training",
    "4. Save submissions for approaches that CHANGE the CV-LB relationship",
)
for _report_line in _final_recommendation_report:
    print(_report_line)

=== FINAL RECOMMENDATION ===

1. DO NOT submit exp_112 (pseudo-labeling)
   - It's likely on the line (CV=0.0096 is WORSE than best CV=0.0081)
   - Would waste a submission

2. INSTEAD, try a FUNDAMENTALLY DIFFERENT approach:
   a) Domain-adversarial training
   b) Conformal prediction for uncertainty
   c) Physics-informed constraints (Arrhenius, mass balance)
   d) Scaffold-based splitting for better CV-LB alignment

3. The benchmark paper achieved MSE 0.0039
   - They used GNN with attention mechanisms
   - Our GNN attempts had CV=0.026 (3x worse)
   - But they might have had model class mismatches!

4. CHECK: Did GNN/ChemBERTa experiments have correct submission cells?
   - If submission cells used different model class, LB would be wrong
   - This could explain why GNN/ChemBERTa didn't help

=== IMMEDIATE ACTION ===
1. Verify GNN/ChemBERTa submission cell model classes
2. If they were wrong, FIX and re-run
3. If they were correct, try domain-adversarial training
4. Save submissions for approaches that CHANGE the CV-LB relationship