# Loop 53 Analysis: CV-LB Relationship and Strategy

**Key Question**: What is the CV-LB relationship and what strategies can reduce the gap?

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Submission history: one (experiment id, CV score, LB score) row per submission.
_history_rows = [
    ('exp_000', 0.0111, 0.0982),
    ('exp_001', 0.0123, 0.1065),
    ('exp_003', 0.0105, 0.0972),
    ('exp_005', 0.0104, 0.0969),
    ('exp_006', 0.0097, 0.0946),
    ('exp_007', 0.0093, 0.0932),
    ('exp_009', 0.0092, 0.0936),
    ('exp_012', 0.0090, 0.0913),
    ('exp_024', 0.0087, 0.0893),
    ('exp_026', 0.0085, 0.0887),
    ('exp_030', 0.0083, 0.0877),
    ('exp_041', 0.0090, 0.0932),
    ('exp_042', 0.0145, 0.1147),
]

# Expand the rows into records so the key order fixes the column order: exp, cv, lb.
submissions = [
    {'exp': exp_id, 'cv': cv_score, 'lb': lb_score}
    for exp_id, cv_score, lb_score in _history_rows
]
df = pd.DataFrame(submissions)

# Heading plus the full table as plain text, without the index column.
print('Submission History:', df.to_string(index=False), sep='\n')

Submission History:
    exp     cv     lb
exp_000 0.0111 0.0982
exp_001 0.0123 0.1065
exp_003 0.0105 0.0972
exp_005 0.0104 0.0969
exp_006 0.0097 0.0946
exp_007 0.0093 0.0932
exp_009 0.0092 0.0936
exp_012 0.0090 0.0913
exp_024 0.0087 0.0893
exp_026 0.0085 0.0887
exp_030 0.0083 0.0877
exp_041 0.0090 0.0932
exp_042 0.0145 0.1147


In [2]:
# Least-squares fit of leaderboard score on CV score: LB = slope * CV + intercept
cv_lb_fit = stats.linregress(df['cv'], df['lb'])
slope, intercept = cv_lb_fit.slope, cv_lb_fit.intercept
r_value, p_value = cv_lb_fit.rvalue, cv_lb_fit.pvalue
# Per the SciPy documentation, `stderr` is the standard error of the estimated slope
# (not of the residuals), so it is on the scale of `slope`.
std_err = cv_lb_fit.stderr

print(
    '\n=== CV-LB LINEAR RELATIONSHIP ===',
    f'LB = {slope:.4f} * CV + {intercept:.4f}',
    f'R² = {r_value**2:.4f}',
    f'Standard Error: {std_err:.4f}',
    sep='\n',
)

# Store the fitted LB and the signed miss (actual - fitted) beside each submission.
# Note: this adds two columns to `df` in place.
df['predicted_lb'] = df['cv'].mul(slope).add(intercept)
df['residual'] = df['lb'].sub(df['predicted_lb'])

residual_table = df[['exp', 'cv', 'lb', 'predicted_lb', 'residual']]
print('\nResiduals (actual - predicted):')
print(residual_table.to_string(index=False))


=== CV-LB LINEAR RELATIONSHIP ===
LB = 4.2312 * CV + 0.0533
R² = 0.9807
Standard Error: 0.1790

Residuals (actual - predicted):
    exp     cv     lb  predicted_lb  residual
exp_000 0.0111 0.0982      0.100269 -0.002069
exp_001 0.0123 0.1065      0.105346  0.001154
exp_003 0.0105 0.0972      0.097730 -0.000530
exp_005 0.0104 0.0969      0.097307 -0.000407
exp_006 0.0097 0.0946      0.094345  0.000255
exp_007 0.0093 0.0932      0.092652  0.000548
exp_009 0.0092 0.0936      0.092229  0.001371
exp_012 0.0090 0.0913      0.091383 -0.000083
exp_024 0.0087 0.0893      0.090114 -0.000814
exp_026 0.0085 0.0887      0.089267 -0.000567
exp_030 0.0083 0.0877      0.088421 -0.000721
exp_041 0.0090 0.0932      0.091383  0.001817
exp_042 0.0145 0.1147      0.114655  0.000045


In [3]:
# Target analysis: compare the target LB score with the fitted CV-LB line.
target_lb = 0.073040

print('\n=== TARGET ANALYSIS ===')
print(f'Target LB: {target_lb}')
print(f'Intercept: {intercept:.4f}')
best_lb = df['lb'].min()
print(f'Best LB so far: {best_lb:.4f}')
best_cv = df['cv'].min()
print(f'Best CV so far: {best_cv:.4f}')

# Is target reachable?
# The intercept is the fitted LB at CV = 0. The target is treated as reachable only
# when the intercept is strictly below it.
if not (intercept < target_lb):
    print(
        '\n✗ TARGET NOT REACHABLE with current CV-LB relationship',
        f'Intercept ({intercept:.4f}) > Target ({target_lb})',
        'Need to CHANGE the relationship, not just improve CV',
        sep='\n',
    )
else:
    # Invert LB = slope * CV + intercept to get the CV that maps onto the target.
    required_cv = (target_lb - intercept) / slope
    print('\n✓ TARGET IS REACHABLE!')
    print(f'Required CV to hit target: {required_cv:.6f}')
    print(f'Current best CV: {best_cv:.6f}')
    # Relative reduction of the best CV needed to reach required_cv, in percent.
    cv_cut_pct = (best_cv - required_cv) / best_cv * 100
    print(f'CV improvement needed: {cv_cut_pct:.1f}%')


=== TARGET ANALYSIS ===
Target LB: 0.07304
Intercept: 0.0533
Best LB so far: 0.0877
Best CV so far: 0.0083

✓ TARGET IS REACHABLE!
Required CV to hit target: 0.004665
Current best CV: 0.008300
CV improvement needed: 43.8%


In [4]:
# Analyze model types and their performance.
# Everything printed here is static text; nothing in this cell is computed.
# NOTE(review): the "R² = 0.98" below is hard-coded rather than taken from the fit,
# so it will not follow the regression if the submission history changes.
model_type_report = '\n'.join([
    '\n=== MODEL TYPE ANALYSIS ===',
    '\nAll submissions fall on the same CV-LB line (R² = 0.98)',
    'This means ALL model types have the same extrapolation error!',
    '\nModel types tried:',
    '- MLP (various architectures)',
    '- LightGBM',
    '- XGBoost',
    '- GP (Gaussian Process)',
    '- Ridge Regression',
    '- Ensembles (GP+MLP+LGBM)',
    '\nAll fall on the SAME LINE - the problem is STRUCTURAL',
])
print(model_type_report)


=== MODEL TYPE ANALYSIS ===

All submissions fall on the same CV-LB line (R² = 0.98)
This means ALL model types have the same extrapolation error!

Model types tried:
- MLP (various architectures)
- LightGBM
- XGBoost
- GP (Gaussian Process)
- Ridge Regression
- Ensembles (GP+MLP+LGBM)

All fall on the SAME LINE - the problem is STRUCTURAL


In [5]:
# What would it take to reach the target? Three what-if scenarios on the fitted line.
print('\n=== PATH TO TARGET ===')

best_cv = df['cv'].min()
best_lb = df['lb'].min()
lb_gap = best_lb - target_lb  # absolute distance from the best LB to the target
print(
    f'\nCurrent best: CV={best_cv:.4f}, LB={best_lb:.4f}',
    f'Target: LB={target_lb}',
    f'Gap: {lb_gap:.4f} ({lb_gap / target_lb * 100:.1f}% above target)',
    sep='\n',
)

# Option 1: keep the fitted line and solve it for the CV that lands on the target.
print('\nOption 1: Improve CV (stay on same line)')
required_cv = (target_lb - intercept) / slope
cv_cut_pct = (best_cv - required_cv) / best_cv * 100
print(
    f'  Required CV: {required_cv:.6f}',
    f'  Current CV: {best_cv:.6f}',
    f'  Improvement needed: {cv_cut_pct:.1f}%',
    sep='\n',
)

# Option 2: same slope and best CV, but with a lower, hypothetical intercept.
hypothetical_intercept = 0.04  # the message text below spells out this same value
print('\nOption 2: Reduce intercept (change the relationship)')
print(f'  Current intercept: {intercept:.4f}')
print('  If we could reduce intercept to 0.04, with same CV:')
new_lb = slope * best_cv + hypothetical_intercept
print(f'  New LB would be: {new_lb:.4f}')

# Option 3: same intercept and best CV, but with a flatter, hypothetical slope.
hypothetical_slope = 3.0  # the message text below spells out this same value
print('\nOption 3: Reduce slope (better generalization)')
print(f'  Current slope: {slope:.4f}')
print(f'  If slope was 3.0 (instead of {slope:.2f}):')
new_lb = hypothetical_slope * best_cv + intercept
print(f'  New LB would be: {new_lb:.4f}')


=== PATH TO TARGET ===

Current best: CV=0.0083, LB=0.0877
Target: LB=0.07304
Gap: 0.0147 (20.1% above target)

Option 1: Improve CV (stay on same line)
  Required CV: 0.004665
  Current CV: 0.008300
  Improvement needed: 43.8%

Option 2: Reduce intercept (change the relationship)
  Current intercept: 0.0533
  If we could reduce intercept to 0.04, with same CV:
  New LB would be: 0.0751

Option 3: Reduce slope (better generalization)
  Current slope: 4.2312
  If slope was 3.0 (instead of 4.23):
  New LB would be: 0.0782


In [6]:
# What strategies could change the CV-LB relationship?
# Static notes only: the list is kept as data and emitted with one print call.
# Empty strings produce the blank separator lines.
strategy_notes = [
    '\n=== STRATEGIES TO CHANGE CV-LB RELATIONSHIP ===',
    '\n1. PUBLIC KERNEL ARCHITECTURES (not yet fully implemented):',
    '   - "best-work-here": 4-model ensemble (CatBoost+XGBoost+LightGBM+NN)',
    '   - SE attention blocks for feature recalibration',
    '   - Adaptive per-fold weight optimization',
    '   - Power-weighted ensemble (weights^2.5)',
    '   - Non-linear mixture mixing: A*(1-r) + B*r + 0.05*A*B*r*(1-r)',
    '',
    '   - "mixall": GroupKFold(5) instead of Leave-One-Out',
    '   - 4-model ensemble (MLP+XGBoost+RF+LightGBM)',
    '   - Optuna hyperparameter optimization',
    '',
    '2. DISTRIBUTION SHIFT STRATEGIES:',
    '   - Extrapolation detection features',
    '   - Uncertainty-weighted predictions',
    '   - Conservative predictions for OOD samples',
    '',
    '3. FEATURE ENGINEERING:',
    '   - Non-linear mixture features (already tried, marginal improvement)',
    '   - Spange-only features (FAILED - 104% worse)',
    '   - Combined Spange+DRFP+ACS (BEST so far)',
]
print('\n'.join(strategy_notes))


=== STRATEGIES TO CHANGE CV-LB RELATIONSHIP ===

1. PUBLIC KERNEL ARCHITECTURES (not yet fully implemented):
   - "best-work-here": 4-model ensemble (CatBoost+XGBoost+LightGBM+NN)
   - SE attention blocks for feature recalibration
   - Adaptive per-fold weight optimization
   - Power-weighted ensemble (weights^2.5)
   - Non-linear mixture mixing: A*(1-r) + B*r + 0.05*A*B*r*(1-r)

   - "mixall": GroupKFold(5) instead of Leave-One-Out
   - 4-model ensemble (MLP+XGBoost+RF+LightGBM)
   - Optuna hyperparameter optimization

2. DISTRIBUTION SHIFT STRATEGIES:
   - Extrapolation detection features
   - Uncertainty-weighted predictions
   - Conservative predictions for OOD samples

3. FEATURE ENGINEERING:
   - Non-linear mixture features (already tried, marginal improvement)
   - Spange-only features (FAILED - 104% worse)
   - Combined Spange+DRFP+ACS (BEST so far)


In [7]:
# Key insight from exp_054 (Spange-only)
# Hand-written findings: the figures are literal text and are not computed in this cell.
insight_lines = (
    '\n=== KEY INSIGHT FROM EXP_054 (SPANGE-ONLY) ===',
    '\nSpange-only features (18 features) performed 104% WORSE than combined (145 features)',
    '- Single Solvent MSE: 0.009549 (16.5% worse)',
    '- Full Data MSE: 0.020592 (151% worse!)',
    '',
    'CONCLUSION: DRFP and ACS PCA features are ESSENTIAL for mixture predictions!',
    "The public kernels' success is NOT due to simpler features.",
    "It's due to their ARCHITECTURE (4-model ensemble, SE attention, adaptive weights).",
)
print(*insight_lines, sep='\n')


=== KEY INSIGHT FROM EXP_054 (SPANGE-ONLY) ===

Spange-only features (18 features) performed 104% WORSE than combined (145 features)
- Single Solvent MSE: 0.009549 (16.5% worse)
- Full Data MSE: 0.020592 (151% worse!)

CONCLUSION: DRFP and ACS PCA features are ESSENTIAL for mixture predictions!
The public kernels' success is NOT due to simpler features.
It's due to their ARCHITECTURE (4-model ensemble, SE attention, adaptive weights).


In [8]:
# Recommended next steps
# Static recommendations, printed one line at a time. Empty strings are blank lines.
next_step_lines = (
    '\n=== RECOMMENDED NEXT STEPS ===',
    '\n1. IMPLEMENT "best-work-here" ARCHITECTURE with our 145 features:',
    '   - 4-model ensemble: CatBoost + XGBoost + LightGBM + MLP',
    '   - SE attention blocks in MLP',
    '   - Adaptive per-fold weight optimization',
    '   - Power-weighted ensemble (weights^2.5)',
    '   - Non-linear mixture mixing',
    '',
    '2. ALTERNATIVE: Try GroupKFold(5) validation',
    '   - More training data per fold',
    '   - May have different CV-LB relationship',
    '   - Submit to verify if it changes the intercept',
    '',
    '3. FOCUS ON REDUCING INTERCEPT, not just improving CV',
    '   - All model types fall on the same line',
    '   - Need fundamentally different approach to change the relationship',
)
for step_line in next_step_lines:
    print(step_line)


=== RECOMMENDED NEXT STEPS ===

1. IMPLEMENT "best-work-here" ARCHITECTURE with our 145 features:
   - 4-model ensemble: CatBoost + XGBoost + LightGBM + MLP
   - SE attention blocks in MLP
   - Adaptive per-fold weight optimization
   - Power-weighted ensemble (weights^2.5)
   - Non-linear mixture mixing

2. ALTERNATIVE: Try GroupKFold(5) validation
   - More training data per fold
   - May have different CV-LB relationship
   - Submit to verify if it changes the intercept

3. FOCUS ON REDUCING INTERCEPT, not just improving CV
   - All model types fall on the same line
   - Need fundamentally different approach to change the relationship


In [9]:
# Summary
# Recap of the fitted CV-LB line, the gap to the target, and the CV needed to close it.
#
# required_cv is recomputed here from the same slope and intercept that are printed
# below. It was previously read from whatever an earlier cell last left behind, and
# one of those assignments sits inside an `if` branch. Recomputing keeps the summary
# self-consistent when cells are re-run individually.
required_cv = (target_lb - intercept) / slope
best_cv = df['cv'].min()
best_lb = df['lb'].min()
rule = '=' * 70

print('\n' + rule)
print('LOOP 53 ANALYSIS SUMMARY')
print(rule)
print(f'\nCV-LB Relationship: LB = {slope:.4f} * CV + {intercept:.4f} (R² = {r_value**2:.4f})')
print(f'Target: {target_lb}')
print(f'Best LB: {best_lb:.4f}')
# Gap is relative to the target, in percent.
print(f'Gap to target: {(best_lb - target_lb) / target_lb * 100:.1f}%')
print(f'\nRequired CV for target: {required_cv:.6f}')
print(f'Current best CV: {best_cv:.6f}')
# Relative reduction of the best CV needed to reach required_cv, in percent.
print(f'CV improvement needed: {(best_cv - required_cv) / best_cv * 100:.1f}%')
print('\nKEY FINDING: Spange-only features FAILED (104% worse)')
print('DRFP + ACS PCA features are ESSENTIAL for mixture predictions')
print('\nNEXT PRIORITY: Implement "best-work-here" architecture with our features')
print(rule)


LOOP 53 ANALYSIS SUMMARY

CV-LB Relationship: LB = 4.2312 * CV + 0.0533 (R² = 0.9807)
Target: 0.07304
Best LB: 0.0877
Gap to target: 20.1%

Required CV for target: 0.004665
Current best CV: 0.008300
CV improvement needed: 43.8%

KEY FINDING: Spange-only features FAILED (104% worse)
DRFP + ACS PCA features are ESSENTIAL for mixture predictions

NEXT PRIORITY: Implement "best-work-here" architecture with our features
