# Loop 60 Analysis: Post-TabNet Strategy Review

**Key Finding**: TabNet failed dramatically (CV 0.036642 vs best 0.008194 - 347% worse)

**Critical Question**: What approaches can CHANGE the CV-LB relationship?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Submission history: one (experiment id, CV score, LB score) triple per
# submission, expanded into the list-of-dicts form the DataFrame is built from.
submissions = [
    dict(exp=exp_id, cv=cv_score, lb=lb_score)
    for exp_id, cv_score, lb_score in (
        ('exp_000', 0.0111, 0.0982),
        ('exp_001', 0.0123, 0.1065),
        ('exp_003', 0.0105, 0.0972),
        ('exp_005', 0.0104, 0.0969),
        ('exp_006', 0.0097, 0.0946),
        ('exp_007', 0.0093, 0.0932),
        ('exp_009', 0.0092, 0.0936),
        ('exp_012', 0.0090, 0.0913),
        ('exp_024', 0.0087, 0.0893),
        ('exp_026', 0.0085, 0.0887),
        ('exp_030', 0.0083, 0.0877),
        ('exp_041', 0.0090, 0.0932),
        ('exp_042', 0.0145, 0.1147),
    )
]

df = pd.DataFrame(submissions, columns=['exp', 'cv', 'lb'])
print('Submission History:')
print(df.to_string())

# Row labels of the lowest-CV and lowest-LB submissions (first one wins on ties).
best_cv_idx = df['cv'].idxmin()
best_lb_idx = df['lb'].idxmin()
print(f'\nBest CV: {df.at[best_cv_idx, "cv"]:.4f} ({df.at[best_cv_idx, "exp"]})')
print(f'Best LB: {df.at[best_lb_idx, "lb"]:.4f} ({df.at[best_lb_idx, "exp"]})')

Submission History:
        exp      cv      lb
0   exp_000  0.0111  0.0982
1   exp_001  0.0123  0.1065
2   exp_003  0.0105  0.0972
3   exp_005  0.0104  0.0969
4   exp_006  0.0097  0.0946
5   exp_007  0.0093  0.0932
6   exp_009  0.0092  0.0936
7   exp_012  0.0090  0.0913
8   exp_024  0.0087  0.0893
9   exp_026  0.0085  0.0887
10  exp_030  0.0083  0.0877
11  exp_041  0.0090  0.0932
12  exp_042  0.0145  0.1147

Best CV: 0.0083 (exp_030)
Best LB: 0.0877 (exp_030)


In [2]:
# Analyze CV-LB relationship
# Fit a least-squares line LB = slope*CV + intercept through the submissions in
# `df`, then check whether the target LB is reachable by lowering CV alone.
# The target is named once so the printed target, the gap and the required-CV
# calculation can never disagree with each other.
TARGET_LB = 0.0347  # target LB score

slope, intercept, r_value, p_value, std_err = stats.linregress(df['cv'], df['lb'])
print(f'CV-LB Relationship: LB = {slope:.2f}×CV + {intercept:.4f}')
print(f'R² = {r_value**2:.4f}')
print(f'\nTarget LB: {TARGET_LB:.4f}')
print(f'Intercept: {intercept:.4f}')
print(f'Gap: Intercept - Target = {intercept - TARGET_LB:.4f}')

# Required CV to hit target: invert the fitted line at LB == TARGET_LB.
required_cv = (TARGET_LB - intercept) / slope
print(f'\nRequired CV to hit target: {required_cv:.6f}')
if required_cv < 0:
    # A negative CV score cannot be achieved, so the line never reaches the target.
    print('IMPOSSIBLE: Required CV is negative!')
else:
    best_cv = df['cv'].min()
    print(f'Required CV improvement: {(best_cv - required_cv) / best_cv * 100:.1f}%')

CV-LB Relationship: LB = 4.23×CV + 0.0533
R² = 0.9807

Target LB: 0.0347
Intercept: 0.0533
Gap: Intercept - Target = 0.0186

Required CV to hit target: -0.004396
IMPOSSIBLE: Required CV is negative!


In [3]:
# Analyze residuals from CV-LB relationship
# predicted_lb: LB the fitted line gives for each CV
# residual:     actual LB minus predicted LB (negative = better than predicted)
# residual_pct: residual as a percentage of the predicted LB
df['predicted_lb'] = intercept + slope * df['cv']
df['residual'] = df['lb'].sub(df['predicted_lb'])
df['residual_pct'] = df['residual'].div(df['predicted_lb']).mul(100)

report_cols = ['exp', 'cv', 'lb', 'predicted_lb', 'residual', 'residual_pct']
print('Residual Analysis (negative = better than predicted):')
print(df.loc[:, report_cols].sort_values(by='residual').to_string())

# Row labels of the most negative and most positive residuals.
best_idx = df['residual'].idxmin()
worst_idx = df['residual'].idxmax()
print(f'\nBest generalization (lowest residual): {df.at[best_idx, "exp"]}')
print(f'Worst generalization (highest residual): {df.at[worst_idx, "exp"]}')

Residual Analysis (negative = better than predicted):
        exp      cv      lb  predicted_lb  residual  residual_pct
0   exp_000  0.0111  0.0982      0.100269 -0.002069     -2.062964
8   exp_024  0.0087  0.0893      0.090114 -0.000814     -0.902889
10  exp_030  0.0083  0.0877      0.088421 -0.000721     -0.815582
9   exp_026  0.0085  0.0887      0.089267 -0.000567     -0.635603
2   exp_003  0.0105  0.0972      0.097730 -0.000530     -0.542091
3   exp_005  0.0104  0.0969      0.097307 -0.000407     -0.417920
7   exp_012  0.0090  0.0913      0.091383 -0.000083     -0.090811
12  exp_042  0.0145  0.1147      0.114655  0.000045      0.039615
4   exp_006  0.0097  0.0946      0.094345  0.000255      0.270471
5   exp_007  0.0093  0.0932      0.092652  0.000548      0.591085
1   exp_001  0.0123  0.1065      0.105346  0.001154      1.095494
6   exp_009  0.0092  0.0936      0.092229  0.001371      1.486269
11  exp_041  0.0090  0.0932      0.091383  0.001817      1.988351

Best generalization (lowest residual): exp_000
Worst generalization (highest residual): exp_041

In [4]:
# What would we need to change the CV-LB relationship?
# The headline numbers are formatted from the fitted `slope` / `intercept`
# (produced by the linregress cell) instead of being typed in by hand, so this
# summary cannot go stale when the submission table changes.
TARGET_LB = 0.0347  # target LB score; same value the regression cell compares against
intercept_cut_pct = (intercept - TARGET_LB) / intercept * 100

print('=== STRATEGIC ANALYSIS ===')
print(f'\nCurrent CV-LB relationship: LB = {slope:.2f}×CV + {intercept:.4f}')
print(f'Target: {TARGET_LB:.4f}')
print('\nTo reach target, we need ONE of:')
print(f'1. Reduce intercept from {intercept:.4f} to < {TARGET_LB:.4f} ({intercept_cut_pct:.0f}% reduction)')
print(f'2. Reduce slope from {slope:.2f} to < 0 (impossible - would mean better CV = worse LB)')
print('3. Find an approach that BREAKS the current CV-LB relationship')

# NOTE(review): the header below says 60 experiments, but the list includes
# exp_061 - confirm the intended count.
print('\n=== APPROACHES TRIED (60 experiments) ===')
# (approach family, comma-separated experiment ids); an experiment id may be
# listed under more than one family.
approaches_tried = [
    ('MLP variants', 'exp_000, exp_004, exp_005, exp_006, exp_007, exp_008, exp_010, exp_017'),
    ('LightGBM', 'exp_001, exp_002'),
    ('XGBoost', 'exp_041'),
    ('CatBoost', 'exp_047'),
    ('GP', 'exp_030, exp_031, exp_032, exp_044'),
    ('Ridge', 'exp_009, exp_033, exp_034, exp_049'),
    ('KNN', 'exp_040'),
    ('GNN', 'exp_051, exp_056'),
    ('ChemBERTa', 'exp_052'),
    ('TabNet', 'exp_061'),
    ('Ensembles', 'exp_011, exp_012, exp_013, exp_028, exp_030, exp_041, exp_045, exp_050'),
    ('Feature engineering', 'exp_003, exp_018, exp_019, exp_023, exp_024, exp_039, exp_044, exp_046, exp_048'),
    ('Per-target models', 'exp_025, exp_053'),
    ('Per-solvent-type models', 'exp_054'),
    ('Hyperparameter optimization', 'exp_055'),
    ('Physical constraints', 'exp_059'),
    ('GroupKFold CV', 'exp_042'),
]
for approach, exps in approaches_tried:
    print(f'- {approach}: {exps}')

=== STRATEGIC ANALYSIS ===

Current CV-LB relationship: LB = 4.23×CV + 0.0533
Target: 0.0347

To reach target, we need ONE of:
1. Reduce intercept from 0.0533 to < 0.0347 (35% reduction)
2. Reduce slope from 4.23 to < 0 (impossible - would mean better CV = worse LB)
3. Find an approach that BREAKS the current CV-LB relationship

=== APPROACHES TRIED (60 experiments) ===
- MLP variants: exp_000, exp_004, exp_005, exp_006, exp_007, exp_008, exp_010, exp_017
- LightGBM: exp_001, exp_002
- XGBoost: exp_041
- CatBoost: exp_047
- GP: exp_030, exp_031, exp_032, exp_044
- Ridge: exp_009, exp_033, exp_034, exp_049
- KNN: exp_040
- GNN: exp_051, exp_056
- ChemBERTa: exp_052
- TabNet: exp_061
- Ensembles: exp_011, exp_012, exp_013, exp_028, exp_030, exp_041, exp_045, exp_050
- Feature engineering: exp_003, exp_018, exp_019, exp_023, exp_024, exp_039, exp_044, exp_046, exp_048
- Per-target models: exp_025, exp_053
- Per-solvent-type models: exp_054
- Hyperparameter optimization: exp_055
- Physical constraints: exp_059
- GroupKFold CV: exp_042

In [5]:
# What HASN'T been tried?
# Each print emits its header followed by one list entry per line.
print(
    '\n=== APPROACHES NOT YET TRIED ===',
    '1. Quantile regression (predict distribution, not point estimate)',
    '2. Conformal prediction (uncertainty quantification)',
    '3. Domain adaptation (explicitly model train-test shift)',
    '4. Adversarial training (make model robust to distribution shift)',
    '5. Bayesian neural networks (uncertainty-aware predictions)',
    '6. Mixture of experts (different models for different regions)',
    '7. Gradient-based meta-learning (MAML for few-shot generalization)',
    '8. Self-training / pseudo-labeling (use test predictions to improve)',
    '9. Temperature scaling (calibrate predictions)',
    '10. Ensemble selection (select best subset of models)',
    sep='\n',
)

print(
    '\n=== MOST PROMISING APPROACHES ===',
    '1. Quantile regression - could provide better uncertainty estimates',
    '2. Temperature scaling - simple calibration method',
    '3. Ensemble selection - find optimal subset of diverse models',
    '4. Bayesian optimization of ensemble weights - find optimal weights',
    sep='\n',
)


=== APPROACHES NOT YET TRIED ===
1. Quantile regression (predict distribution, not point estimate)
2. Conformal prediction (uncertainty quantification)
3. Domain adaptation (explicitly model train-test shift)
4. Adversarial training (make model robust to distribution shift)
5. Bayesian neural networks (uncertainty-aware predictions)
6. Mixture of experts (different models for different regions)
7. Gradient-based meta-learning (MAML for few-shot generalization)
8. Self-training / pseudo-labeling (use test predictions to improve)
9. Temperature scaling (calibrate predictions)
10. Ensemble selection (select best subset of models)

=== MOST PROMISING APPROACHES ===
1. Quantile regression - could provide better uncertainty estimates
2. Temperature scaling - simple calibration method
3. Ensemble selection - find optimal subset of diverse models
4. Bayesian optimization of ensemble weights - find optimal weights


In [6]:
# Analyze what makes exp_030 the best
# Recorded facts about the exp_030 ensemble, one printed line per entry.
print(
    '=== ANALYSIS OF BEST MODEL (exp_030) ===',
    'Model: GP (0.15) + MLP (0.55) + LGBM (0.30)',
    'CV: 0.0083',
    'LB: 0.0877',
    'LB/CV ratio: 10.57x',
    '\nKey characteristics:',
    '- GP provides uncertainty-aware predictions',
    '- MLP captures non-linear patterns',
    '- LGBM provides tree-based diversity',
    '- Weighted ensemble combines strengths',
    sep='\n',
)

print(
    '\n=== WHAT COULD IMPROVE IT? ===',
    '1. Better ensemble weights (Bayesian optimization)',
    '2. More diverse base models',
    '3. Stacking with meta-learner',
    '4. Post-hoc calibration',
    '5. Uncertainty-based weighting',
    sep='\n',
)

=== ANALYSIS OF BEST MODEL (exp_030) ===
Model: GP (0.15) + MLP (0.55) + LGBM (0.30)
CV: 0.0083
LB: 0.0877
LB/CV ratio: 10.57x

Key characteristics:
- GP provides uncertainty-aware predictions
- MLP captures non-linear patterns
- LGBM provides tree-based diversity
- Weighted ensemble combines strengths

=== WHAT COULD IMPROVE IT? ===
1. Better ensemble weights (Bayesian optimization)
2. More diverse base models
3. Stacking with meta-learner
4. Post-hoc calibration
5. Uncertainty-based weighting


In [7]:
# Calculate what LB we could achieve with different intercepts
# Sensitivity of the predicted LB (at the best CV seen so far) to hypothetical
# changes in the fitted line's intercept or slope. `slope` and `intercept`
# come from the linregress cell; `df` is the submission table.
TARGET_LB = 0.0347  # target LB score; same value the regression cell compares against
best_cv = df['cv'].min()  # loop-invariant, so computed once up front

print('=== SENSITIVITY ANALYSIS ===')
print('\nIf we could reduce the intercept:')
for new_intercept in [0.05, 0.045, 0.04, 0.035, 0.03]:
    new_lb = slope * best_cv + new_intercept
    print(f'Intercept {new_intercept:.3f} -> LB {new_lb:.4f} (target: {TARGET_LB:.4f})')

print('\nIf we could reduce the slope:')
for new_slope in [4.0, 3.5, 3.0, 2.5, 2.0]:
    new_lb = new_slope * best_cv + intercept
    print(f'Slope {new_slope:.1f} -> LB {new_lb:.4f} (target: {TARGET_LB:.4f})')

# The best CV in the header is formatted from the data rather than typed in,
# so the text always matches the value used in the calculation below.
print(f'\nTo hit target {TARGET_LB:.4f} with current best CV ({best_cv:.4f}):')
required_intercept = TARGET_LB - slope * best_cv
print(f'Required intercept: {required_intercept:.4f} (current: {intercept:.4f})')
print(f'Intercept reduction needed: {(intercept - required_intercept) / intercept * 100:.1f}%')

=== SENSITIVITY ANALYSIS ===

If we could reduce the intercept:
Intercept 0.050 -> LB 0.0851 (target: 0.0347)
Intercept 0.045 -> LB 0.0801 (target: 0.0347)
Intercept 0.040 -> LB 0.0751 (target: 0.0347)
Intercept 0.035 -> LB 0.0701 (target: 0.0347)
Intercept 0.030 -> LB 0.0651 (target: 0.0347)

If we could reduce the slope:
Slope 4.0 -> LB 0.0865 (target: 0.0347)
Slope 3.5 -> LB 0.0824 (target: 0.0347)
Slope 3.0 -> LB 0.0782 (target: 0.0347)
Slope 2.5 -> LB 0.0741 (target: 0.0347)
Slope 2.0 -> LB 0.0699 (target: 0.0347)

To hit target 0.0347 with current best CV (0.0083):
Required intercept: -0.0004 (current: 0.0533)
Intercept reduction needed: 100.8%


In [8]:
# Final recommendations
# Closing summary, one printed line per entry.
print(
    '=== FINAL RECOMMENDATIONS ===',
    '\n1. DO NOT submit TabNet (CV 0.036642 is 347% worse)',
    '\n2. Focus on approaches that could CHANGE the CV-LB relationship:',
    '   - Quantile regression (different loss function)',
    '   - Temperature scaling (post-hoc calibration)',
    '   - Bayesian optimization of ensemble weights',
    '\n3. With only 3 submissions remaining:',
    '   - Save at least 1 for final attempt',
    '   - Only submit if we find a fundamentally different approach',
    '   - The goal is to reduce the intercept, not just minimize CV',
    sep='\n',
)

print(
    '\n=== CRITICAL INSIGHT ===',
    'The CV-LB relationship has intercept 0.0533 > target 0.0347',
    'This means CV minimization alone CANNOT reach the target',
    'We need to find an approach that BREAKS this relationship',
    sep='\n',
)

=== FINAL RECOMMENDATIONS ===

1. DO NOT submit TabNet (CV 0.036642 is 347% worse)

2. Focus on approaches that could CHANGE the CV-LB relationship:
   - Quantile regression (different loss function)
   - Temperature scaling (post-hoc calibration)
   - Bayesian optimization of ensemble weights

3. With only 3 submissions remaining:
   - Save at least 1 for final attempt
   - Only submit if we find a fundamentally different approach
   - The goal is to reduce the intercept, not just minimize CV

=== CRITICAL INSIGHT ===
The CV-LB relationship has intercept 0.0533 > target 0.0347
This means CV minimization alone CANNOT reach the target
We need to find an approach that BREAKS this relationship
