# Loop 25 Analysis: Submission Decision

**Goal**: Analyze the current state and decide whether to submit exp_024

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Submission history: one (experiment id, CV score, LB score) triple per
# leaderboard submission, expanded into the record dicts used downstream.
_SUBMISSION_FIELDS = ('exp', 'cv', 'lb')
_submission_rows = [
    ('exp_000', 0.011081, 0.09816),
    ('exp_001', 0.012297, 0.10649),
    ('exp_003', 0.010501, 0.09719),
    ('exp_005', 0.010430, 0.09691),
    ('exp_006', 0.009749, 0.09457),
    ('exp_007', 0.009262, 0.09316),
    ('exp_009', 0.009192, 0.09364),
    ('exp_012', 0.009004, 0.09134),
]
submissions = [dict(zip(_SUBMISSION_FIELDS, row)) for row in _submission_rows]

# Tabular view of the same records; column order follows the dict key order.
df = pd.DataFrame(submissions)
print('=== Submission History ===', df.to_string(index=False), sep='\n')

=== Submission History ===
    exp       cv      lb
exp_000 0.011081 0.09816
exp_001 0.012297 0.10649
exp_003 0.010501 0.09719
exp_005 0.010430 0.09691
exp_006 0.009749 0.09457
exp_007 0.009262 0.09316
exp_009 0.009192 0.09364
exp_012 0.009004 0.09134


In [2]:
# Fit LB as a straight-line function of CV over every past submission,
# then use that line to estimate the LB score of exp_024 from its CV.
from scipy.stats import linregress

cv_lb_fit = linregress(df['cv'], df['lb'])
slope, intercept, r_value, p_value, std_err = cv_lb_fit

print('\n'.join([
    '\n=== CV-LB Linear Relationship ===',
    f'LB = {slope:.4f} * CV + {intercept:.4f}',
    f'R² = {r_value**2:.4f}',
    f'Std Error (slope): {std_err:.4f}',
]))

# Point estimate for exp_024 from the fitted line.
cv_exp024 = 0.008689
predicted_lb = slope * cv_exp024 + intercept

# Relative gain over the best LB so far (0.0913, exp_012), in percent.
expected_gain_pct = (0.0913 - predicted_lb) / 0.0913 * 100

print('\n'.join([
    '\n=== Prediction for exp_024 ===',
    f'CV: {cv_exp024:.6f}',
    f'Predicted LB: {predicted_lb:.4f}',
    'Best LB so far: 0.0913 (exp_012)',
    f'Expected improvement: {expected_gain_pct:.2f}%',
]))


=== CV-LB Linear Relationship ===
LB = 4.0432 * CV + 0.0552
R² = 0.9461
Std Error (slope): 0.3941

=== Prediction for exp_024 ===
CV: 0.008689
Predicted LB: 0.0904
Best LB so far: 0.0913 (exp_012)
Expected improvement: 1.02%


In [3]:
# 95% prediction interval for the exp_024 LB estimate.
# The variance factor carries the leading "1 +" term, so this is the interval
# for a single new observation (a prediction interval), not the narrower
# confidence interval for the mean response. The printed "95% CI" label is
# kept unchanged for output compatibility.
from scipy.stats import t

n = len(df)
dof = n - 2  # two parameters (slope, intercept) were estimated
x_mean = df['cv'].mean()
ss_x = (df['cv'] - x_mean).pow(2).sum()

# Residual variance of the fitted line.
fitted_lb = slope * df['cv'] + intercept
residuals = df['lb'] - fitted_lb
mse = residuals.pow(2).sum() / dof

# Standard error of a new prediction at CV = cv_exp024.
variance_factor = 1 + 1/n + (cv_exp024 - x_mean)**2 / ss_x
se_pred = np.sqrt(mse * variance_factor)

# Two-sided 95% bounds from the Student t distribution.
t_crit = t.ppf(0.975, dof)
half_width = t_crit * se_pred
lb_lower = predicted_lb - half_width
lb_upper = predicted_lb + half_width

print('\n'.join([
    '\n=== 95% Prediction Interval for exp_024 ===',
    f'Predicted LB: {predicted_lb:.4f}',
    f'95% CI: [{lb_lower:.4f}, {lb_upper:.4f}]',
    f'\nInterpretation: LB could be anywhere from {lb_lower:.4f} to {lb_upper:.4f}',
]))


=== 95% Prediction Interval for exp_024 ===
Predicted LB: 0.0904
95% CI: [0.0870, 0.0937]

Interpretation: LB could be anywhere from 0.0870 to 0.0937


In [4]:
# Compare exp_024 to exp_012 (best LB) and decide whether to submit.
# Reads `cv_exp024` and `predicted_lb` from the regression cell.
exp012_cv = 0.009004  # CV score of exp_012
exp012_lb = 0.0913    # LB score of exp_012, rounded to 4 decimals

print('=== Comparison: exp_024 vs exp_012 ===')
print(f'exp_012: CV {exp012_cv:.6f} -> LB {exp012_lb:.4f}')
print(f'exp_024: CV {cv_exp024:.6f} -> Predicted LB {predicted_lb:.4f}')
print(f'\nCV improvement: {(exp012_cv - cv_exp024) / exp012_cv * 100:.2f}%')
print(f'Predicted LB improvement: {(exp012_lb - predicted_lb) / exp012_lb * 100:.2f}%')

# Check if exp_024 is worth submitting.
# Both conditions are tested explicitly: exp_012's actual LB lies below the
# fitted line, so a slightly better CV does not guarantee that the predicted
# LB beats it. The LB claim must therefore not be printed on the CV check alone.
cv_is_better = cv_exp024 < exp012_cv
lb_is_better = predicted_lb < exp012_lb

print('\n=== Submission Decision ===')
if not cv_is_better:
    print('✗ exp_024 CV is worse than exp_012 - do not submit')
elif not lb_is_better:
    print(f'✓ exp_024 CV ({cv_exp024:.6f}) is BETTER than exp_012 CV ({exp012_cv:.6f})')
    print(f'✗ Predicted LB ({predicted_lb:.4f}) is NOT better than exp_012 LB ({exp012_lb:.4f})')
    print('\n→ DO NOT SUBMIT: CV improved but predicted LB does not beat exp_012')
else:
    print(f'✓ exp_024 CV ({cv_exp024:.6f}) is BETTER than exp_012 CV ({exp012_cv:.6f})')
    print(f'✓ Predicted LB ({predicted_lb:.4f}) is BETTER than exp_012 LB ({exp012_lb:.4f})')
    print('\n→ RECOMMEND SUBMISSION: exp_024 should improve LB')

=== Comparison: exp_024 vs exp_012 ===
exp_012: CV 0.009004 -> LB 0.0913
exp_024: CV 0.008689 -> Predicted LB 0.0904

CV improvement: 3.50%
Predicted LB improvement: 1.02%

=== Submission Decision ===
✓ exp_024 CV (0.008689) is BETTER than exp_012 CV (0.009004)
✓ Predicted LB (0.0904) is BETTER than exp_012 LB (0.0913)

→ RECOMMEND SUBMISSION: exp_024 should improve LB


In [5]:
# Target analysis: how far the best LB (0.0913) is from the target score,
# and which CV the fitted CV-LB line would need to reach it.
target = 0.017270
gap_ratio = 0.0913 / target

print('\n=== Target Analysis ===')
print(f'Target: {target}')
print('Best LB: 0.0913')
print(f'Gap: {gap_ratio:.2f}x (LB is {gap_ratio:.2f}x worse than target)')

# Invert LB = slope * CV + intercept to solve for the CV that maps to the target.
required_cv = (target - intercept) / slope
print('\nTo reach target with current CV-LB relationship:')
print(f'Required CV: {required_cv:.6f}')

# A negative CV is impossible, so flag that the extrapolation has broken down.
if required_cv < 0:
    negative_cv_notes = (
        f'\n⚠️ WARNING: Required CV is NEGATIVE ({required_cv:.6f})',
        'This suggests the linear relationship may not hold for very low CV values',
        'OR the target may require a fundamentally different approach',
    )
    for note in negative_cv_notes:
        print(note)


=== Target Analysis ===
Target: 0.01727
Best LB: 0.0913
Gap: 5.29x (LB is 5.29x worse than target)

To reach target with current CV-LB relationship:
Required CV: -0.009390

⚠️ WARNING: Required CV is NEGATIVE (-0.009390)
This suggests the linear relationship may not hold for very low CV values
OR the target may require a fundamentally different approach


In [6]:
# Ideas not yet tried, printed as a numbered list in the order given.
unexplored_approaches = [
    'Per-target models (SM vs Products) - Competition explicitly allows this',
    'Stacking meta-learner (learn optimal weights from OOF predictions)',
    'Non-linear mixture encoding (polynomial or learned combination)',
    'Larger ensemble (7-10 models instead of 5)',
    'Different loss functions per target (SM may need different loss)',
]
print('\n=== Unexplored High-Leverage Approaches ===')
for position, approach in enumerate(unexplored_approaches, start=1):
    print(f'{position}. {approach}')

# Ideas already tried, each with its recorded outcome, printed as bullets.
exhausted_approaches = [
    'Attention mechanisms (159% worse)',
    'Fragprints (8.28% worse)',
    'Deep residual networks (5x worse)',
    'Very large ensembles (15+ models, only 0.7% improvement)',
    'Single-layer networks (too simple)',
]
print('\n=== Already Exhausted ===')
for approach in exhausted_approaches:
    print(f'- {approach}')


=== Unexplored High-Leverage Approaches ===
1. Per-target models (SM vs Products) - Competition explicitly allows this
2. Stacking meta-learner (learn optimal weights from OOF predictions)
3. Non-linear mixture encoding (polynomial or learned combination)
4. Larger ensemble (7-10 models instead of 5)
5. Different loss functions per target (SM may need different loss)

=== Already Exhausted ===
- Attention mechanisms (159% worse)
- Fragprints (8.28% worse)
- Deep residual networks (5x worse)
- Very large ensembles (15+ models, only 0.7% improvement)
- Single-layer networks (too simple)
