# Loop 10 Analysis: Post-Target Encoding Assessment

## Key Questions:
1. Why did target encoding HURT performance?
2. What approaches remain unexplored?
3. What's the realistic path forward?

## Evaluator's Key Points:
- Target encoding performed WORSE than label encoding (0.81560 vs 0.81698)
- We've been stuck for 7 experiments without beating exp_003's CV of 0.81951
- exp_003 was likely a lucky Optuna run
- Target score (0.9642) is impossible - top LB is ~0.8066
- Suggests: 10-fold CV + regularization to reduce overfitting

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Submitted experiments only: local CV score vs. LB score for each one.
_columns = ('exp', 'cv', 'lb', 'approach')
_rows = [
    ('exp_000', 0.80674, 0.79705, 'XGBoost baseline'),
    ('exp_003', 0.81951, 0.80453, 'CatBoost Optuna'),
    ('exp_004', 0.81928, 0.80406, 'Threshold tuning'),
    ('exp_006', 0.81709, 0.80102, 'Weighted ensemble'),
]
submissions = [dict(zip(_columns, row)) for row in _rows]

# gap     : absolute difference CV - LB
# gap_pct : that difference as a percentage of the CV score, rounded to 2 dp
df = pd.DataFrame(submissions).assign(
    gap=lambda frame: frame['cv'] - frame['lb'],
    gap_pct=lambda frame: (frame['gap'] / frame['cv'] * 100).round(2),
)

print("Submission History:")
print(df.to_string(index=False))

In [None]:
# Full experiment log (exp_000 .. exp_009), whether or not a run was submitted.
_fields = ('exp', 'cv', 'approach', 'submitted')
_log = [
    ('exp_000', 0.80674, 'XGBoost baseline', True),
    ('exp_001', 0.80927, 'Advanced features', False),
    ('exp_002', 0.81353, '3-model ensemble', False),
    ('exp_003', 0.81951, 'CatBoost Optuna', True),
    ('exp_004', 0.81928, 'Threshold tuning', True),
    ('exp_005', 0.81744, 'Stacking', False),
    ('exp_006', 0.81709, 'Weighted ensemble', True),
    ('exp_007', 0.81617, 'Feature selection', False),
    ('exp_008', 0.81698, 'Multi-seed', False),
    ('exp_009', 0.81560, 'Target encoding', False),
]
experiments = [dict(zip(_fields, entry)) for entry in _log]

exp_df = pd.DataFrame(experiments)
print("\nAll Experiments:")
print(exp_df.to_string(index=False))

# Summary of the CV column: best run (with its approach), median, and range.
cv_column = exp_df['cv']
best_approach = exp_df.loc[cv_column.idxmax(), 'approach']
print(f"\nBest CV: {cv_column.max():.5f} ({best_approach})")
print(f"Median CV: {cv_column.median():.5f}")
print(f"CV range: {cv_column.min():.5f} - {cv_column.max():.5f}")

In [None]:
# Inventory of everything tried so far, grouped by category.
approaches_tried = {
    'Encoding': [
        'Label encoding (baseline)',
        'Target encoding (exp_009)',
    ],
    'Models': [
        'XGBoost',
        'LightGBM',
        'CatBoost',
        '3-model ensemble',
    ],
    'Ensembling': [
        'Simple averaging',
        'Weighted averaging',
        'Stacking (LR meta)',
    ],
    'Hyperparameters': [
        'Optuna tuning (50 trials)',
        'Multi-seed (5 seeds)',
    ],
    'Features': [
        '56 features',
        'Feature selection (32 features)',
        'Cabin regions',
        'Family size',
    ],
    'Validation': [
        '5-fold StratifiedKFold',
    ],
    'Threshold': [
        '0.5 (default)',
        '0.47 (tuned)',
    ],
}

# Assemble the report line by line, then emit it with a single print.
# Each category title is preceded by a blank line; each entry is a bullet.
report_lines = ["Approaches Tried:"]
for category, entries in approaches_tried.items():
    report_lines.append(f"\n{category}:")
    report_lines.extend(f"  - {entry}" for entry in entries)
print("\n".join(report_lines))

In [None]:
# Backlog of ideas not yet attempted (taken from the evaluator's suggestions),
# grouped by category.
approaches_not_tried = {
    'Validation': ['10-fold CV (more stable estimates)',
                   'Adversarial validation (check distribution shift)'],
    'Regularization': ['Reduced depth (6 instead of 8)',
                       'Higher l2_leaf_reg (5-10 instead of 3.52)',
                       'Subsample (0.8)',
                       'Colsample_bylevel (0.8)'],
    'Imputation': ['KNN imputation (mentioned in top solutions)'],
    'Data Augmentation': ['Pseudo-labeling (use confident test predictions)'],
    'Models': ['Neural networks (for diversity)'],
}

print("\nApproaches NOT Yet Tried:")
for group, ideas in approaches_not_tried.items():
    # One print per category: the title line followed by one bullet per idea.
    print(f"\n{group}:", *(f"  - {idea}" for idea in ideas), sep="\n")

In [None]:
# CV-LB relationship analysis
import numpy as np
from scipy import stats

# CV / LB score pairs of the four submitted experiments, paired by position
# (exp_000, exp_003, exp_004, exp_006).
cv_scores = [0.80674, 0.81951, 0.81928, 0.81709]
lb_scores = [0.79705, 0.80453, 0.80406, 0.80102]

# Least-squares line LB = slope * CV + intercept
slope, intercept, r_value, p_value, std_err = stats.linregress(cv_scores, lb_scores)
print("CV-LB Linear Model:")
print(f"  LB = {slope:.3f} * CV + {intercept:.3f}")
# The label previously contained a mis-encoded character ("RÂ²"); the
# intended text is R-squared.
print(f"  R² = {r_value**2:.3f}")
print(f"  p-value = {p_value:.4f}")

# Predict LB for different CV scores using the fitted line
print("\nPredicted LB for different CV scores:")
for cv in [0.815, 0.817, 0.819, 0.820, 0.821, 0.822]:
    predicted_lb = slope * cv + intercept
    print(f"  CV {cv:.3f} -> LB {predicted_lb:.4f}")

# What CV do we need to beat exp_003's LB?  Invert the fitted line at that LB.
target_lb = 0.80453
required_cv = (target_lb - intercept) / slope
# Derive the displayed LB from target_lb so the message cannot drift from it.
print(f"\nTo beat LB {target_lb:.4f}, need CV > {required_cv:.5f}")

In [None]:
# Per-fold CV scores of exp_009 (target encoding) and their dispersion.
fold_scores_exp009 = [0.82404, 0.81196, 0.81484, 0.82336, 0.80380]

# Compute each statistic once and reuse it in the report below.
# np.std is called with its defaults.
fold_mean = np.mean(fold_scores_exp009)
fold_std = np.std(fold_scores_exp009)
fold_low = min(fold_scores_exp009)
fold_high = max(fold_scores_exp009)
fold_spread = fold_high - fold_low

print("exp_009 (Target Encoding) Fold Scores:")
print(f"  Scores: {fold_scores_exp009}")
print(f"  Mean: {fold_mean:.5f}")
print(f"  Std: {fold_std:.5f}")
print(f"  Range: {fold_low:.5f} - {fold_high:.5f}")
# Spread is shown both as an absolute value and relative to the mean.
print(f"  Spread: {fold_spread:.5f} ({fold_spread / fold_mean * 100:.2f}%)")

# Reference values from earlier experiments are hard-coded here; only
# exp_009's std is computed from the scores above.
print("\nTypical fold variance from previous experiments:")
for reference_line in (
    "  exp_003: std = 0.00685",
    "  exp_004: std = 0.00762",
    "  exp_008: std = 0.00498 (seed 42)",
    f"  exp_009: std = {fold_std:.5f}",
):
    print(reference_line)
print("\nexp_009 has HIGH fold variance - suggests instability")

In [None]:
# exp_003's CV of 0.81951 is treated as an outlier: the multi-seed runs put
# the baseline at roughly 0.817. `required_cv` comes from the CV-LB
# regression cell and must already be defined.
baseline_cv = 0.817
gap_to_required = required_cv - baseline_cv

insight_lines = [
    "=== CRITICAL INSIGHT ===",
    "\nexp_003's CV of 0.81951 appears to be an outlier:",
    "  - Multi-seed analysis (5 seeds) gave max CV = 0.81629",
    "  - Multi-seed ensemble CV = 0.81698",
    "  - exp_003 was Optuna-tuned (50 trials) - may have overfit to CV folds",
    "\nTrue baseline is ~0.817 CV, not 0.82",
    "\nTo beat exp_003's LB of 0.8045:",
    f"  - Need CV > {required_cv:.5f} (based on linear model)",
    # Gap is reported in absolute terms and as a percentage of the baseline.
    f"  - Gap from true baseline: {gap_to_required:.5f} (+{gap_to_required / baseline_cv * 100:.2f}%)",
    "\nThis is a SIGNIFICANT gap that requires a fundamentally different approach.",
]
print("\n".join(insight_lines))

In [None]:
# Evaluator's recommendation (10-fold CV + regularization), emitted as a
# single block of text. Empty entries produce the blank separator lines.
recommendation_text = "\n".join([
    "=== EVALUATOR'S RECOMMENDATION ===",
    "",
    "1. Use 10-fold CV (instead of 5-fold):",
    "   - More stable estimates",
    "   - Less variance between folds",
    "   - Top kernels use this",
    "",
    "2. Increase regularization:",
    "   - Reduce depth from 8 to 6",
    "   - Increase l2_leaf_reg from 3.52 to 5-10",
    "   - Add subsample (0.8) and colsample_bylevel (0.8)",
    "",
    "3. Use the best feature set (56 features with label encoding)",
    "",
    "Rationale:",
    "  - High fold variance (2.02% spread) suggests overfitting",
    "  - CV-LB gap is increasing (1.19% -> 1.97%)",
    "  - Focus on reducing overfitting, not maximizing CV",
])
print(recommendation_text)

In [None]:
# Alternative approaches to consider, as (title, bullet points) pairs.
# Numbering is generated from list position when printing.
alternatives = [
    ("KNN Imputation (from top solutions)", [
        "Our current imputation is simple (mode/median)",
        "KNN imputation may capture more complex patterns",
        "Could improve data quality",
    ]),
    ("Pseudo-labeling", [
        "Use confident test predictions to augment training data",
        "Can help with distribution shift",
        "Risk: may amplify errors if predictions are wrong",
    ]),
    ("Neural Networks", [
        "For diversity in ensembling",
        "May capture different patterns than tree models",
        "Risk: may not work well on small tabular data",
    ]),
    ("Adversarial Validation", [
        "Check if train/test distributions differ",
        "Identify features that cause distribution shift",
        "Could explain increasing CV-LB gap",
    ]),
]

print("=== ALTERNATIVE APPROACHES ===")
for number, (title, points) in enumerate(alternatives, start=1):
    # Blank line, then the numbered heading, then one indented bullet each.
    print(f"\n{number}. {title}:")
    for point in points:
        print(f"   - {point}")

In [None]:
# Reality check on target score.
# The three reference scores are named once so every derived figure below
# (accuracy percentages, gap to top) is computed rather than typed by hand.
target_score = 0.9642  # stated target score
top_lb = 0.8066        # approximate top LB score
best_lb = 0.8045       # our best LB score, to 4 decimals
gap_to_top = top_lb - best_lb

print("=== REALITY CHECK ===")
print(f"\nTarget score: {target_score:.4f}")
print(f"Top LB score: ~{top_lb:.4f}")
print(f"Our best LB: {best_lb:.4f}")
print(f"\nThe target of {target_score:.4f} is IMPOSSIBLE.")
# Fix: this line used to state that the top LB (~0.8066) "is 96.4% accuracy".
# 0.8066 is 80.7%; 96.4% is what the target score corresponds to.
print(f"Top LB is ~{top_lb:.4f}, which is {top_lb:.1%} accuracy (the target would require {target_score:.1%}).")
print(f"Our best ({best_lb:.4f}) is already in the top ~7%.")
# Gap is shown in absolute terms and relative to the top LB score.
print(f"\nGap to top: {gap_to_top:.4f} ({gap_to_top / top_lb:.2%})")
print("\nThis is a very small gap. We're already competitive.")
print("\nFocus should be on:")
print("  1. Reducing overfitting (CV-LB gap)")
print("  2. Incremental improvements toward 0.81 LB")
print("  3. NOT chasing an impossible target")