# Loop 4 LB Feedback Analysis

## CRITICAL: XGBoost FAILED BADLY on LB

**Submission History:**
- exp_000: CV 0.8339 → LB 0.7799 (BEST LB, gap: +5.4%)
- exp_001: CV 0.8271 → LB 0.7727 (gap: +5.4%)
- exp_002: CV 0.8361 → LB 0.7703 (gap: +6.6%)
- exp_003: CV 0.8361 → LB 0.7584 (gap: +7.8%) ← WORST LB!

**Key Insight:** XGBoost performed terribly on the LB despite using the same features as exp_000.
Its 27 prediction changes (6.5% of the 418 test predictions) lowered the score on net, i.e. more of them were wrong than right.

**Only 1 submission remaining!**

In [None]:
import pandas as pd
import numpy as np

# Read the four candidate submission files written by the earlier experiments.
# Each frame is expected to carry a 'Survived' column (it is read below).
exp_000 = pd.read_csv('/home/code/submission_candidates/candidate_000.csv')
exp_001 = pd.read_csv('/home/code/submission_candidates/candidate_001.csv')
exp_002 = pd.read_csv('/home/code/submission_candidates/candidate_002.csv')
exp_003 = pd.read_csv('/home/code/submission_candidates/candidate_003.csv')

# Show the class balance of every candidate, labelled with its LB score so
# the distributions can be compared against leaderboard performance.
print("Prediction distributions:")
for label, frame in (
    ("exp_000 (LB 0.7799)", exp_000),
    ("exp_001 (LB 0.7727)", exp_001),
    ("exp_002 (LB 0.7703)", exp_002),
    ("exp_003 (LB 0.7584)", exp_003),
):
    class_counts = frame['Survived'].value_counts().to_dict()
    print(f"{label}: {class_counts}")

In [None]:
# Analyze prediction differences
# Put every experiment's predictions side by side in one frame. Starting from
# a copy of exp_000 keeps its remaining columns (presumably the row
# identifier -- TODO confirm); the raw 'Survived' column is dropped once each
# experiment has its own named column.
# NOTE(review): column assignment aligns on the DataFrame index, so this
# assumes all candidate files list their rows in the same order -- confirm.
all_preds = exp_000.copy()
for name, frame in (
    ('exp_000', exp_000),
    ('exp_001', exp_001),
    ('exp_002', exp_002),
    ('exp_003', exp_003),
):
    all_preds[name] = frame['Survived']
all_preds = all_preds.drop('Survived', axis=1)

# Agreement analysis
# The denominator comes from the data instead of a hard-coded 418 so the
# printed count and the percentage (computed via .mean()) can never disagree.
n_rows = len(all_preds)
print("\nPrediction Agreement Analysis:")
for other in ('exp_001', 'exp_002', 'exp_003'):
    same = all_preds['exp_000'] == all_preds[other]
    print(f"exp_000 vs {other}: {same.sum()}/{n_rows} ({same.mean()*100:.1f}%)")

In [None]:
# LB scores for reference
# Public leaderboard accuracy of each submitted experiment.
lb_scores = {
    'exp_000': 0.7799,
    'exp_001': 0.7727,
    'exp_002': 0.7703,
    'exp_003': 0.7584
}

print("\nLB Score Ranking:")
for exp, score in sorted(lb_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"  {exp}: {score:.4f}")

# Derive best/worst from the scores rather than hard-coding experiment names,
# so the summary stays correct if a score is edited or an experiment is added.
best_exp = max(lb_scores, key=lb_scores.get)
worst_exp = min(lb_scores, key=lb_scores.get)

print(f"\nBest LB: {best_exp} with {lb_scores[best_exp]:.4f}")
print(f"Worst LB: {worst_exp} with {lb_scores[worst_exp]:.4f}")
print(f"Difference: {(lb_scores[best_exp] - lb_scores[worst_exp])*100:.2f}%")

In [None]:
# Majority vote analysis
# With an even number of voters a plain ">= 2" threshold is not a majority:
# every 2-2 split would be forced to class 1. Require a strict majority and
# resolve exact ties with exp_000, the experiment with the best LB score.
voters = ['exp_000', 'exp_001', 'exp_002', 'exp_003']
vote_count = sum(all_preds[name] for name in voters)
half = len(voters) / 2
tie = vote_count == half
strict_majority = (vote_count > half).astype(int)
all_preds['majority_vote'] = strict_majority.where(~tie, all_preds['exp_000']).astype(int)

print("\nMajority Vote Analysis (all 4 experiments):")
print(f"Majority vote distribution: {all_preds['majority_vote'].value_counts().to_dict()}")
print(f"\nAgreement with each experiment:")
# Row count is taken from the frame instead of a hard-coded 418.
n_rows = len(all_preds)
for exp in voters:
    matches = all_preds['majority_vote'] == all_preds[exp]
    print(f"  vs {exp}: {matches.sum()}/{n_rows} ({matches.mean()*100:.1f}%)")

In [None]:
# Best 3 experiments (excluding worst exp_003)
# Three voters cannot tie, so "at least 2 votes" is a true majority here.
top3 = ['exp_000', 'exp_001', 'exp_002']
top3_votes = sum(all_preds[name] for name in top3)
all_preds['majority_top3'] = (top3_votes >= 2).astype(int)

print("\nMajority Vote (Top 3 - excluding exp_003):")
print(f"Distribution: {all_preds['majority_top3'].value_counts().to_dict()}")
# Compare once and reuse; the denominator comes from the data rather than a
# hard-coded 418 so it always matches the percentage.
matches_best = all_preds['majority_top3'] == all_preds['exp_000']
print(f"\nAgreement with exp_000 (best LB): {matches_best.sum()}/{len(all_preds)} ({matches_best.mean()*100:.1f}%)")

In [None]:
# Weighted voting based on LB performance
# Weight by LB score (higher is better)
weights = {
    'exp_000': 0.7799,
    'exp_001': 0.7727,
    'exp_002': 0.7703,
    'exp_003': 0.7584
}

# Normalize weights so they sum to 1; a weighted sum of 0/1 votes is then
# directly comparable with the 0.5 decision threshold below.
total_weight = sum(weights.values())
norm_weights = {k: v/total_weight for k, v in weights.items()}

print("\nWeighted Voting (by LB score):")
print(f"Normalized weights: {norm_weights}")

# Accumulate the weighted votes by iterating the weights dict, so the set of
# voters can never drift out of sync with the weights. Dict order is the
# insertion order above, which keeps the summation order unchanged.
weighted_sum = sum(all_preds[name] * weight for name, weight in norm_weights.items())

all_preds['weighted_vote'] = (weighted_sum >= 0.5).astype(int)
print(f"\nWeighted vote distribution: {all_preds['weighted_vote'].value_counts().to_dict()}")
# Denominator taken from the data rather than a hard-coded 418.
print(f"Agreement with exp_000: {(all_preds['weighted_vote'] == all_preds['exp_000']).sum()}/{len(all_preds)}")

In [None]:
# Best strategy: Trust exp_000 (best LB) but consider where other models agree
# If exp_000, exp_001, exp_002 all agree, that's likely correct
# Equality is transitive, so two pairwise checks cover all three experiments.
first_pair_same = all_preds['exp_000'] == all_preds['exp_001']
second_pair_same = all_preds['exp_001'] == all_preds['exp_002']
all_preds['top3_unanimous'] = first_pair_same & second_pair_same

print("\nUnanimous Agreement Analysis (top 3 experiments):")
# Denominator taken from the data rather than a hard-coded 418 so it always
# matches the percentage computed via .mean().
print(f"Top 3 unanimous: {all_preds['top3_unanimous'].sum()}/{len(all_preds)} ({all_preds['top3_unanimous'].mean()*100:.1f}%)")

# Where top 3 disagree
disagree_mask = ~all_preds['top3_unanimous']
print(f"\nDisagreements: {disagree_mask.sum()} cases")

In [None]:
# Final analysis: print the submission options side by side.
# Relies on the 'majority_top3' and 'weighted_vote' columns already being
# present in all_preds.
rule = "=" * 60

for line in (rule, "FINAL SUBMISSION STRATEGY ANALYSIS", rule):
    print(line)

# Option 1: the already-submitted best scorer.
for line in (
    "\n1. BEST OPTION: Submit exp_000 again (already submitted, LB 0.7799)",
    "   - This is our best LB score",
    "   - But we already submitted it, so no new information",
):
    print(line)

# Option 2: top-3 majority vote; report how far it moves from exp_000.
print("\n2. RISKY OPTION: Submit majority vote of top 3")
top3_changes = all_preds['majority_top3'].ne(all_preds['exp_000']).sum()
print(f"   - Differs from exp_000 by {top3_changes} predictions")
print("   - Could be better or worse")

# Option 3: LB-weighted ensemble; same distance measure.
print("\n3. SAFEST NEW OPTION: Submit weighted ensemble")
weighted_changes = all_preds['weighted_vote'].ne(all_preds['exp_000']).sum()
print(f"   - Differs from exp_000 by {weighted_changes} predictions")
print("   - Weights favor exp_000 (best LB)")

for line in (
    "\n" + rule,
    "RECOMMENDATION: Given only 1 submission left,",
    "the safest approach is to submit exp_000 (best LB)",
    "OR a conservative ensemble that stays close to exp_000.",
    rule,
):
    print(line)

In [None]:
# Two-model ensembles from the two best LB scorers (exp_000 and exp_001).
# With only two voters there is no real majority, so both extremes are built:
#   'top2_majority' -> 1 when at least one model predicts 1 (OR logic)
#   'top2_both'     -> 1 only when both models predict 1 (AND logic)
pair_votes = all_preds['exp_000'].add(all_preds['exp_001'])
all_preds['top2_majority'] = pair_votes.ge(1).astype(int)
all_preds['top2_both'] = pair_votes.eq(2).astype(int)

or_counts = all_preds['top2_majority'].value_counts().to_dict()
and_counts = all_preds['top2_both'].value_counts().to_dict()
baseline_counts = all_preds['exp_000'].value_counts().to_dict()

print("\nTop 2 Ensemble Options (exp_000 + exp_001):")
print(f"OR logic (predict 1 if either predicts 1): {or_counts}")
print(f"AND logic (predict 1 only if both predict 1): {and_counts}")
print(f"\nexp_000 distribution: {baseline_counts}")

# How many rows each variant flips relative to the best single model.
or_changes = all_preds['top2_majority'].ne(all_preds['exp_000']).sum()
and_changes = all_preds['top2_both'].ne(all_preds['exp_000']).sum()
print(f"\nOR differs from exp_000 by: {or_changes}")
print(f"AND differs from exp_000 by: {and_changes}")