# Loop 2 Strategic Analysis

## Key Questions:
1. Can we validate the simpler-model hypothesis (a simpler model has a smaller CV-to-LB gap) by submitting exp_001?
2. What's the realistic ceiling for this competition?
3. What ensemble approaches should we try?

In [None]:
import pandas as pd
import numpy as np

# Read the competition train/test splits so their sizes and the training
# survival rate can be inspected.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

n_train, n_test = len(train), len(test)
print(f"Train: {n_train} samples")
print(f"Test: {n_test} samples")

# Mean of the training `Survived` column, reported as the survival rate.
train_survival_rate = train['Survived'].mean()
print(f"\nTrain survival rate: {train_survival_rate:.3f}")

In [None]:
# Review the candidate submissions produced so far.
section_rule = "=" * 60
print(section_rule)
print("SUBMISSION ANALYSIS")
print(section_rule)

# Both candidate prediction files are read up front.
candidate_000 = pd.read_csv('/home/code/submission_candidates/candidate_000.csv')
candidate_001 = pd.read_csv('/home/code/submission_candidates/candidate_001.csv')

# Candidate 000: share of predicted survivors plus its recorded CV/LB scores.
print("\nCandidate 000 (XGBoost baseline):")
xgb_predicted_rate = candidate_000['Survived'].mean()
print(f"  Survival rate: {xgb_predicted_rate:.3f}")
print("  CV: 0.8316, LB: 0.7584")

# Candidate 001: same summary; no leaderboard score has been recorded yet.
print("\nCandidate 001 (Simple RF):")
rf_predicted_rate = candidate_001['Survived'].mean()
print(f"  Survival rate: {rf_predicted_rate:.3f}")
print("  CV: 0.8238, LB: NOT SUBMITTED YET")

In [None]:
# Line up the two submissions on PassengerId and flag the rows where both
# models made the same prediction.
merged = candidate_000.merge(candidate_001, on='PassengerId', suffixes=('_xgb', '_rf'))
merged['agree'] = merged['Survived_xgb'].eq(merged['Survived_rf'])

n_rows = len(merged)
n_agree = merged['agree'].sum()
pct_agree = merged['agree'].mean() * 100
n_disagree = (~merged['agree']).sum()

print("\nPrediction Agreement:")
print(f"  Same prediction: {n_agree} / {n_rows} ({pct_agree:.1f}%)")
print(f"  Different predictions: {n_disagree}")

# Split the disagreements by which model predicted survival.
disagree = merged.loc[~merged['agree']]
xgb_only_survived = (disagree['Survived_xgb'].eq(1) & disagree['Survived_rf'].eq(0)).sum()
rf_only_survived = (disagree['Survived_xgb'].eq(0) & disagree['Survived_rf'].eq(1)).sum()

print("\nDisagreement breakdown:")
print(f"  XGBoost=1, RF=0: {xgb_only_survived}")
print(f"  XGBoost=0, RF=1: {rf_only_survived}")

In [None]:
# Compare the share of predicted survivors in each submission with the
# training survival rate.
#
# Every percentage below is derived from the data instead of being typed in
# by hand, so the printed text cannot drift from the computed values. The
# previous hard-coded "6% fewer ... than training rate" did not match the
# figures quoted beside it (38.4% vs 31.3% is a 7.1-point gap).
#
# When this analysis was written the simple RF predicted fewer survivors
# than XGBoost (31.3% vs 37.6%), which is concerning - let's understand why.

train_rate = train['Survived'].mean()
xgb_rate = candidate_000['Survived'].mean()
rf_rate = candidate_001['Survived'].mean()

print("="*60)
print("SURVIVAL RATE ANALYSIS")
print("="*60)
print(f"\nTraining data survival rate: {train_rate:.3f} ({train_rate:.1%})")
print(f"XGBoost prediction rate: {xgb_rate:.3f} ({xgb_rate:.1%})")
print(f"Simple RF prediction rate: {rf_rate:.3f} ({rf_rate:.1%})")

# Gap in percentage points between the training rate and the RF prediction
# rate; the wording follows the sign so the sentence stays true either way.
gap_points = (train_rate - rf_rate) * 100
direction = "fewer" if gap_points >= 0 else "more"
print(
    f"\nThe simple RF is predicting {abs(gap_points):.1f} percentage points "
    f"{direction} survivors than training rate."
)
print("This could indicate:")
print("  1. Model is too conservative (underfitting)")
print("  2. Test set has different distribution")
print("  3. Missing features that identify survivors")

In [None]:
# Strategic decision: should exp_001 be submitted?
#
# In favour of submitting:
#   * it tests the "simpler model = smaller gap" hypothesis
#   * it costs 1 of the 7 remaining submissions (plenty are left)
#   * it gives us calibration data
#
# Against submitting:
#   * CV is lower (0.8238 vs 0.8316), so LB might be lower too
#   * the survival rate is very low (31.3%), which might mean underfitting
#
# DECISION: submit to validate the hypothesis, then pivot based on the result.

rule = "=" * 60
decision_report = (
    rule,
    "STRATEGIC DECISION",
    rule,
    "\nRecommendation: Submit exp_001 to validate hypothesis",
    "\nExpected outcomes:",
    "  If LB > 0.77: Simpler model works, continue this direction",
    "  If LB ~ 0.76: Similar to baseline, overfitting not the only issue",
    "  If LB < 0.75: Simpler model is worse, need different approach",
)
for report_line in decision_report:
    print(report_line)

In [None]:
# Experiments to run next, whatever the submission result turns out to be:
#
#   1. Voting ensemble - proven to work (0.808 LB in kernels)
#   2. Bring the Title feature back - it captures sex + social status
#   3. Stacking - a more sophisticated ensemble

rule = "=" * 60
header = [rule, "NEXT EXPERIMENTS TO TRY", rule]

voting_plan = [
    "\n1. VOTING ENSEMBLE (High Priority)",
    "   - Combine RF, LogisticRegression, GradientBoosting, SVC",
    "   - Use simple 7-feature set",
    "   - Soft voting for probability averaging",
    "   - Expected: More robust predictions",
]

title_plan = [
    "\n2. ADD TITLE FEATURE (Medium Priority)",
    "   - Title captures sex + social status",
    "   - Mr (adult male) vs Master (young boy)",
    "   - Mrs vs Miss (married vs unmarried)",
    "   - Keep other features simple",
]

stacking_plan = [
    "\n3. STACKING (After ensemble baseline)",
    "   - Level 1: 5 diverse base models",
    "   - Level 2: XGBoost or LogisticRegression",
    "   - Use out-of-fold predictions to avoid leakage",
]

# One print call emits every line, in order, one per row.
print(*header, *voting_plan, *title_plan, *stacking_plan, sep="\n")

In [None]:
# Reality check: set the stated target against the scores listed below and
# spell out which goals are realistic.
rule = "=" * 60
reality_check = "\n".join([
    rule,
    "TARGET REALITY CHECK",
    rule,
    "\nTarget: 1.0 (100% accuracy)",
    "State-of-the-art: 81-85% accuracy",
    "\nThis target is IMPOSSIBLE to achieve.",
    "\nRealistic goals:",
    "  - Beat 0.7584 (current LB): Achievable",
    "  - Reach 0.78-0.80: Good progress",
    "  - Reach 0.81-0.82: Excellent (top 10%)",
    "  - Reach 0.83-0.85: State-of-the-art",
    "  - Reach 1.0: Impossible",
    "\nWe should focus on maximizing LB score within realistic bounds.",
])
print(reality_check)

In [None]:
# Wrap-up: print the headline findings of this analysis loop.
rule = "=" * 60
summary_lines = (
    rule,
    "LOOP 2 ANALYSIS SUMMARY",
    rule,
    "\n1. Current best: CV 0.8316, LB 0.7584 (7.3% gap)",
    "2. Simple RF: CV 0.8238, LB unknown (needs submission)",
    "3. Target of 1.0 is impossible - max achievable is ~0.85",
    "4. Next steps:",
    "   a) Submit exp_001 to validate simpler model hypothesis",
    "   b) Implement voting ensemble with diverse models",
    "   c) Consider adding Title feature back",
    "\n5. Key insight: Ensemble methods consistently achieve 0.80+ LB",
)
for summary_line in summary_lines:
    print(summary_line)