# Loop 2 Analysis: Strategic Assessment

## Key Questions:
1. What is a realistic target score for Titanic? (A target of 1.0, i.e. 100% accuracy, is not achievable.)
2. Should we submit exp_001 to check whether fixing the Age-imputation leakage improves the leaderboard (LB) score?
3. What features/approaches haven't been tried yet?

In [None]:
import pandas as pd
import numpy as np

# Read the train and test CSVs into DataFrames.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the dimensions of each split, then the normalized class
# frequencies of the Survived column in the training data.
for split_name, frame in (('Train', train), ('Test', test)):
    print(f'{split_name} shape: {frame.shape}')
print('\nTarget distribution:')
target_share = train['Survived'].value_counts(normalize=True)
print(target_share)

In [None]:
# Recap of the experiments run so far: scores, feature sets, and the
# issues or insights recorded for each. Emitted as one print call with
# a newline separator; empty strings produce the blank lines.
print(
    '=' * 60,
    'EXPERIMENTS SUMMARY',
    '=' * 60,
    '',
    'exp_000: Baseline RF',
    '  - CV: 0.8339, LB: 0.7799',
    '  - Features: Title, FamilySize, IsAlone, Has_Cabin',
    '  - Issue: Data leakage in Age imputation',
    '',
    'exp_001: Stacking Ensemble',
    '  - CV: 0.8271 (lower due to fixed leakage)',
    '  - LB: NOT SUBMITTED YET',
    '  - Features: Added Name_Length, Ticket_Frequency, Sex_Pclass, Age_Bin, IsChild, FamilySize_Cat',
    '  - Fixed: Age imputation from training data only',
    '  - Key insight: SVC alone (83.6%) beat stacking (82.7%)',
    '',
    'CV-LB Gap: 5.4% (83.4% -> 78.0%)',
    'Adversarial Validation: Age is primary source of shift (56.9%)',
    sep='\n',
)

In [None]:
# Candidate features from the strategy that have not been tried yet.
# Each tuple entry is one output line; '' yields a blank line.
for report_line in (
    '=' * 60,
    'FEATURES NOT YET EXPLORED',
    '=' * 60,
    '',
    '1. Deck extraction from Cabin (first character)',
    '   - Can indicate location on ship',
    '   - Proximity to lifeboats',
    '',
    '2. Fare binning (similar to Age binning)',
    '   - Fare has 14.3% importance in adversarial validation',
    '   - Binning may reduce distribution shift',
    '',
    '3. Ticket prefix extraction',
    '   - Some tickets have prefixes like "PC", "STON/O2"',
    '   - May indicate class or booking type',
    '',
    '4. More sophisticated Age handling',
    '   - Currently using Age_Bin + raw Age (redundant?)',
    '   - Try ONLY Age_Bin to fully address distribution shift',
):
    print(report_line)

In [None]:
# Explore the Deck feature derived from the Cabin column.
print('=' * 60, 'CABIN/DECK ANALYSIS', '=' * 60, sep='\n')

# Deck is the first character of the Cabin string; add it to both splits.
for frame in (train, test):
    frame['Deck'] = frame['Cabin'].str[0]

print('\nDeck distribution in train:')
deck_counts = train['Deck'].value_counts()
print(deck_counts)

# Mean of Survived (survival rate) and row count per deck, best deck first.
print('\nDeck survival rates:')
deck_survival = train.groupby('Deck')['Survived'].agg(['mean', 'count'])
deck_ranked = deck_survival.sort_values('mean', ascending=False)
print(deck_ranked)

In [None]:
# Compare the Fare column between the train and test splits.
print('=' * 60, 'FARE DISTRIBUTION ANALYSIS', '=' * 60, sep='\n')

# Summary statistics for each split, train first.
for split_name, frame in (('Train', train), ('Test', test)):
    print(f'\n{split_name} Fare statistics:')
    print(frame['Fare'].describe())

# Quartiles of both splits printed side by side, one line per quartile.
print('\nFare quartiles comparison:')
for tag, level in (('Q1', 0.25), ('Q2', 0.50), ('Q3', 0.75)):
    train_q = train['Fare'].quantile(level)
    test_q = test['Fare'].quantile(level)
    print(f"Train {tag}: {train_q:.2f}, Test {tag}: {test_q:.2f}")

In [None]:
# Section banner for the ticket-prefix analysis.
print('=' * 60, 'TICKET PREFIX ANALYSIS', '=' * 60, sep='\n')

def extract_ticket_prefix(ticket) -> str:
    """Return the leading token of a ticket string, or 'NONE' if it has none.

    A ticket counts as prefixed when splitting it on whitespace yields more
    than one token; the first token is returned unchanged (no normalisation
    of punctuation or case). Single-token and empty tickets yield 'NONE'.

    Args:
        ticket: The raw ticket value. Normally a string; any non-string
            value (for example NaN or None for a missing entry) is treated
            as having no prefix.

    Returns:
        The first whitespace-separated token, or the sentinel 'NONE'.
    """
    # Guard against missing values: NaN is a float and has no .split(),
    # so the original code raised AttributeError on such rows.
    if not isinstance(ticket, str):
        return 'NONE'
    tokens = ticket.split()
    return tokens[0] if len(tokens) > 1 else 'NONE'

# Derive Ticket_Prefix for both splits by applying extract_ticket_prefix
# to every Ticket value.
for frame in (train, test):
    frame['Ticket_Prefix'] = frame['Ticket'].apply(extract_ticket_prefix)

print('\nTop 10 ticket prefixes in train:')
prefix_counts = train['Ticket_Prefix'].value_counts()
print(prefix_counts.head(10))

# Mean of Survived and row count per prefix, keeping only prefixes with
# at least 10 rows, sorted from highest to lowest mean.
print('\nTicket prefix survival rates (min 10 samples):')
prefix_stats = train.groupby('Ticket_Prefix')['Survived'].agg(['mean', 'count'])
has_enough_rows = prefix_stats['count'] >= 10
prefix_survival = prefix_stats.loc[has_enough_rows]
print(prefix_survival.sort_values('mean', ascending=False))

In [None]:
# Closing recommendations: what to submit now, which experiments to run
# next, and a sanity check on the achievable score. The report is built
# as a list of lines and written in a single print call.
recommendation_lines = [
    '=' * 60,
    'STRATEGIC RECOMMENDATIONS',
    '=' * 60,
    '',
    'IMMEDIATE PRIORITY:',
    '1. Submit exp_001 to validate if fixed leakage helps LB',
    '   - Key hypothesis: lower CV with fixed leakage may generalize better',
    '   - Current CV-LB gap is 5.4% - need to see if it narrows',
    '',
    'NEXT EXPERIMENTS:',
    '2. Try SVC alone (83.6% CV) instead of stacking',
    '   - Simpler model may generalize better',
    '   - Stacking added noise, not signal',
    '',
    '3. Feature engineering to address distribution shift:',
    '   - Drop raw Age, use ONLY Age_Bin',
    '   - Add Fare_Bin to reduce Fare sensitivity',
    '   - Add Deck feature from Cabin',
    '',
    '4. Try XGBoost with careful regularization',
    '   - Mentioned in stacking kernel as meta-learner',
    '   - May handle distribution shift better with regularization',
    '',
    'TARGET REALITY CHECK:',
    '- Target of 1.0 (100%) is IMPOSSIBLE for Titanic',
    '- Best realistic LB scores are 80-82%',
    '- Current best LB: 77.99%',
    '- Goal: Get to 79-80% range',
]
print('\n'.join(recommendation_lines))