# Loop 1 LB Feedback Analysis

**Results:**
- CV Score: 0.8316
- LB Score: 0.7584
- Gap: +0.0732 (CV overestimates LB by 7.3 percentage points)

**Goal:** Understand why CV is so much higher than LB and what adjustments to make.

In [None]:
import pandas as pd
import numpy as np

# Read the train and test splits from the shared data directory.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the (rows, columns) shape of each split.
for label, frame in (("Train", train), ("Test", test)):
    print(f"{label} shape: {frame.shape}")

# Class balance of the target, as fractions rather than raw counts.
target_shares = train['Survived'].value_counts(normalize=True)
print("\nTrain target distribution:")
print(target_shares)

In [None]:
# Side-by-side comparison of categorical feature shares in train and test.
banner = "=" * 50
print(banner)
print("DISTRIBUTION COMPARISON: TRAIN vs TEST")
print(banner)

# Each entry is (column, sort_by_index). Pclass is ordered by class number;
# the other columns keep value_counts' default most-frequent-first order.
for column, sort_by_index in (('Sex', False), ('Pclass', True), ('Embarked', False)):
    print(f"\n{column} distribution:")
    for label, frame in (("Train", train), ("Test", test)):
        shares = frame[column].value_counts(normalize=True)
        if sort_by_index:
            shares = shares.sort_index()
        print(f"{label}:", shares.to_dict())

In [None]:
# Age: central tendency plus the percentage of rows with a missing value.
print("\nAge statistics:")
for label, frame in (("Train", train), ("Test", test)):
    age = frame['Age']
    missing_pct = age.isna().sum() / len(frame) * 100
    print(f"{label}: mean={age.mean():.2f}, median={age.median():.2f}, missing={missing_pct:.1f}%")

# Fare: central tendency only.
print("\nFare statistics:")
for label, frame in (("Train", train), ("Test", test)):
    fare = frame['Fare']
    print(f"{label}: mean={fare.mean():.2f}, median={fare.median():.2f}")

In [None]:
# Title extraction to compare
import re

def extract_title(name):
    """Return the title embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Args:
        name: The passenger name. Non-string values (such as the NaN that
            pandas uses for a missing cell, or None) are accepted.

    Returns:
        The title without its trailing period, or "" when no title is found
        or when `name` is not a string.
    """
    # re.search raises TypeError on non-strings, so a single missing name
    # would abort a whole-column .apply(); treat it as "no title" instead.
    if not isinstance(name, str):
        return ""
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

# Derive a raw Title column from Name on both splits.
for frame in (train, test):
    frame['Title'] = frame['Name'].apply(extract_title)

# Group rare titles
def group_titles(title):
    """Collapse a raw title into a coarser category.

    Titles in the rare set map to 'Rare', 'Mlle' and 'Ms' map to 'Miss',
    and 'Mme' maps to 'Mrs'. Any other value is returned unchanged.
    """
    rare_titles = {
        'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
        'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
    }
    if title in rare_titles:
        return 'Rare'

    # Alternative spellings folded into their common equivalent.
    synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    return synonyms.get(title, title)

# Replace the raw titles with their grouped categories on both splits.
for frame in (train, test):
    frame['Title'] = frame['Title'].apply(group_titles)

# Share of each grouped title per split.
print("\nTitle distribution:")
for label, frame in (("Train", train), ("Test", test)):
    title_shares = frame['Title'].value_counts(normalize=True)
    print(f"{label}:", title_shares.to_dict())

In [None]:
# FamilySize = SibSp + Parch + 1, computed on both splits.
for frame in (train, test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

# Shares ordered by family size; only the first ten sizes are shown.
print("\nFamilySize distribution:")
for label, frame in (("Train", train), ("Test", test)):
    size_shares = frame['FamilySize'].value_counts(normalize=True).sort_index()
    print(f"{label}:", size_shares.head(10).to_dict())

In [None]:
# Compare the predicted survival rate in the submission with the train rate.
submission = pd.read_csv('/home/submission/submission.csv')
predicted = submission['Survived']

print("\nSubmission prediction distribution:")
print(predicted.value_counts(normalize=True))

# Mean of the Survived column, expressed as a percentage.
predicted_rate = predicted.mean() * 100
print(f"\nPredicted survival rate: {predicted_rate:.1f}%")
train_rate = train['Survived'].mean() * 100
print(f"Train survival rate: {train_rate:.1f}%")

In [None]:
# Key insight: the CV-LB gap of 0.073 suggests that:
# 1. The model is overfitting to patterns in the training data
# 2. The test set may have a different distribution than the training set
# 3. We need to use simpler models or stronger regularization

# Look for obvious train/test distribution shifts.
rule = "=" * 50
print("\n" + rule)
print("POTENTIAL CAUSES OF CV-LB GAP")
print(rule)

# Fraction of rows with a non-missing Cabin value in each split
# (the original note cites 77% missing in train).
train_cabin_rate = train['Cabin'].notna().mean()
test_cabin_rate = test['Cabin'].notna().mean()

print("\nCabin present rate:")
for label, rate in (("Train", train_cabin_rate), ("Test", test_cabin_rate)):
    print(f"{label}: {rate*100:.1f}%")

# If the Cabin present rate differs between train and test, this could be a source of distribution shift