# Loop 1 LB Feedback Analysis

**Results:**
- CV Score: 0.8316
- LB Score: 0.7584
- Gap: +0.0732 (CV overestimates LB by about 7.3 percentage points)

**Goal:** Understand why CV is so much higher than LB and what adjustments to make.

In [1]:
import pandas as pd
import numpy as np

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTrain target distribution:")
print(train['Survived'].value_counts(normalize=True))

Train shape: (891, 12)
Test shape: (418, 11)

Train target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
# Compare train vs test distributions for key features
print("=" * 50)
print("DISTRIBUTION COMPARISON: TRAIN vs TEST")
print("=" * 50)

# Sex distribution
print("\nSex distribution:")
print("Train:", train['Sex'].value_counts(normalize=True).to_dict())
print("Test:", test['Sex'].value_counts(normalize=True).to_dict())

# Pclass distribution
print("\nPclass distribution:")
print("Train:", train['Pclass'].value_counts(normalize=True).sort_index().to_dict())
print("Test:", test['Pclass'].value_counts(normalize=True).sort_index().to_dict())

# Embarked distribution
print("\nEmbarked distribution:")
print("Train:", train['Embarked'].value_counts(normalize=True).to_dict())
print("Test:", test['Embarked'].value_counts(normalize=True).to_dict())

DISTRIBUTION COMPARISON: TRAIN vs TEST

Sex distribution:
Train: {'male': 0.6475869809203143, 'female': 0.35241301907968575}
Test: {'male': 0.6363636363636364, 'female': 0.36363636363636365}

Pclass distribution:
Train: {1: 0.24242424242424243, 2: 0.20650953984287318, 3: 0.5510662177328844}
Test: {1: 0.25598086124401914, 2: 0.22248803827751196, 3: 0.5215311004784688}

Embarked distribution:
Train: {'S': 0.7244094488188977, 'C': 0.1889763779527559, 'Q': 0.08661417322834646}
Test: {'S': 0.645933014354067, 'C': 0.24401913875598086, 'Q': 0.11004784688995216}


In [3]:
# Age distribution comparison
print("\nAge statistics:")
print(f"Train: mean={train['Age'].mean():.2f}, median={train['Age'].median():.2f}, missing={train['Age'].isna().sum()/len(train)*100:.1f}%")
print(f"Test: mean={test['Age'].mean():.2f}, median={test['Age'].median():.2f}, missing={test['Age'].isna().sum()/len(test)*100:.1f}%")

# Fare distribution comparison
print("\nFare statistics:")
print(f"Train: mean={train['Fare'].mean():.2f}, median={train['Fare'].median():.2f}")
print(f"Test: mean={test['Fare'].mean():.2f}, median={test['Fare'].median():.2f}")


Age statistics:
Train: mean=29.70, median=28.00, missing=19.9%
Test: mean=30.27, median=27.00, missing=20.6%

Fare statistics:
Train: mean=32.20, median=14.45
Test: mean=35.63, median=14.45


In [4]:
# Title extraction to compare
import re

def extract_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""

train['Title'] = train['Name'].apply(extract_title)
test['Title'] = test['Name'].apply(extract_title)

# Group rare titles
def group_titles(title):
    if title in ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']:
        return 'Rare'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Mme':
        return 'Mrs'
    return title

train['Title'] = train['Title'].apply(group_titles)
test['Title'] = test['Title'].apply(group_titles)

print("\nTitle distribution:")
print("Train:", train['Title'].value_counts(normalize=True).to_dict())
print("Test:", test['Title'].value_counts(normalize=True).to_dict())


Title distribution:
Train: {'Mr': 0.5802469135802469, 'Miss': 0.20763187429854096, 'Mrs': 0.1414141414141414, 'Master': 0.04489337822671156, 'Rare': 0.025813692480359147}
Test: {'Mr': 0.5741626794258373, 'Miss': 0.18899521531100477, 'Mrs': 0.1722488038277512, 'Master': 0.050239234449760764, 'Rare': 0.014354066985645933}


In [5]:
# Family size distribution
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

print("\nFamilySize distribution:")
print("Train:", train['FamilySize'].value_counts(normalize=True).sort_index().head(10).to_dict())
print("Test:", test['FamilySize'].value_counts(normalize=True).sort_index().head(10).to_dict())


FamilySize distribution:
Train: {1: 0.6026936026936027, 2: 0.18069584736251404, 3: 0.11447811447811448, 4: 0.03254769921436588, 5: 0.016835016835016835, 6: 0.024691358024691357, 7: 0.013468013468013467, 8: 0.006734006734006734, 11: 0.007856341189674524}
Test: {1: 0.6052631578947368, 2: 0.17703349282296652, 3: 0.13636363636363635, 4: 0.03349282296650718, 5: 0.01674641148325359, 6: 0.007177033492822967, 7: 0.009569377990430622, 8: 0.004784688995215311, 11: 0.009569377990430622}


In [6]:
# Check our submission predictions
submission = pd.read_csv('/home/submission/submission.csv')
print("\nSubmission prediction distribution:")
print(submission['Survived'].value_counts(normalize=True))
print(f"\nPredicted survival rate: {submission['Survived'].mean()*100:.1f}%")
print(f"Train survival rate: {train['Survived'].mean()*100:.1f}%")


Submission prediction distribution:
Survived
0    0.638756
1    0.361244
Name: proportion, dtype: float64

Predicted survival rate: 36.1%
Train survival rate: 38.4%


In [7]:
# Key insight: The CV-LB gap of 0.073 suggests:
# 1. Model is overfitting to training data patterns
# 2. Test set may have different distribution
# 3. Need to use simpler models or stronger regularization

# Let's check if there are any obvious distribution shifts
print("\n" + "="*50)
print("POTENTIAL CAUSES OF CV-LB GAP")
print("="*50)

# Check Cabin distribution (77% missing in train)
train_cabin_rate = train['Cabin'].notna().mean()
test_cabin_rate = test['Cabin'].notna().mean()
print(f"\nCabin present rate:")
print(f"Train: {train_cabin_rate*100:.1f}%")
print(f"Test: {test_cabin_rate*100:.1f}%")

# This could be a source of distribution shift if different


POTENTIAL CAUSES OF CV-LB GAP

Cabin present rate:
Train: 22.9%
Test: 21.8%
