# LB Feedback Analysis: Understanding the CV-LB Gap

**Critical Finding**: CV 0.8283 vs LB 0.7560 = **+0.0723 gap**

A gap this large has several possible causes:
1. Distribution shift between train and test
2. Overfitting to training patterns
3. Features that do not generalize well
4. A CV scheme that may need adjustment

Let's investigate the root cause.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Input locations, named once so they are easy to repoint.
TRAIN_CSV = '/home/data/train.csv'
TEST_CSV = '/home/data/test.csv'
SUBMISSION_CSV = '/home/submission/submission.csv'  # predictions from exp_001

# Load data
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("\nTrain survival rate:", train_df['Survived'].mean())

# Load predictions from exp_001
pred_df = pd.read_csv(SUBMISSION_CSV)
if len(pred_df) != len(test_df):
    # The predicted rate below is only comparable to the train rate when the
    # submission has one row per test passenger; warn instead of failing.
    print(f"WARNING: submission has {len(pred_df)} rows but test has {len(test_df)}")
print("Test prediction survival rate:", pred_df['Survived'].mean())

Train shape: (891, 12)
Test shape: (418, 11)

Train survival rate: 0.3838383838383838
Test prediction survival rate: 0.35406698564593303


In [2]:
# Recreate features from exp_001
def create_features(df, is_train=True):
    """Return a copy of ``df`` with the exp_001 engineered columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Passenger frame providing the ``Name``, ``SibSp``, ``Parch``, ``Age``,
        ``Fare``, ``Cabin`` and ``Ticket`` columns. It is not modified.
    is_train : bool, default True
        Kept for call compatibility. The function never reads it, so every
        frame goes through exactly the same steps.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Title``, ``FamilySize``, ``IsAlone``,
        ``AgeGroup``, ``FarePerPerson``, ``Deck``, ``TicketFreq``,
        ``CabinNumber``, ``CabinSide``, ``NameLength`` and ``FareBin5`` added.

    Notes
    -----
    ``TicketFreq`` and ``FareBin5`` are computed from the rows of ``df`` only:
    ticket counts ignore passengers that live in another frame, and the fare
    quantile edges are re-estimated for every frame passed in. The same fare
    can therefore land in different bins for two different frames.
    """
    df = df.copy()

    # Basic features
    # Title is the word that directly precedes the first '.' in the name.
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)
    title_mapping = {
        'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
        'Dr': 'Other', 'Rev': 'Other', 'Col': 'Other', 'Major': 'Other',
        'Mlle': 'Miss', 'Countess': 'Other', 'Ms': 'Miss', 'Lady': 'Other',
        'Jonkheer': 'Other', 'Don': 'Other', 'Dona': 'Other', 'Mme': 'Mrs',
        'Capt': 'Other', 'Sir': 'Other'
    }
    # Titles missing from the mapping (or names without one) would otherwise
    # become NaN; fold them into the existing catch-all bucket instead.
    df['Title'] = df['Title'].map(title_mapping).fillna('Other')

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Intervals are right-closed: (0, 12], (12, 18], ... Missing ages and ages
    # outside (0, 100] stay NaN.
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100],
                           labels=['Child', 'Teen', 'Adult', 'MiddleAge', 'Senior'])

    # FamilySize is at least 1 for non-negative SibSp/Parch, so no zero division.
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Deck is the first character of the cabin string.
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].fillna('Unknown')

    # Advanced features from exp_001
    # Number of rows in *this frame* sharing the ticket.
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # First run of digits in the cabin string; NaN when there is none.
    df['CabinNumber'] = df['Cabin'].str.extract('([0-9]+)', expand=False).astype(float)
    # Parity of the cabin number. Missing numbers are labelled explicitly via
    # fillna rather than through a NaN dictionary key.
    cabin_parity = df['CabinNumber'] % 2
    df['CabinSide'] = cabin_parity.map({0.0: 'Even', 1.0: 'Odd'}).fillna('Unknown')

    df['NameLength'] = df['Name'].str.len()

    # Quintile bins estimated on this frame's own fares (see Notes).
    df['FareBin5'] = pd.qcut(df['Fare'], q=5, labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'])

    return df

# Build the engineered-feature frames. Each split is passed through
# create_features on its own, so the features that function derives from the
# whole frame (TicketFreq group counts, FareBin5 quantile edges) are computed
# separately for train and test.
train_feat = create_features(train_df, is_train=True)
test_feat = create_features(test_df, is_train=False)

# Interaction features
def create_interactions(df):
    """Return a copy of ``df`` with three ``<feature>_Sex`` string columns.

    ``Pclass``, ``AgeGroup`` and ``FareBin5`` are each cast to ``str`` and
    joined to ``Sex`` with an underscore, producing ``Pclass_Sex``,
    ``AgeGroup_Sex`` and ``FareBin5_Sex``. The input frame is left untouched.
    """
    sex = df['Sex']

    def _paired_with_sex(column):
        # "<value>_<sex>" label for every row of the given column.
        return df[column].astype(str) + '_' + sex

    return df.assign(
        Pclass_Sex=_paired_with_sex('Pclass'),
        AgeGroup_Sex=_paired_with_sex('AgeGroup'),
        FareBin5_Sex=_paired_with_sex('FareBin5'),
    )

# Add the <feature>_Sex interaction columns to both splits. The names
# train_feat / test_feat are rebound to the returned copies; running this cell
# again just recomputes the same three columns.
train_feat = create_interactions(train_feat)
test_feat = create_interactions(test_feat)

print("Features created")

Features created


In [3]:
# Compare train vs. test distributions, feature by feature.
print("=== FEATURE DISTRIBUTION COMPARISON ===\n")

# Numeric features: report mean/std per split and the absolute gap in means.
numeric_cols = ['Age', 'Fare', 'FamilySize', 'FarePerPerson', 'NameLength', 'TicketFreq']

for col in numeric_cols:
    print(f"{col}:")
    train_mean = train_feat[col].mean()
    test_mean = test_feat[col].mean()
    print(f"  Train - mean: {train_mean:.2f}, std: {train_feat[col].std():.2f}")
    print(f"  Test  - mean: {test_mean:.2f}, std: {test_feat[col].std():.2f}")
    print(f"  Difference: {abs(train_mean - test_mean):.2f}")
    print()

# Categorical features: compare the share of each level in the two splits.
categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Title', 'Deck', 'CabinSide', 'FareBin5']

print("\n=== CATEGORICAL DISTRIBUTION COMPARISON ===\n")

for col in categorical_cols:
    print(f"{col}:")
    # Normalised frequencies; value_counts leaves missing values out.
    train_dist = train_feat[col].value_counts(normalize=True).sort_index()
    test_dist = test_feat[col].value_counts(normalize=True).sort_index()

    # Walk the union of levels so a level absent from one split counts as 0%.
    for val in sorted(set(train_dist.index).union(test_dist.index)):
        train_pct = 100 * train_dist.get(val, 0)
        test_pct = 100 * test_dist.get(val, 0)
        diff = abs(train_pct - test_pct)
        if not diff > 5:
            continue  # only levels that differ by more than 5 points are shown
        print(f"  {val}: Train={train_pct:.1f}%, Test={test_pct:.1f}% (diff={diff:.1f}%)")
    print()

=== FEATURE DISTRIBUTION COMPARISON ===

Age:
  Train - mean: 29.70, std: 14.53
  Test  - mean: 30.27, std: 14.18
  Difference: 0.57

Fare:
  Train - mean: 32.20, std: 49.69
  Test  - mean: 35.63, std: 55.91
  Difference: 3.42

FamilySize:
  Train - mean: 1.90, std: 1.61
  Test  - mean: 1.84, std: 1.52
  Difference: 0.06

FarePerPerson:
  Train - mean: 19.92, std: 35.84
  Test  - mean: 21.80, std: 35.64
  Difference: 1.89

NameLength:
  Train - mean: 26.97, std: 9.28
  Test  - mean: 27.48, std: 9.97
  Difference: 0.52

TicketFreq:
  Train - mean: 1.79, std: 1.36
  Test  - mean: 1.35, std: 0.76
  Difference: 0.44


=== CATEGORICAL DISTRIBUTION COMPARISON ===

Pclass:

Sex:

Embarked:
  C: Train=18.9%, Test=24.4% (diff=5.5%)
  S: Train=72.4%, Test=64.6% (diff=7.8%)

Title:

Deck:

CabinSide:

FareBin5:



In [None]:
# Check for data leakage indicators
print("=== POTENTIAL LEAKAGE ANALYSIS ===\n")

# 1. Ticket frequency in train vs test
print("TicketFreq distribution:")
# One entry per distinct ticket in each split (missing tickets are not counted).
train_ticket_counts = train_feat['Ticket'].value_counts()
test_ticket_counts = test_feat['Ticket'].value_counts()

# Tickets that appear in both train and test
common_tickets = set(train_ticket_counts.index) & set(test_ticket_counts.index)
print(f"Tickets in train: {len(train_ticket_counts)}")
print(f"Tickets in test: {len(test_ticket_counts)}")
print(f"Common tickets: {len(common_tickets)}")

if len(common_tickets) > 0:
    print("\n⚠️  WARNING: Some tickets appear in both train and test!")
    print("This could cause leakage if TicketFreq is not handled properly.")
    # Set order for strings changes between kernel runs, so sort before
    # sampling to keep the output identical on every Restart & Run All.
    print("Sample common tickets:", sorted(common_tickets, key=str)[:5])
else:
    print("✓ No ticket overlap between train and test")

# 2. Name patterns
print("\nName length comparison:")
train_name_len = train_feat['NameLength'].mean()
test_name_len = test_feat['NameLength'].mean()
print(f"Train mean NameLength: {train_name_len:.1f}")
print(f"Test mean NameLength: {test_name_len:.1f}")
print(f"Difference: {abs(train_name_len - test_name_len):.1f}")

# 3. Fare distribution
print("\nFare distribution comparison:")
print(f"Train Fare - mean: {train_feat['Fare'].mean():.2f}, median: {train_feat['Fare'].median():.2f}")
print(f"Test Fare - mean: {test_feat['Fare'].mean():.2f}, median: {test_feat['Fare'].median():.2f}")

# Check if test fares are outside train range (min/max skip missing fares)
train_fare_min, train_fare_max = train_feat['Fare'].min(), train_feat['Fare'].max()
test_fare_min, test_fare_max = test_feat['Fare'].min(), test_feat['Fare'].max()

print(f"Train Fare range: [{train_fare_min:.2f}, {train_fare_max:.2f}]")
print(f"Test Fare range: [{test_fare_min:.2f}, {test_fare_max:.2f}]")

if test_fare_min < train_fare_min or test_fare_max > train_fare_max:
    print("⚠️  WARNING: Test fares extend beyond train range - potential distribution shift")
else:
    print("✓ Test fares within train range")

In [None]:
# Analyze feature redundancy and correlations
print("=== FEATURE REDUNDANCY ANALYSIS ===\n")

# Create a simplified dataset for correlation analysis.
# SibSp and Parch are not part of numeric_cols, yet the redundancy check at the
# end of this cell looks both up in the matrix; without them here those
# lookups raise KeyError. dict.fromkeys de-duplicates while keeping order.
corr_features = list(dict.fromkeys(list(numeric_cols) + ['SibSp', 'Parch']))
corr_df = train_feat[corr_features].copy()

# Add encoded categorical features.
# NOTE(review): LabelEncoder yields integer codes with no intrinsic order for
# Title/Deck, so correlations on those columns are only a rough signal.
for col in ['Pclass', 'Sex', 'Title', 'Deck']:
    le = LabelEncoder()
    corr_df[col + '_encoded'] = le.fit_transform(train_feat[col].astype(str))

correlation_matrix = corr_df.corr()

# Find high correlations (>0.7), visiting each unordered pair once
# (upper triangle of the matrix, diagonal excluded).
corr_columns = correlation_matrix.columns
high_corr = []
for i in range(len(corr_columns)):
    for j in range(i + 1, len(corr_columns)):
        corr_val = correlation_matrix.iloc[i, j]
        if abs(corr_val) > 0.7:
            high_corr.append({
                'feature1': corr_columns[i],
                'feature2': corr_columns[j],
                'correlation': corr_val
            })

print("High correlations (>0.7):")
for hc in high_corr:
    print(f"  {hc['feature1']} <-> {hc['feature2']}: {hc['correlation']:.3f}")

if not high_corr:
    print("  No high correlations found")

# Specifically check the known redundant pairs
print("\n=== KNOWN REDUNDANT FEATURES ===")
print(f"SibSp <-> FamilySize: {correlation_matrix.loc['SibSp', 'FamilySize']:.3f}")
print(f"Parch <-> FamilySize: {correlation_matrix.loc['Parch', 'FamilySize']:.3f}")
print(f"Fare <-> FarePerPerson: {correlation_matrix.loc['Fare', 'FarePerPerson']:.3f}")

In [None]:
# Analyze which features might be overfitting
print("=== FEATURE COMPLEXITY ANALYSIS ===\n")

# Count unique values for each categorical feature (potential overfitting risk)
categorical_features = ['Title', 'Deck', 'CabinSide', 'FareBin5', 'Pclass_Sex', 'AgeGroup_Sex', 'FareBin5_Sex']

total_train = len(train_feat)  # same for every feature, so computed once

print("Unique values per categorical feature:")
for col in categorical_features:
    train_unique = train_feat[col].nunique()
    avg_per_category = total_train / train_unique
    print(f"  {col:<20}: {train_unique:2d} unique (avg {avg_per_category:5.1f} samples/category)")

    # Flag features with too many categories
    if train_unique > 10 and avg_per_category < 20:
        print("    ⚠️  High cardinality, low samples per category - overfitting risk")

    # Use the test split too: a level that only exists in test carries no
    # training signal at all.
    unseen_in_train = set(test_feat[col].dropna().unique()) - set(train_feat[col].dropna().unique())
    if unseen_in_train:
        print(f"    ⚠️  {len(unseen_in_train)} test categories never seen in train: "
              f"{sorted(unseen_in_train, key=str)}")

# Check interaction features specifically
print("\n=== INTERACTION FEATURE ANALYSIS ===")
interaction_features = ['Pclass_Sex', 'AgeGroup_Sex', 'FareBin5_Sex']

for col in interaction_features:
    print(f"\n{col}:")
    value_counts = train_feat[col].value_counts()
    print(f"  Categories: {len(value_counts)}")
    print(f"  Min samples in category: {value_counts.min()}")
    print(f"  Max samples in category: {value_counts.max()}")

    # Show rare categories (fewer than 5 training rows)
    rare_cats = value_counts[value_counts < 5]
    if len(rare_cats) > 0:
        print(f"  Rare categories (<5 samples): {len(rare_cats)}")
        print(f"  {list(rare_cats.index)}")
        print("  ⚠️  Rare categories may overfit")

In [None]:
# Summary and recommendations, emitted one entry per print call.
summary_lines = [
    "=== ANALYSIS SUMMARY ===\n",
    "CV-LB Gap: +0.0723 (CV much higher than LB)",
    "This indicates potential overfitting or distribution shift.\n",
    "Key Findings:",
    "1. Feature redundancy: SibSp/Parch overlap with FamilySize",
    "2. High cardinality features with low samples per category",
    "3. Interaction features have rare categories (<5 samples)",
    "4. Some tickets appear in both train and test (potential leakage)",
    "5. Fare distribution may differ between train and test\n",
    "Recommendations:",
    "1. Remove redundant features (SibSp, Parch, FarePerPerson)",
    "2. Simplify interaction features or remove rare categories",
    "3. Consider removing CabinSide (low importance, high complexity)",
    "4. Try simpler model with fewer features",
    "5. Test if hyperparameter tuning helps generalization",
]

for summary_line in summary_lines:
    print(summary_line)