# Loop 5 Analysis: CV Instability and Overfitting Investigation

## Key Questions:
1. Why did same hyperparameters give different CV scores (0.81951 vs 0.81617)?
2. Is threshold tuning overfitting to OOF predictions?
3. What approaches might reduce CV-LB gap?

## Evaluator Concerns to Address:
- CV score discrepancy between experiments
- Threshold 0.47 shifts predicted distribution significantly
- CV-LB gap is increasing (0.97 → 1.50 percentage points absolute; 1.20% → 1.83% relative to CV)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# Load data
# Read both competition splits from the fixed data directory, then report
# their shapes and the mean of the 'Transported' label column in train.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')
print(f"Train: {train.shape}, Test: {test.shape}")
transported_rate = train['Transported'].mean()
print(f"Training transported rate: {transported_rate:.4f}")

Train: (8693, 14), Test: (4277, 13)
Training transported rate: 0.5036


In [2]:
# Quick feature engineering (same as experiments)
def feature_engineering(df):
    """Return a copy of ``df`` with ID- and cabin-derived columns added.

    ``PassengerId`` is split on ``'_'`` into the integer columns ``Group``
    (first part) and ``PassengerNum`` (second part). ``Cabin`` is split on
    ``'/'`` into ``Deck`` (first part), ``CabinNum`` (second part, as int)
    and ``Side`` (third part). Rows with a missing ``Cabin`` get
    ``'Unknown'`` for ``Deck``/``Side`` and NaN for ``CabinNum``.

    The input frame is not modified.
    """
    def id_part(position):
        # Extractor for one integer component of a PassengerId string.
        return lambda passenger_id: int(passenger_id.split('_')[position])

    def cabin_part(position, missing, cast=None):
        # Extractor for one '/'-separated component of Cabin, with a
        # fallback value for missing cabins and an optional conversion.
        def extract(cabin):
            if pd.isna(cabin):
                return missing
            piece = cabin.split('/')[position]
            return piece if cast is None else cast(piece)
        return extract

    out = df.copy()
    passenger_ids = out['PassengerId']
    cabins = out['Cabin']
    out['Group'] = passenger_ids.map(id_part(0))
    out['PassengerNum'] = passenger_ids.map(id_part(1))
    out['Deck'] = cabins.map(cabin_part(0, 'Unknown'))
    out['CabinNum'] = cabins.map(cabin_part(1, np.nan, cast=int))
    out['Side'] = cabins.map(cabin_part(2, 'Unknown'))
    return out

# Derive the ID/cabin columns for both splits.
train, test = (feature_engineering(frame) for frame in (train, test))

# GroupSize
# Group membership is counted over train and test together; Solo marks
# passengers whose group has exactly one member.
all_data = pd.concat([train[['Group']], test[['Group']]], ignore_index=True)
group_sizes = all_data['Group'].value_counts().to_dict()
for df in (train, test):
    df['GroupSize'] = df['Group'].map(group_sizes)
    df['Solo'] = df['GroupSize'].eq(1).astype(int)

print("Basic features done")

Basic features done


In [3]:
# Imputation
def group_based_imputation(train_df, test_df):
    """Fill missing HomePlanet/Deck/Side values from members of the same Group.

    For each of the three columns, the most frequent *known* value per
    ``Group`` is computed over train and test combined, where "known" means
    neither NaN nor the ``'Unknown'`` placeholder. Rows whose value is NaN or
    ``'Unknown'`` then receive their group's value, or ``'Unknown'`` when no
    member of the group has a known value.

    Both frames are modified in place and also returned, so
    ``train, test = group_based_imputation(train, test)`` keeps working.

    Args:
        train_df: Training frame containing ``Group`` and the three columns.
        test_df: Test frame containing the same columns.

    Returns:
        The ``(train_df, test_df)`` pair, i.e. the same objects passed in.
    """
    combined = pd.concat([train_df, test_df], ignore_index=True)
    for col in ['HomePlanet', 'Deck', 'Side']:
        # Drop placeholders *before* taking the mode. Otherwise a group in
        # which most members are 'Unknown' reports 'Unknown' as its mode and
        # the value known from the remaining member(s) is thrown away.
        is_known = combined[col].notna() & (combined[col] != 'Unknown')
        known = combined.loc[is_known, ['Group', col]]
        group_mode = {
            group: values.mode().iloc[0]
            for group, values in known.groupby('Group')[col]
        }
        for df in (train_df, test_df):
            mask = df[col].isna() | (df[col] == 'Unknown')
            # Groups with no known value are absent from group_mode; they
            # keep the 'Unknown' placeholder, as before.
            df.loc[mask, col] = df.loc[mask, 'Group'].map(
                lambda group: group_mode.get(group, 'Unknown')
            )
    return train_df, test_df

# Fill HomePlanet/Deck/Side from group mates. The helper writes into the
# frames it is given and returns those same frames.
train, test = group_based_imputation(train, test)

def impute_remaining(df):
    """Return a copy of ``df`` with the remaining gaps filled.

    Three passes, in this order:

    1. Spending columns that are NaN on rows where ``CryoSleep == True``
       are set to 0.
    2. Each categorical column containing NaN or ``'Unknown'`` has both
       replaced by the column's mode (computed with ``'Unknown'`` treated
       as missing).
    3. Each numeric column containing NaN is filled with its median.

    All statistics come from ``df`` itself; the input is not modified.
    """
    filled = df.copy()
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
    numeric_columns = ['Age', *spend_columns, 'CabinNum']

    # CryoSleep is not touched until the categorical pass, so the flag can
    # be computed once for all spending columns.
    in_cryo = filled['CryoSleep'] == True
    for name in spend_columns:
        filled.loc[in_cryo & filled[name].isna(), name] = 0

    for name in categorical_columns:
        column = filled[name]
        if not (column.isna().any() or (column == 'Unknown').any()):
            continue
        most_common = column.replace('Unknown', np.nan).mode()[0]
        filled[name] = column.replace('Unknown', most_common).fillna(most_common)

    for name in numeric_columns:
        column = filled[name]
        if column.isna().any():
            filled[name] = column.fillna(column.median())

    return filled

# Fill whatever is still missing; each split is imputed separately, so its
# modes and medians come from that split alone.
train, test = (impute_remaining(frame) for frame in (train, test))
print("Imputation done")

Imputation done


In [4]:
# Create features
def create_features(df):
    """Return a copy of ``df`` extended with spending- and age-derived columns.

    Added columns:

    * ``TotalSpent`` and, per spending column, ``<col>_ratio`` (share of
      ``TotalSpent + 1``), ``<col>_spent`` (0/1 flag for a positive amount)
      and ``<col>_log`` (``log1p``).
    * ``LuxurySpent`` (Spa + VRDeck + RoomService), ``BasicSpent``
      (FoodCourt + ShoppingMall), ``LuxuryRatio``, ``SpentPerAge``.
    * ``SpendingBin`` (``TotalSpent`` cut at 0 / 500 / 2000, labelled 0-3)
      and ``NumSpendingCategories`` (count of positive spending columns).
    * ``log1p`` versions of the three totals, two pairwise sums, and five
      mutually exclusive 0/1 age-band flags.

    The input frame is not modified.
    """
    out = df.copy()
    services = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    total = out[services].sum(axis=1)
    out['TotalSpent'] = total
    for service in services:
        amount = out[service]
        out[f'{service}_ratio'] = amount / (total + 1)
        out[f'{service}_spent'] = amount.gt(0).astype(int)
        out[f'{service}_log'] = np.log1p(amount)

    luxury = out['Spa'] + out['VRDeck'] + out['RoomService']
    basic = out['FoodCourt'] + out['ShoppingMall']
    age = out['Age']

    out['LuxurySpent'] = luxury
    out['BasicSpent'] = basic
    out['LuxuryRatio'] = luxury / (total + 1)
    out['SpentPerAge'] = total / (age + 1)
    out['SpendingBin'] = pd.cut(
        total, bins=[-1, 0, 500, 2000, float('inf')], labels=[0, 1, 2, 3]
    ).astype(int)
    out['NumSpendingCategories'] = out[[f'{service}_spent' for service in services]].sum(axis=1)
    out['TotalSpent_log'] = np.log1p(total)
    out['LuxurySpent_log'] = np.log1p(luxury)
    out['BasicSpent_log'] = np.log1p(basic)
    # Same three-column sum as LuxurySpent, kept under its own name.
    out['Spa_VRDeck_RoomService'] = luxury
    out['FoodCourt_RoomService'] = out['FoodCourt'] + out['RoomService']

    age_bands = {
        'IsChild': age <= 12,
        'IsTeen': (age > 12) & (age <= 17),
        'IsYoungAdult': (age > 17) & (age <= 25),
        'IsAdult': (age > 25) & (age <= 60),
        'IsSenior': age > 60,
    }
    for flag_name, in_band in age_bands.items():
        out[flag_name] = in_band.astype(int)
    return out

# Add the spending/age features to both splits.
train, test = (create_features(frame) for frame in (train, test))
print("Features created")

Features created


In [5]:
# Prepare features with label encoding
from sklearn.preprocessing import LabelEncoder

cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

# Numeric inputs, in the order they appear in the feature matrix. The
# per-service ratio/spent/log names follow the pattern used when those
# columns were created.
spend_services = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
num_features = [
    'Age', *spend_services,
    'Group', 'PassengerNum', 'CabinNum', 'GroupSize', 'Solo',
    'TotalSpent', 'LuxurySpent', 'BasicSpent', 'LuxuryRatio', 'SpentPerAge', 'SpendingBin',
    'NumSpendingCategories', 'Spa_VRDeck_RoomService', 'FoodCourt_RoomService',
    *[f'{service}_ratio' for service in spend_services],
    *[f'{service}_spent' for service in spend_services],
    *[f'{service}_log' for service in spend_services],
    'TotalSpent_log', 'LuxurySpent_log', 'BasicSpent_log',
    'IsChild', 'IsTeen', 'IsYoungAdult', 'IsAdult', 'IsSenior',
]

# Label encode categoricals
# One encoder per column, fitted on the string form of train and test
# together so both splits share the same integer codes.
for col in cat_features:
    train_as_str = train[col].astype(str)
    test_as_str = test[col].astype(str)
    le = LabelEncoder()
    le.fit(pd.concat([train_as_str, test_as_str]))
    train[f'{col}_enc'] = le.transform(train_as_str)
    test[f'{col}_enc'] = le.transform(test_as_str)

# Encoded categoricals first, then the numeric columns.
feature_cols = [f'{col}_enc' for col in cat_features] + num_features
X = train[feature_cols].values
y = train['Transported'].astype(int).values
X_test = test[feature_cols].values

print(f"Features: {len(feature_cols)}")

Features: 49


In [None]:
# INVESTIGATION 1: CV Stability with Multiple Seeds
# Run same model with different random seeds to understand variance
#
# Each seed is used for BOTH the StratifiedKFold shuffle and CatBoost's
# random_seed, so the spread reported below combines split variance and
# model variance.
# NOTE(review): the fold passed as eval_set for early stopping is the same
# fold that is scored, so the stopping point is presumably chosen on the
# scoring data and the CV estimate may be optimistic - confirm this matches
# the experiments being reproduced.

print("=== CV STABILITY ANALYSIS ===")
print("Testing same hyperparameters with different random seeds...\n")

cat_params = dict(
    depth=8,
    learning_rate=0.051,
    iterations=755,
    l2_leaf_reg=3.52,
    verbose=False,
)

seeds = [42, 123, 456, 789, 1000]
cv_scores_by_seed = []

for seed in seeds:
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof_preds = np.zeros(len(X))
    fold_scores = []

    for train_idx, val_idx in skf.split(X, y):
        model = CatBoostClassifier(**cat_params, random_seed=seed)
        model.fit(
            X[train_idx], y[train_idx],
            eval_set=(X[val_idx], y[val_idx]),
            early_stopping_rounds=100,
        )

        val_proba = model.predict_proba(X[val_idx])[:, 1]
        oof_preds[val_idx] = val_proba
        fold_scores.append(accuracy_score(y[val_idx], (val_proba >= 0.5).astype(int)))

    # Accuracy over all out-of-fold predictions for this seed.
    cv_score = accuracy_score(y, (oof_preds >= 0.5).astype(int))
    cv_scores_by_seed.append(cv_score)
    print(f"Seed {seed}: CV = {cv_score:.5f} (fold std: {np.std(fold_scores):.5f})")

print(f"\nMean CV across seeds: {np.mean(cv_scores_by_seed):.5f}")
print(f"Std CV across seeds: {np.std(cv_scores_by_seed):.5f}")
print(f"Range: {min(cv_scores_by_seed):.5f} - {max(cv_scores_by_seed):.5f}")

In [None]:
# INVESTIGATION 2: Threshold Analysis
# Is threshold 0.47 really better, or is it overfitting?
#
# Out-of-fold probabilities from one 5-fold split (seed 42) are thresholded
# at 0.45, 0.46, ..., 0.55 and each result is compared with the 0.5 cut-off.
# NOTE(review): the thresholds are scored on the same OOF predictions they
# would be selected on, and the scored fold is also the early-stopping
# eval_set - both presumably make small gains here look better than they
# would on unseen data; confirm before acting on a sub-0.5% delta.

print("\n=== THRESHOLD ANALYSIS ===")
print("Analyzing threshold sensitivity across different CV splits...\n")

# Get OOF predictions with seed 42 (our standard)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = CatBoostClassifier(**cat_params, random_seed=42)
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)
    oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]

# Analyze threshold impact
# Build the grid from integers: with a float step, np.arange decides whether
# the 0.55 endpoint is present based on rounding error, and the middle value
# is not guaranteed to be exactly 0.5. Here the grid is always the 11 values
# 0.45..0.55 and 50/100 is exactly 0.5.
thresholds = np.arange(45, 56) / 100

# The 0.5 baseline does not depend on the loop variable; compute it once.
baseline_acc = accuracy_score(y, (oof_preds >= 0.5).astype(int))

print("Threshold | Accuracy | Pred Rate | Delta from 0.5")
print("-" * 55)

for t in thresholds:
    predicted_positive = oof_preds >= t
    acc = accuracy_score(y, predicted_positive.astype(int))
    pred_rate = predicted_positive.mean()
    delta = acc - baseline_acc
    print(f"  {t:.2f}    |  {acc:.5f}  |  {pred_rate:.4f}   |  {delta:+.5f}")

print(f"\nTraining transported rate: {y.mean():.4f}")

In [None]:
# INVESTIGATION 3: Feature Importance and Potential Overfitting Features
# Check if some features might be causing overfitting
#
# Importances are read from a single model fitted on the full training
# matrix (no hold-out) with the shared cat_params and seed 42.

print("\n=== FEATURE IMPORTANCE ANALYSIS ===")

# Train final model
model = CatBoostClassifier(**cat_params, random_seed=42)
model.fit(X, y, verbose=False)

# Get feature importances, most important first
importances = model.feature_importances_
feature_importance = pd.DataFrame({'feature': feature_cols, 'importance': importances})
feature_importance = feature_importance.sort_values('importance', ascending=False)

top_features = feature_importance.head(15)
print("Top 15 features:")
print(top_features.to_string(index=False))

bottom_features = feature_importance.tail(10)
print("\nBottom 10 features (candidates for removal):")
print(bottom_features.to_string(index=False))

# Count features with very low importance
is_low = feature_importance['importance'] < 1.0
low_importance = feature_importance.loc[is_low]
print(f"\nFeatures with importance < 1.0: {len(low_importance)}")

In [None]:
# INVESTIGATION 4: CV-LB Gap Analysis
# What's causing the gap to widen?
#
# Every figure below is derived from the recorded (CV, LB) pairs so the table,
# the observations and the "needed CV" estimates cannot drift apart.
# "Gap" is CV - LB; "Gap %" is that gap relative to the CV score.

print("\n=== CV-LB GAP ANALYSIS ===")
print("\nSubmission History:")
print("| Exp | CV Score | LB Score | Gap | Gap % |")
print("|-----|----------|----------|-----|-------|")

# (experiment, CV accuracy, leaderboard accuracy), oldest first.
submissions = [
    ("exp_000", 0.80674, 0.79705),
    ("exp_003", 0.81951, 0.80453),
]
for exp_name, cv, lb in submissions:
    gap = cv - lb
    print(f"| {exp_name} | {cv:.5f} | {lb:.5f} | {gap:+.5f} | {gap / cv:+.2%} |")

(_, first_cv, first_lb), (_, last_cv, last_lb) = submissions[0], submissions[-1]
first_rel_gap = (first_cv - first_lb) / first_cv
last_abs_gap = last_cv - last_lb
last_rel_gap = last_abs_gap / last_cv
cv_gain = last_cv - first_cv
lb_gain = last_lb - first_lb

print("\nObservations:")
print(f"1. Gap increased from {first_rel_gap:.2%} to {last_rel_gap:.2%} as CV improved")
print(f"2. CV improvement: {cv_gain:+.5f} ({cv_gain / first_cv:+.2%})")
print(f"3. LB improvement: {lb_gain:+.5f} ({lb_gain / first_lb:+.2%})")
print(f"4. LB improvement rate: {lb_gain / cv_gain:.1%} of CV improvement")
print("\nThis suggests we're overfitting to CV as we tune more.")

# CV needed to reach the target LB if the latest gap persists.
# The relative gap is defined against CV, so LB = CV * (1 - gap) and the
# inverse is a division, not a multiplication of LB by (1 + gap).
# The absolute gap (in accuracy points) is simply added to the target.
top_lb = 0.8066
needed_cv_relative = top_lb / (1 - last_rel_gap)
needed_cv_absolute = top_lb + last_abs_gap

print(f"\nTo beat top LB of {top_lb:.4f}:")
print(f"- Using {last_rel_gap:.2%} relative gap: Need CV of {needed_cv_relative:.5f}")
print(f"- Using {last_abs_gap * 100:.2f}pp absolute gap: Need CV of {needed_cv_absolute:.5f}")

In [None]:
# INVESTIGATION 5: Regularization Impact
# Try stronger regularization to reduce overfitting
#
# Every l2_leaf_reg value is scored on the same 5-fold split (seed 42) with
# the same model seed, so only the regularization strength varies.
# NOTE(review): the scored fold is also the early-stopping eval_set, so the
# CV figures are presumably optimistic for every setting - confirm whether
# the comparison between settings is still meaningful for the LB gap.

print("\n=== REGULARIZATION ANALYSIS ===")
print("Testing different l2_leaf_reg values...\n")

l2_values = [1.0, 3.52, 5.0, 7.0, 10.0]

for l2 in l2_values:
    # Shared hyperparameters with only the L2 strength overridden.
    params = {**cat_params, 'l2_leaf_reg': l2}

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_preds = np.zeros(len(X))
    fold_scores = []

    for train_idx, val_idx in skf.split(X, y):
        model = CatBoostClassifier(**params, random_seed=42)
        model.fit(
            X[train_idx], y[train_idx],
            eval_set=(X[val_idx], y[val_idx]),
            early_stopping_rounds=100,
        )
        val_proba = model.predict_proba(X[val_idx])[:, 1]
        oof_preds[val_idx] = val_proba
        fold_scores.append(accuracy_score(y[val_idx], (val_proba >= 0.5).astype(int)))

    cv_score = accuracy_score(y, (oof_preds >= 0.5).astype(int))
    print(f"l2_leaf_reg={l2:.1f}: CV={cv_score:.5f} (std={np.std(fold_scores):.5f})")

In [None]:
# SUMMARY AND RECOMMENDATIONS
# Item 1 is derived from cv_scores_by_seed (filled by the CV stability cell);
# the remaining items are static notes.
print("\n" + "="*60)
print("SUMMARY AND RECOMMENDATIONS")
print("="*60)

seed_std = np.std(cv_scores_by_seed)
seed_range = max(cv_scores_by_seed) - min(cv_scores_by_seed)
# Difference between the two CV scores reported for identical hyperparameters.
observed_discrepancy = 0.81951 - 0.81617

print("\n1. CV STABILITY:")
print(f"   - CV varies by ~{seed_std*100:.2f}% across random seeds")
# Only claim that seed variance explains the discrepancy when the measured
# seed-to-seed range is actually wide enough to cover it.
if seed_range >= observed_discrepancy:
    print("   - This explains the 0.81951 vs 0.81617 discrepancy")
else:
    print(
        f"   - Seed-to-seed range ({seed_range:.5f}) is smaller than the "
        f"0.81951 vs 0.81617 discrepancy ({observed_discrepancy:.5f}); "
        "seed variance alone does not explain it"
    )
print("   - Recommendation: Use average of multiple seeds for more stable estimates")

print("\n2. THRESHOLD TUNING:")
print("   - Threshold 0.47 gives marginal improvement on CV")
# NOTE(review): the 53.8% / 50.4% figures are hard-coded, not recomputed
# here - verify them against the threshold analysis output.
print("   - But shifts predicted distribution significantly (53.8% vs 50.4%)")
print("   - Risk: May hurt LB if test distribution matches training")
print("   - Recommendation: Submit with threshold 0.5 as baseline")

print("\n3. CV-LB GAP:")
print("   - Gap is widening (1.20% → 1.83%)")
print("   - We're overfitting to CV")
print("   - Recommendation: Focus on regularization and simpler models")

print("\n4. NEXT STEPS (Priority Order):")
print("   a) Submit exp_004 (threshold 0.47) to get LB feedback")
print("   b) If LB worse: Revert to threshold 0.5, increase regularization")
print("   c) Try stacking with logistic regression meta-learner")
print("   d) Feature selection to remove low-importance features")