# Experiment 002: Reduce Overfitting + Ticket Features

This experiment addresses the CV-LB gap (0.8361 CV vs 0.7799 LB, a gap of about 5.6 percentage points):
1. Add Ticket_Group_Size feature
2. Use simpler/regularized models
3. Remove potentially overfitting features (AgeBin, FareBin)
4. Try ensemble of regularized models

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Read the two input CSVs, reporting each frame's shape right after it is read.
train = pd.read_csv('/home/data/train.csv')
print(f"Train shape: {train.shape}")

test = pd.read_csv('/home/data/test.csv')
print(f"Test shape: {test.shape}")

In [None]:
# Stack train and test into one frame so every engineered feature is derived
# the same way for both. `is_train` records each row's origin so the frame can
# be split again later; test rows get a NaN placeholder in `Survived`.
train['is_train'] = 1
test['is_train'] = 0
test['Survived'] = np.nan

# Row-wise concatenation (the default axis) with a fresh 0..n-1 index.
df = pd.concat((train, test), ignore_index=True)

print(f"Combined shape: {df.shape}")

In [None]:
# Feature Engineering

# 1. Extract Title from Name
import re
def extract_title(name):
    """Return the title embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period (e.g. "Mr" in "Braund, Mr. Owen").
    Returns 'Unknown' when the name contains no such run.
    """
    found = re.search(r' ([A-Za-z]+)\.', name)
    return found.group(1) if found else 'Unknown'

df['Title'] = df['Name'].apply(extract_title)

# Group rare titles.
# The four common titles map to themselves, the French/alternate spellings are
# folded into their English equivalents, and everything else listed is 'Rare'.
title_mapping = {common: common for common in ('Mr', 'Miss', 'Mrs', 'Master')}
title_mapping.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
title_mapping.update(dict.fromkeys(
    ('Dr', 'Rev', 'Col', 'Major', 'Capt',
     'Lady', 'Countess', 'Sir', 'Don', 'Dona',
     'Jonkheer'),
    'Rare',
))

# Titles absent from the mapping (including 'Unknown') also become 'Rare'.
df['Title'] = df['Title'].map(lambda x: title_mapping.get(x, 'Rare'))

print("Title distribution:")
print(df['Title'].value_counts())

In [None]:
# 2. Family features
# FamilySize counts the passenger plus the SibSp and Parch counts;
# IsAlone is 1 exactly when that total is 1.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

# 3. Cabin feature: 1 when a Cabin value is present, 0 when it is missing.
df['Has_Cabin'] = (~df['Cabin'].isna()).astype(int)

# 4. NEW: Ticket group size - number of rows (train and test combined) that
# share the same Ticket value, broadcast back onto every row of the group.
ticket_counts = df['PassengerId'].groupby(df['Ticket']).transform('count')
df['Ticket_Group_Size'] = ticket_counts

print("Ticket_Group_Size distribution:")
print(df['Ticket_Group_Size'].value_counts().sort_index())

In [None]:
# 5. FarePerPerson - fare divided by the number of passengers on the same ticket.
# Rows whose Fare is missing at this point produce NaN here.
df['FarePerPerson'] = df['Fare'] / df['Ticket_Group_Size']

# 6. Age imputation - median Age of the training rows, looked up from the most
# specific grouping to the least: (Sex, Pclass, Title) -> (Sex, Pclass) -> overall.
# Only training rows feed the medians, so test-set ages do not influence them.
train_mask = df['is_train'] == 1
train_rows = df[train_mask]
age_medians = train_rows.groupby(['Sex', 'Pclass', 'Title'])['Age'].median()
# The fallback tables are built once here rather than recomputed for each row.
age_medians_by_sex_class = train_rows.groupby(['Sex', 'Pclass'])['Age'].median()
overall_age_median = train_rows['Age'].median()


def impute_age(row):
    """Return the row's Age, or a training-set median when Age is missing.

    Lookup order for a missing Age:
      1. median for the row's (Sex, Pclass, Title) group,
      2. median for the row's (Sex, Pclass) group,
      3. overall training median.

    A level is skipped both when the group is absent from the table
    (KeyError) and when its stored median is NaN, which happens for a group
    whose training rows all lack an Age. Without the NaN check such a group
    would hand back NaN and leave the value un-imputed.
    """
    if not pd.isna(row['Age']):
        return row['Age']

    lookups = (
        (age_medians, (row['Sex'], row['Pclass'], row['Title'])),
        (age_medians_by_sex_class, (row['Sex'], row['Pclass'])),
    )
    for medians, key in lookups:
        try:
            candidate = medians.loc[key]
        except KeyError:
            continue
        if pd.notna(candidate):
            return candidate
    return overall_age_median


df['Age'] = df.apply(impute_age, axis=1)
print(f"Missing Age after imputation: {df['Age'].isna().sum()}")

In [None]:
# 7. Fare handling - missing fares take the median Fare of training rows in
# the same Pclass.
fare_medians = df.loc[train_mask].groupby('Pclass')['Fare'].median()


def impute_fare(row):
    """Return the row's Fare, or the training median for its Pclass if missing."""
    fare = row['Fare']
    return fare_medians.loc[row['Pclass']] if pd.isna(fare) else fare


df['Fare'] = df.apply(impute_fare, axis=1)
# Recalculate after imputation so rows that had no Fare get a value too.
df['FarePerPerson'] = df['Fare'] / df['Ticket_Group_Size']

# 8. Embarked - missing ports are filled with the hard-coded value 'S'
# (presumably the most frequent port; it is not computed from the data here).
df['Embarked'] = df['Embarked'].fillna('S')

# Encode categorical features as integer codes. Any value missing from a
# code table becomes NaN, as with any Series.map lookup.
for column, codes in (
    ('Sex', {'male': 0, 'female': 1}),
    ('Embarked', {'S': 0, 'C': 1, 'Q': 2}),
    ('Title', {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}),
):
    df[column] = df[column].map(codes)

print("Feature encoding complete")

In [None]:
# Select SIMPLER features - AgeBin and FareBin are left out to reduce overfitting
features = [
    'Pclass', 'Sex', 'Age', 'Fare', 'Embarked',
    'Title', 'FamilySize', 'IsAlone', 'Has_Cabin',
    'Ticket_Group_Size', 'FarePerPerson',
]

# Split the combined frame back into its two origins using the is_train flag.
# Copies are taken so later edits do not touch `df`.
train_df = df.loc[df['is_train'].eq(1)].copy()
test_df = df.loc[df['is_train'].eq(0)].copy()

# Plain NumPy arrays for scikit-learn: design matrices, target, and test ids.
X = train_df.loc[:, features].values
y = train_df['Survived'].values
X_test = test_df.loc[:, features].values
test_ids = test_df['PassengerId'].values

print(f"X shape: {X.shape}")
print(f"Features: {features}")

In [None]:
# Test 1: Logistic Regression (simple, regularized)
# Stratified K-fold CV: per-fold validation accuracy goes into fold_scores_lr,
# and the fold models' test-set probabilities are averaged into test_preds_lr.
print("=" * 50)
print("Model 1: Logistic Regression (L2 regularized)")
print("=" * 50)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# The averaging divisor is taken from the splitter so the test probabilities
# stay a true mean if n_splits is ever changed (previously a hard-coded 5).
n_folds = kfold.get_n_splits()
fold_scores_lr = []
test_preds_lr = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Scale features for LR. The scaler is fitted on the training part of the
    # fold only, then applied to the validation and test features.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(C=0.5, max_iter=1000, random_state=42)
    model.fit(X_train_scaled, y_train)

    val_pred = model.predict(X_val_scaled)
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores_lr.append(fold_acc)
    # Column 1 of predict_proba is the probability of the positive class.
    test_preds_lr += model.predict_proba(X_test_scaled)[:, 1] / n_folds
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

mean_cv_lr = np.mean(fold_scores_lr)
std_cv_lr = np.std(fold_scores_lr)
print(f"\nLogistic Regression CV: {mean_cv_lr:.4f} ± {std_cv_lr:.4f}")

In [None]:
# Test 2: Shallow Random Forest (max_depth=4)
# Reuses the existing `kfold` splitter: per-fold validation accuracy goes into
# fold_scores_rf, and test-set probabilities are averaged into test_preds_rf.
print("\n" + "=" * 50)
print("Model 2: Shallow Random Forest (max_depth=4)")
print("=" * 50)

# The averaging divisor is taken from the splitter so the test probabilities
# stay a true mean if n_splits is ever changed (previously a hard-coded 5).
n_folds = kfold.get_n_splits()
fold_scores_rf = []
test_preds_rf = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Tree models are fitted on the unscaled features.
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=4,  # Shallow!
        min_samples_split=10,
        min_samples_leaf=4,
        criterion='entropy',
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores_rf.append(fold_acc)
    # Column 1 of predict_proba is the probability of the positive class.
    test_preds_rf += model.predict_proba(X_test)[:, 1] / n_folds
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

mean_cv_rf = np.mean(fold_scores_rf)
std_cv_rf = np.std(fold_scores_rf)
print(f"\nShallow RF CV: {mean_cv_rf:.4f} ± {std_cv_rf:.4f}")

In [None]:
# Test 3: Gradient Boosting with regularization
# Reuses the existing `kfold` splitter: per-fold validation accuracy goes into
# fold_scores_gb, and test-set probabilities are averaged into test_preds_gb.
print("\n" + "=" * 50)
print("Model 3: Gradient Boosting (regularized)")
print("=" * 50)

# The averaging divisor is taken from the splitter so the test probabilities
# stay a true mean if n_splits is ever changed (previously a hard-coded 5).
n_folds = kfold.get_n_splits()
fold_scores_gb = []
test_preds_gb = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Tree models are fitted on the unscaled features.
    model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=3,  # Shallow
        min_samples_split=10,
        min_samples_leaf=4,
        subsample=0.8,
        random_state=42
    )
    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores_gb.append(fold_acc)
    # Column 1 of predict_proba is the probability of the positive class.
    test_preds_gb += model.predict_proba(X_test)[:, 1] / n_folds
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

mean_cv_gb = np.mean(fold_scores_gb)
std_cv_gb = np.std(fold_scores_gb)
print(f"\nGradient Boosting CV: {mean_cv_gb:.4f} ± {std_cv_gb:.4f}")

In [None]:
# Test 4: SVC with RBF kernel
# Reuses the existing `kfold` splitter: per-fold validation accuracy goes into
# fold_scores_svc, and test-set probabilities are averaged into test_preds_svc.
print("\n" + "=" * 50)
print("Model 4: SVC (RBF kernel)")
print("=" * 50)

# The averaging divisor is taken from the splitter so the test probabilities
# stay a true mean if n_splits is ever changed (previously a hard-coded 5).
n_folds = kfold.get_n_splits()
fold_scores_svc = []
test_preds_svc = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # The scaler is fitted on the training part of the fold only, then applied
    # to the validation and test features.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # probability=True is required for predict_proba to be available below.
    model = SVC(C=1.0, kernel='rbf', probability=True, random_state=42)
    model.fit(X_train_scaled, y_train)

    val_pred = model.predict(X_val_scaled)
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores_svc.append(fold_acc)
    # Column 1 of predict_proba is the probability of the positive class.
    test_preds_svc += model.predict_proba(X_test_scaled)[:, 1] / n_folds
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

mean_cv_svc = np.mean(fold_scores_svc)
std_cv_svc = np.std(fold_scores_svc)
print(f"\nSVC CV: {mean_cv_svc:.4f} ± {std_cv_svc:.4f}")

In [None]:
# Summary of individual models: mean ± std of the per-fold accuracies for the
# four models, followed by the previous experiment's figure for comparison.
summary_lines = (
    "\n" + "=" * 50,
    "SUMMARY OF INDIVIDUAL MODELS",
    "=" * 50,
    f"Logistic Regression: {mean_cv_lr:.4f} ± {std_cv_lr:.4f}",
    f"Shallow RF:          {mean_cv_rf:.4f} ± {std_cv_rf:.4f}",
    f"Gradient Boosting:   {mean_cv_gb:.4f} ± {std_cv_gb:.4f}",
    f"SVC:                 {mean_cv_svc:.4f} ± {std_cv_svc:.4f}",
    f"\nBaseline RF (exp_001): 0.8361 ± 0.0069",
)
for line in summary_lines:
    print(line)

In [None]:
# Ensemble: Average predictions from all models
print("\n" + "=" * 50)
print("ENSEMBLE: Soft Voting (Average Probabilities)")
print("=" * 50)

# Test-set ensemble: unweighted mean of the four models' fold-averaged
# probabilities, thresholded at 0.5 to get the class label.
ensemble_preds = (test_preds_lr + test_preds_rf + test_preds_gb + test_preds_svc) / 4
ensemble_binary = (ensemble_preds >= 0.5).astype(int)

# CV score of the ensemble via out-of-fold predictions: every training row is
# scored by models that were fitted without it.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_ensemble = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # LR and SVC use standardized features; the scaler sees only the training
    # part of the fold.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Fit the four models with the same hyper-parameters as the individual runs.
    lr = LogisticRegression(C=0.5, max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

    rf = RandomForestClassifier(
        n_estimators=200, max_depth=4, min_samples_split=10,
        min_samples_leaf=4, criterion='entropy', random_state=42, n_jobs=-1,
    ).fit(X_train, y_train)

    gb = GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.05, max_depth=3,
        min_samples_split=10, min_samples_leaf=4, subsample=0.8, random_state=42,
    ).fit(X_train, y_train)

    svc = SVC(C=1.0, kernel='rbf', probability=True, random_state=42).fit(X_train_scaled, y_train)

    # Positive-class probability from each model on the held-out rows.
    lr_proba = lr.predict_proba(X_val_scaled)[:, 1]
    rf_proba = rf.predict_proba(X_val)[:, 1]
    gb_proba = gb.predict_proba(X_val)[:, 1]
    svc_proba = svc.predict_proba(X_val_scaled)[:, 1]

    # Unweighted mean of the four probabilities.
    val_pred_proba = (lr_proba + rf_proba + gb_proba + svc_proba) / 4
    oof_ensemble[val_idx] = val_pred_proba

ensemble_oof_binary = (oof_ensemble >= 0.5).astype(int)
ensemble_cv = accuracy_score(y, ensemble_oof_binary)
print(f"\nEnsemble CV Accuracy: {ensemble_cv:.4f}")

In [None]:
# Create submission with ensemble: one row per test passenger, holding the
# integer passenger id and the thresholded ensemble prediction.
submission_columns = {
    'PassengerId': test_ids.astype(int),
    'Survived': ensemble_binary,
}
submission = pd.DataFrame(submission_columns)

# index=False keeps the row index out of the CSV.
submission.to_csv('/home/submission/submission.csv', index=False)

print(f"Submission saved with {len(submission)} predictions")
print(f"Predicted survival rate: {ensemble_binary.mean():.3f}")
print(submission.head(10))

In [None]:
# Final summary: previous experiment's scores, this experiment's ensemble CV
# accuracy, and a list of what changed.
final_lines = (
    "\n" + "=" * 50,
    "FINAL SUMMARY",
    "=" * 50,
    f"Previous baseline (exp_001): CV = 0.8361, LB = 0.7799",
    f"This experiment ensemble:    CV = {ensemble_cv:.4f}",
    f"\nChanges made:",
    "- Added Ticket_Group_Size and FarePerPerson features",
    "- Removed AgeBin and FareBin (potential overfitting)",
    "- Used simpler/regularized models",
    "- Ensemble of LR, shallow RF, GB, SVC",
    f"\nExpected: Lower CV but hopefully better LB correlation",
)
for line in final_lines:
    print(line)