# Experiment 002: Stacking Ensemble with Fixed Data Leakage

Following the evolved strategy:
1. Fix Age imputation to use TRAINING data only (no leakage)
2. Add new features: Ticket_Frequency, Sex_Pclass, Age_Bin, Name_Length
3. Implement stacking with diverse base models
4. Reduce reliance on Age (primary source of distribution shift)

Target: CV ~82%, LB ~79-80%

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier, 
    GradientBoostingClassifier, AdaBoostClassifier,
    StackingClassifier
)
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (891, 12)
Test shape: (418, 11)


In [2]:
def engineer_features(df):
    """Apply feature engineering - same as before"""
    df = df.copy()
    
    # 1. Title extraction (MOST IMPORTANT)
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    # 2. Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # 3. Cabin features
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    
    # 4. NEW: Name length (mentioned in stacking kernel)
    df['Name_Length'] = df['Name'].apply(len)
    
    return df

# Apply feature engineering
train = engineer_features(train)
test = engineer_features(test)

print("Titles in train:", train['Title'].value_counts().to_dict())

Titles in train: {'Mr': 517, 'Miss': 185, 'Mrs': 126, 'Master': 40, 'Rare': 23}


In [3]:
def fill_missing_values_no_leakage(train_df, test_df):
    """Fill missing values using TRAINING data statistics only (no leakage)"""
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # Age: Compute median by Title from TRAINING data only
    title_age_median = train_df.groupby('Title')['Age'].median()
    train_median_age = train_df['Age'].median()
    
    # Apply to train
    for title in train_df['Title'].unique():
        mask = (train_df['Title'] == title) & (train_df['Age'].isna())
        if title in title_age_median:
            train_df.loc[mask, 'Age'] = title_age_median[title]
        else:
            train_df.loc[mask, 'Age'] = train_median_age
    
    # Apply same values to test (using train statistics)
    for title in test_df['Title'].unique():
        mask = (test_df['Title'] == title) & (test_df['Age'].isna())
        if title in title_age_median:
            test_df.loc[mask, 'Age'] = title_age_median[title]
        else:
            test_df.loc[mask, 'Age'] = train_median_age
    
    # Embarked: Fill with mode from training data
    embarked_mode = train_df['Embarked'].mode()[0]
    train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)
    test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)
    
    # Fare: Fill with median by Pclass from training data
    for pclass in [1, 2, 3]:
        fare_median = train_df[train_df['Pclass'] == pclass]['Fare'].median()
        train_df.loc[(train_df['Pclass'] == pclass) & (train_df['Fare'].isna()), 'Fare'] = fare_median
        test_df.loc[(test_df['Pclass'] == pclass) & (test_df['Fare'].isna()), 'Fare'] = fare_median
    
    return train_df, test_df

train, test = fill_missing_values_no_leakage(train, test)

print("Missing values after imputation:")
print("Train:", train[['Age', 'Embarked', 'Fare']].isna().sum().to_dict())
print("Test:", test[['Age', 'Embarked', 'Fare']].isna().sum().to_dict())

Missing values after imputation:
Train: {'Age': 0, 'Embarked': 0, 'Fare': 0}
Test: {'Age': 0, 'Embarked': 0, 'Fare': 0}


In [4]:
def add_new_features(train_df, test_df):
    """Add new features recommended by strategy"""
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # Combine for ticket frequency calculation
    combined = pd.concat([train_df, test_df], sort=False)
    
    # 1. Ticket_Frequency - count of passengers with same ticket
    ticket_counts = combined['Ticket'].value_counts()
    train_df['Ticket_Frequency'] = train_df['Ticket'].map(ticket_counts)
    test_df['Ticket_Frequency'] = test_df['Ticket'].map(ticket_counts)
    
    # 2. Sex_Pclass interaction feature
    train_df['Sex_Pclass'] = train_df['Sex'] + '_' + train_df['Pclass'].astype(str)
    test_df['Sex_Pclass'] = test_df['Sex'] + '_' + test_df['Pclass'].astype(str)
    
    # 3. Age_Bin - broad categories to reduce sensitivity to Age distribution shift
    # Using simple bins: Child (0-16), Adult (16-50), Senior (50+)
    def age_bin(age):
        if age <= 16:
            return 'Child'
        elif age <= 50:
            return 'Adult'
        else:
            return 'Senior'
    
    train_df['Age_Bin'] = train_df['Age'].apply(age_bin)
    test_df['Age_Bin'] = test_df['Age'].apply(age_bin)
    
    # 4. IsChild - binary feature (children had higher survival)
    train_df['IsChild'] = (train_df['Age'] <= 16).astype(int)
    test_df['IsChild'] = (test_df['Age'] <= 16).astype(int)
    
    # 5. FamilySize_Category - non-linear relationship
    def family_category(size):
        if size == 1:
            return 'Alone'
        elif size <= 4:
            return 'Small'
        else:
            return 'Large'
    
    train_df['FamilySize_Cat'] = train_df['FamilySize'].apply(family_category)
    test_df['FamilySize_Cat'] = test_df['FamilySize'].apply(family_category)
    
    return train_df, test_df

train, test = add_new_features(train, test)

print("New features added:")
print("Ticket_Frequency range:", train['Ticket_Frequency'].min(), "-", train['Ticket_Frequency'].max())
print("Sex_Pclass values:", train['Sex_Pclass'].unique())
print("Age_Bin distribution:", train['Age_Bin'].value_counts().to_dict())

New features added:
Ticket_Frequency range: 1 - 11
Sex_Pclass values: ['male_3' 'female_1' 'female_3' 'male_1' 'female_2' 'male_2']
Age_Bin distribution: {'Adult': 723, 'Child': 104, 'Senior': 64}


In [5]:
# Prepare features for modeling
def prepare_features(train_df, test_df):
    """Encode categorical features and select final feature set"""
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # Encode categorical variables
    categorical_cols = ['Sex', 'Embarked', 'Title', 'Sex_Pclass', 'Age_Bin', 'FamilySize_Cat']
    
    for col in categorical_cols:
        le = LabelEncoder()
        combined = pd.concat([train_df[col], test_df[col]])
        le.fit(combined)
        train_df[col] = le.transform(train_df[col])
        test_df[col] = le.transform(test_df[col])
    
    # Select features - reduce Age importance by using Age_Bin instead of raw Age
    # Strategy: Use Age_Bin and IsChild instead of raw Age to reduce distribution shift impact
    features = [
        'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked',
        'Title', 'FamilySize', 'IsAlone', 'Has_Cabin',
        'Name_Length', 'Ticket_Frequency', 'Sex_Pclass',
        'Age_Bin', 'IsChild', 'FamilySize_Cat'
    ]
    
    X_train = train_df[features]
    y_train = train_df['Survived']
    X_test = test_df[features]
    
    return X_train, y_train, X_test, features

X_train, y_train, X_test, features = prepare_features(train, test)

print(f"Features ({len(features)}): {features}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

Features (16): ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 'Name_Length', 'Ticket_Frequency', 'Sex_Pclass', 'Age_Bin', 'IsChild', 'FamilySize_Cat']
X_train shape: (891, 16)
X_test shape: (418, 16)


In [6]:
# Define base models for stacking (diverse algorithms)
base_models = [
    ('rf', RandomForestClassifier(
        n_estimators=100, max_depth=6, min_samples_split=4,
        random_state=42, n_jobs=-1
    )),
    ('et', ExtraTreesClassifier(
        n_estimators=100, max_depth=6, min_samples_split=4,
        random_state=42, n_jobs=-1
    )),
    ('gb', GradientBoostingClassifier(
        n_estimators=100, max_depth=3, learning_rate=0.1,
        random_state=42
    )),
    ('ada', AdaBoostClassifier(
        n_estimators=100, learning_rate=0.1,
        random_state=42
    )),
    ('svc', SVC(
        kernel='rbf', C=1.0, probability=True,
        random_state=42
    ))
]

# Create stacking classifier with LogisticRegression as meta-learner
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5,
    n_jobs=-1
)

print("Stacking classifier created with 5 base models:")
for name, model in base_models:
    print(f"  - {name}: {model.__class__.__name__}")

Stacking classifier created with 5 base models:
  - rf: RandomForestClassifier
  - et: ExtraTreesClassifier
  - gb: GradientBoostingClassifier
  - ada: AdaBoostClassifier
  - svc: SVC


In [7]:
# Scale features for SVC (important for RBF kernel)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate stacking classifier
print("Running 5-fold CV for stacking classifier...")
cv_scores = cross_val_score(stacking_clf, X_train_scaled, y_train, cv=skf, scoring='accuracy')

print(f"\nCV Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.5f} (+/- {cv_scores.std():.5f})")
print(f"Min: {cv_scores.min():.5f}, Max: {cv_scores.max():.5f}")

Running 5-fold CV for stacking classifier...























CV Scores: [0.84357542 0.8258427  0.80898876 0.83146067 0.8258427 ]
Mean CV Accuracy: 0.82714 (+/- 0.01115)
Min: 0.80899, Max: 0.84358


In [8]:
# Also evaluate individual base models for comparison
print("\nIndividual base model CV scores:")
for name, model in base_models:
    scores = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='accuracy')
    print(f"  {name}: {scores.mean():.5f} (+/- {scores.std():.5f})")


Individual base model CV scores:


  rf: 0.82938 (+/- 0.01513)


  et: 0.82940 (+/- 0.00858)


  gb: 0.83048 (+/- 0.02368)


  ada: 0.82154 (+/- 0.00921)


  svc: 0.83613 (+/- 0.01101)


In [9]:
# Train stacking classifier on full training data
print("Training stacking classifier on full training data...")
stacking_clf.fit(X_train_scaled, y_train)

# Make predictions on test set
test_predictions = stacking_clf.predict(X_test_scaled)

# Load original test data for PassengerId
test_original = pd.read_csv('/home/data/test.csv')

# Create submission
submission = pd.DataFrame({
    'PassengerId': test_original['PassengerId'],
    'Survived': test_predictions
})

# Save submission
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"\nSubmission saved with {len(submission)} predictions")
print(submission.head())
print(f"\nPrediction distribution: {pd.Series(test_predictions).value_counts().to_dict()}")

Training stacking classifier on full training data...







Submission saved with 418 predictions
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1

Prediction distribution: {0: 255, 1: 163}


In [10]:
# Summary
print("="*60)
print("EXPERIMENT 002 SUMMARY")
print("="*60)
print(f"Model: Stacking Ensemble (RF, ET, GB, AdaBoost, SVC + LR meta)")
print(f"Features: {len(features)} features")
print(f"New features: Name_Length, Ticket_Frequency, Sex_Pclass, Age_Bin, IsChild, FamilySize_Cat")
print(f"Data leakage fix: Age imputation from training data only")
print(f"\nCV Accuracy: {cv_scores.mean():.5f} (+/- {cv_scores.std():.5f})")
print(f"Previous best CV: 0.83388")
print(f"Previous best LB: 0.7799")
print("="*60)

EXPERIMENT 002 SUMMARY
Model: Stacking Ensemble (RF, ET, GB, AdaBoost, SVC + LR meta)
Features: 16 features
New features: Name_Length, Ticket_Frequency, Sex_Pclass, Age_Bin, IsChild, FamilySize_Cat
Data leakage fix: Age imputation from training data only

CV Accuracy: 0.82714 (+/- 0.01115)
Previous best CV: 0.83388
Previous best LB: 0.7799
