# Stacking Ensemble with Enhanced Features

**Goal:** Implement stacking, following the reference kernel that achieved a 0.808 leaderboard (LB) score.

**Approach:**
1. 7 diverse base models generate out-of-fold predictions
2. OOF predictions become features for meta-learner
3. Add Ticket_Frequency and Name_Length features
4. Use XGBoost or LogisticRegression as meta-learner

**Reference:** `research/kernels/arthurtok_introduction-to-ensembling-stacking-in-python/`

In [None]:
import re
import warnings

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
# Suppress all Python warnings from this point on.
# NOTE(review): this also hides any convergence/deprecation warnings raised by
# the models trained later - confirm that is intended.
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Print the (rows, columns) shape of each split as a quick load check
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

In [None]:
def extract_title(name):
    """Return the honorific embedded in a passenger name, or "" if none is found.

    The title is the leftmost run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen".
    """
    match = re.search(r' ([A-Za-z]+)\.', name)
    return match.group(1) if match else ""

def process_features(df, ticket_freq_map=None, is_train=True):
    """Add the engineered feature columns to a copy of ``df``.

    Parameters
    ----------
    df : DataFrame
        Raw passenger rows; the columns Name, SibSp, Parch, Cabin, Ticket,
        Embarked, Fare and Pclass are read.
    ticket_freq_map : dict or None
        Mapping ticket -> count. When None, it is computed from ``df`` itself.
    is_train : bool
        Kept for call-site readability; the body does not read it.

    Returns
    -------
    (DataFrame, dict)
        The processed copy and the ticket frequency map that was applied.
    """
    out = df.copy()

    # 1. Title: extract, then fold uncommon/variant spellings into 5 groups
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don',
                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_aliases = {title: 'Rare' for title in rare_titles}
    title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    out['Title'] = out['Name'].apply(extract_title).replace(title_aliases)

    # 2. Family features (the passenger counts as a family member)
    family_size = out['SibSp'] + out['Parch'] + 1
    out['FamilySize'] = family_size
    out['IsAlone'] = family_size.eq(1).astype(int)
    out['FamilySize_Bin'] = pd.cut(
        family_size, bins=[0, 1, 4, 11], labels=[0, 1, 2]
    ).astype(int)

    # 3. Cabin features: presence flag and deck letter ('U' when cabin is missing)
    cabin = out['Cabin']
    out['Has_Cabin'] = cabin.notna().astype(int)
    out['Deck'] = cabin.apply(lambda c: 'U' if pd.isna(c) else c[0])

    # 4. Name_Length (NEW - 0.33 correlation with survival)
    out['Name_Length'] = out['Name'].apply(len)

    # 5. Ticket_Frequency (NEW - strong signal: 2-3 shared = 57-70% survival, 5+ = 0%)
    #    Tickets absent from the supplied map count as a frequency of 1.
    if ticket_freq_map is None:
        ticket_freq_map = out['Ticket'].value_counts().to_dict()
    out['Ticket_Frequency'] = out['Ticket'].map(ticket_freq_map).fillna(1)

    # 6. Embarked - fill missing with mode
    out['Embarked'] = out['Embarked'].fillna('S')

    # 7. Fare - fill missing with the median fare of the same Pclass in this frame
    fare_missing = out['Fare'].isna()
    if fare_missing.any():
        out['Fare'] = out.groupby('Pclass')['Fare'].transform(
            lambda grp: grp.fillna(grp.median())
        )

    return out, ticket_freq_map

# Process train first to get ticket frequency map
# (counts are computed from the training rows only, since no map is passed in)
train_processed, ticket_freq_map = process_features(train, is_train=True)
# Use train's ticket frequency map for test (to avoid leakage)
# Test tickets that never occur in train are assigned a frequency of 1.
test_processed, _ = process_features(test, ticket_freq_map=ticket_freq_map, is_train=False)

# Quick look at the two newly added features on the training split
print("New features:")
print(f"Name_Length range: {train_processed['Name_Length'].min()} - {train_processed['Name_Length'].max()}")
print(f"Ticket_Frequency distribution:")
print(train_processed['Ticket_Frequency'].value_counts().head(10))

In [None]:
# Age imputation using median by Pclass, Sex, Title
def impute_age(train_df, test_df):
    """Fill missing ``Age`` values in both frames from training-set medians.

    Each missing age is replaced by the median training age of the row's
    (Pclass, Sex, Title) group. The overall training median is used instead
    when the group does not occur in the training frame, or when it occurs
    but none of its training rows has a known age.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Frames with 'Pclass', 'Sex', 'Title' and 'Age' columns. Neither input
        is modified; copies are returned.

    Returns
    -------
    (DataFrame, DataFrame)
        Copies of ``train_df`` and ``test_df`` with 'Age' imputed.
    """
    train_df = train_df.copy()
    test_df = test_df.copy()

    group_cols = ['Pclass', 'Sex', 'Title']
    # dropna(): a group whose training ages are all missing has a NaN median.
    # Dropping it makes the lookup miss, so such rows fall back to the global
    # median instead of silently staying NaN.
    median_by_group = (
        train_df.groupby(group_cols)['Age'].median().dropna().to_dict()
    )
    fallback = train_df['Age'].median()

    def _fill_missing(frame):
        """Impute the rows of ``frame`` whose Age is missing, in place."""
        missing = frame['Age'].isna()
        if missing.any():
            keys = zip(*(frame.loc[missing, col] for col in group_cols))
            frame.loc[missing, 'Age'] = [
                median_by_group.get(key, fallback) for key in keys
            ]
        return frame

    return _fill_missing(train_df), _fill_missing(test_df)

# Impute missing ages in both splits; the medians come from the training split only
train_processed, test_processed = impute_age(train_processed, test_processed)

# Age_Bin feature
def add_age_bin(df):
    """Return a copy of ``df`` with an integer 'Age_Bin' column.

    Ages are bucketed into the right-closed intervals (0, 16], (16, 32],
    (32, 48] and (48, 100], coded 0 through 3.
    """
    edges = [0, 16, 32, 48, 100]
    codes = [0, 1, 2, 3]
    bucketed = pd.cut(df['Age'], bins=edges, labels=codes)
    return df.assign(Age_Bin=bucketed.astype(int))

# Add the binned-age column to both splits
train_processed = add_age_bin(train_processed)
test_processed = add_age_bin(test_processed)

# Report how many Age values are still missing in each split
print(f"Age missing: Train={train_processed['Age'].isna().sum()}, Test={test_processed['Age'].isna().sum()}")

In [None]:
# Encode categorical features
def encode_features(train_df, test_df):
    """Replace the categorical columns of both frames with integer codes.

    'Sex', 'Embarked', 'Title' and 'Deck' are mapped through fixed lookup
    tables. A 'Title' that is not in the table (including the empty string
    produced when no title could be extracted) is coded as 'Rare', and a
    'Deck' letter that is not in the table is coded as 'U' (unknown), so
    unseen categories do not introduce NaN into the feature matrix.
    Unrecognised 'Sex' or 'Embarked' values are left as NaN because no
    sensible default exists for them.

    Parameters
    ----------
    train_df, test_df : DataFrame
        Frames holding the four categorical columns. Inputs are not modified.

    Returns
    -------
    (DataFrame, DataFrame)
        Encoded copies of ``train_df`` and ``test_df``.
    """
    train_df = train_df.copy()
    test_df = test_df.copy()

    sex_map = {'female': 0, 'male': 1}
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}
    title_map = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
    deck_map = {'U': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}

    # Apply the same tables to both splits so their codes cannot drift apart.
    for frame in (train_df, test_df):
        frame['Sex'] = frame['Sex'].map(sex_map)
        frame['Embarked'] = frame['Embarked'].map(embarked_map)
        # map() yields NaN for unknown keys; fall back to the catch-all code
        # and restore the integer dtype that fillna() would otherwise lose.
        frame['Title'] = (
            frame['Title'].map(title_map).fillna(title_map['Rare']).astype('int64')
        )
        frame['Deck'] = (
            frame['Deck'].map(deck_map).fillna(deck_map['U']).astype('int64')
        )

    return train_df, test_df

# Turn the remaining string categoricals into integer codes for both splits
train_encoded, test_encoded = encode_features(train_processed, test_processed)

# Feature set (16 features now with Name_Length and Ticket_Frequency)
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 
            'Title', 'FamilySize', 'IsAlone', 'Has_Cabin', 
            'Deck', 'FamilySize_Bin', 'Age_Bin',
            'Name_Length', 'Ticket_Frequency']  # NEW features

# NumPy views of the data; train and test use the same column order
X = train_encoded[features].values
y = train_encoded['Survived'].values
X_test = test_encoded[features].values

print(f"Feature matrix shape: {X.shape}")
print(f"Features ({len(features)}): {features}")

In [None]:
# Define base models for stacking (same as voting ensemble)
# Each entry is (short name, unfitted estimator). The short name is used as a
# key for per-model CV scores and to decide which models receive scaled inputs.
base_models = [
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=4, random_state=42)),
    ('et', ExtraTreesClassifier(n_estimators=200, max_depth=6, min_samples_leaf=4, random_state=42)),
    ('ada', AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)),
    # probability=True is required because the stacking loop calls predict_proba
    ('svc', SVC(kernel='rbf', C=1.0, probability=True, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1, 
                          use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('lr', LogisticRegression(C=0.1, max_iter=1000, random_state=42))
]

# List the configured models by short name and estimator class
print(f"Base models ({len(base_models)}):")
for name, model in base_models:
    print(f"  - {name}: {type(model).__name__}")

In [None]:
# STACKING: Generate out-of-fold predictions
print("Generating out-of-fold predictions for stacking...")
print("="*60)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# The splitter is seeded, so every call to split() yields the same folds;
# compute them once and reuse them for every base model.
folds = list(kfold.split(X, y))

# Models whose inputs are standardised with a per-fold StandardScaler
scaled_models = {'svc', 'lr'}

# Arrays to store OOF predictions (one column per base model)
oof_train = np.zeros((len(X), len(base_models)))  # OOF predictions for training data
oof_test = np.zeros((len(X_test), len(base_models)))  # Test predictions

# Track individual model CV scores
model_cv_scores = {name: [] for name, _ in base_models}

for i, (name, model_template) in enumerate(base_models):
    print(f"\nProcessing {name}...")
    oof_test_folds = np.zeros((len(X_test), kfold.n_splits))

    for fold, (train_idx, val_idx) in enumerate(folds):
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Inputs actually fed to the model; raw unless the model needs scaling.
        X_fit, X_val_in, X_test_in = X_train_fold, X_val_fold, X_test
        if name in scaled_models:
            # Fit the scaler on the training part of the fold only, so the
            # validation and test rows never influence the scaling statistics.
            scaler = StandardScaler()
            X_fit = scaler.fit_transform(X_train_fold)
            X_val_in = scaler.transform(X_val_fold)
            X_test_in = scaler.transform(X_test)

        # Fresh, unfitted copy of the template for each fold
        model = clone(model_template)
        model.fit(X_fit, y_train_fold)

        # Positive-class probability becomes the meta-feature
        oof_train[val_idx, i] = model.predict_proba(X_val_in)[:, 1]
        oof_test_folds[:, fold] = model.predict_proba(X_test_in)[:, 1]

        val_pred = model.predict(X_val_in)
        model_cv_scores[name].append(accuracy_score(y_val_fold, val_pred))

    # Average test predictions across folds
    oof_test[:, i] = oof_test_folds.mean(axis=1)
    print(f"  {name} CV: {np.mean(model_cv_scores[name]):.4f} (+/- {np.std(model_cv_scores[name]):.4f})")

print("\n" + "="*60)
print("Base model CV scores:")
for name, scores in model_cv_scores.items():
    print(f"  {name:5s}: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

In [None]:
# STACKING: Train meta-learner on OOF predictions
print("\nTraining meta-learner on stacked features...")
print("="*60)

# Try both XGBoost and LogisticRegression as meta-learners
# XGBoost meta-learner (as in reference kernel)
meta_xgb = XGBClassifier(
    n_estimators=2000,
    max_depth=4,
    learning_rate=0.01,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# LogisticRegression meta-learner (simpler, may generalize better)
meta_lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)

# Evaluate meta-learners with CV
kfold_meta = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


def _meta_cv_scores(estimator, meta_features, labels, splitter):
    """Return the per-fold validation accuracies of ``estimator``.

    A fresh clone is fitted on each training fold, so ``estimator`` itself is
    left unfitted.
    """
    fold_scores = []
    for train_idx, val_idx in splitter.split(meta_features, labels):
        fold_model = clone(estimator)
        fold_model.fit(meta_features[train_idx], labels[train_idx])
        fold_pred = fold_model.predict(meta_features[val_idx])
        fold_scores.append(accuracy_score(labels[val_idx], fold_pred))
    return fold_scores


# XGBoost meta-learner CV
xgb_cv_scores = _meta_cv_scores(meta_xgb, oof_train, y, kfold_meta)
print(f"Meta XGBoost CV: {np.mean(xgb_cv_scores):.4f} (+/- {np.std(xgb_cv_scores):.4f})")

# LogisticRegression meta-learner CV
lr_cv_scores = _meta_cv_scores(meta_lr, oof_train, y, kfold_meta)
print(f"Meta LR CV: {np.mean(lr_cv_scores):.4f} (+/- {np.std(lr_cv_scores):.4f})")

# Choose best meta-learner (ties go to XGBoost)
if np.mean(xgb_cv_scores) >= np.mean(lr_cv_scores):
    best_meta = 'xgb'
    best_cv = np.mean(xgb_cv_scores)
    best_std = np.std(xgb_cv_scores)
    print(f"\nBest meta-learner: XGBoost")
else:
    best_meta = 'lr'
    best_cv = np.mean(lr_cv_scores)
    best_std = np.std(lr_cv_scores)
    print(f"\nBest meta-learner: LogisticRegression")

# The voting-ensemble figures below are hard-coded reference values.
print(f"\nStacking CV: {best_cv:.4f} (+/- {best_std:.4f})")
print(f"Previous best (Voting Ensemble): 0.8372 (+/- 0.0239)")
print(f"Improvement: {best_cv - 0.8372:+.4f}")

In [None]:
# Train final meta-learner on all OOF predictions and generate test predictions
print("\nGenerating final predictions...")

# Clone the estimator that was cross-validated rather than re-typing its
# hyperparameters, so the final model cannot drift from the evaluated one.
final_meta = clone(meta_xgb if best_meta == 'xgb' else meta_lr)

final_meta.fit(oof_train, y)
test_predictions = final_meta.predict(oof_test)

# Also get probabilities for potential threshold tuning
test_proba = final_meta.predict_proba(oof_test)[:, 1]

print(f"Test predictions generated using {best_meta} meta-learner")
print(f"Prediction distribution: {(test_predictions == 0).sum()} died, {(test_predictions == 1).sum()} survived")

In [None]:
# Generate submission
# Two columns: the test-set passenger ids and the predicted class labels
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_predictions
})

# index=False keeps the DataFrame index out of the CSV
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved to /home/submission/submission.csv")
print(f"\nSubmission preview:")
print(submission.head(10))
print(f"\nPrediction distribution:")
print(submission['Survived'].value_counts())
# The baseline and voting-ensemble counts below are hard-coded literals,
# presumably copied from earlier experiments - verify before relying on them.
print(f"\nComparison:")
print(f"  Baseline (XGBoost): 267 died, 151 survived")
print(f"  Voting Ensemble:    255 died, 163 survived")
print(f"  Stacking:           {(test_predictions == 0).sum()} died, {(test_predictions == 1).sum()} survived")

In [None]:
# Summary
# Assemble the whole report first, then emit it with a single print call.
# Baseline / voting-ensemble figures and the LB calibration are fixed literals.
summary_lines = [
    "\n" + "="*60,
    "EXPERIMENT SUMMARY",
    "="*60,
    f"\nFeatures used: {len(features)}",
    f"  - Original: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked",
    f"  - Engineered: Title, FamilySize, IsAlone, Has_Cabin, Deck, FamilySize_Bin, Age_Bin",
    f"  - NEW: Name_Length, Ticket_Frequency",
    f"\nBase models: {len(base_models)}",
    f"Meta-learner: {best_meta}",
    f"\nCV Scores:",
    f"  Baseline (XGBoost):    0.8316 (+/- 0.0324)",
    f"  Voting Ensemble:       0.8372 (+/- 0.0239)",
    f"  Stacking:              {best_cv:.4f} (+/- {best_std:.4f})",
    f"\nImprovement over baseline: {best_cv - 0.8316:+.4f}",
    f"Improvement over voting:   {best_cv - 0.8372:+.4f}",
    f"\nExpected LB (using calibration LB = 2.55*CV - 1.37):",
    f"  Predicted LB: {2.55 * best_cv - 1.37:.4f}",
    f"  Previous best LB: 0.7727",
]
print("\n".join(summary_lines))