# Family/Ticket Survival Rate Encoding + Voting Ensemble

**Goal:** Add the strongest predictive features identified:
- Family_Survival_Rate (surname-based): 0.76 correlation with Survived
- Ticket_Survival_Rate: 0.81 correlation with Survived

**CRITICAL:** This is target encoding — the survival rates must be recomputed inside every CV fold, using only that fold's training rows, so that no validation label leaks into the features!

**Reference:** Advanced FE kernel achieved 0.837 LB with this technique

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                              ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Read the training and test CSVs into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Quick sanity check on what was loaded
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

In [None]:
def extract_title(name):
    """Return the honorific found in ``name``, or ``""`` when there is none.

    The title is the first run of letters that is preceded by a space and
    immediately followed by a period (e.g. ``"Mr"`` in ``"Braund, Mr. Owen"``).
    """
    match = re.search(r' ([A-Za-z]+)\.', name)
    return match.group(1) if match else ""

def extract_surname(name):
    """Return everything in ``name`` before its first comma (the whole string if there is no comma)."""
    surname, _, _ = name.partition(',')
    return surname

def process_base_features(df):
    """Derive the label-independent engineered columns on a copy of ``df``.

    Adds ``Title`` (rare titles collapsed to ``'Rare'``, French/abbreviated
    variants folded into ``'Miss'``/``'Mrs'``), ``Surname``, ``FamilySize``,
    ``IsAlone``, ``FamilySize_Bin``, ``Has_Cabin``, ``Deck`` and
    ``Name_Length``. Missing ``Embarked`` values become ``'S'`` and missing
    ``Fare`` values become the median fare of the row's ``Pclass`` within
    ``df``. The input frame itself is not modified.
    """
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don',
                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_aliases = dict.fromkeys(rare_titles, 'Rare')
    title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    def deck_letter(cabin):
        # First character of the cabin code; 'U' stands for an unknown cabin.
        return 'U' if pd.isna(cabin) else cabin[0]

    def fill_with_own_median(fares):
        # Called once per Pclass group: patch gaps with that group's median.
        return fares.fillna(fares.median())

    out = df.copy()

    # Title: extract, then normalise; titles without an alias pass through.
    raw_titles = out['Name'].apply(extract_title)
    out['Title'] = raw_titles.map(lambda title: title_aliases.get(title, title))

    # Surname is kept as the grouping key for the survival-rate encoding.
    out['Surname'] = out['Name'].apply(extract_surname)

    # Family size counts the passenger plus siblings/spouses and parents/children.
    family_size = out['SibSp'] + out['Parch'] + 1
    out['FamilySize'] = family_size
    out['IsAlone'] = (family_size == 1).astype(int)
    size_bins = pd.cut(family_size, bins=[0, 1, 4, 11], labels=[0, 1, 2])
    out['FamilySize_Bin'] = size_bins.astype(int)

    # Cabin-derived columns
    out['Has_Cabin'] = out['Cabin'].notna().astype(int)
    out['Deck'] = out['Cabin'].apply(deck_letter)

    # Character count of the full name string
    out['Name_Length'] = out['Name'].apply(len)

    # Simple imputations for the remaining sparse columns
    out['Embarked'] = out['Embarked'].fillna('S')
    if out['Fare'].isna().any():
        out['Fare'] = out.groupby('Pclass')['Fare'].transform(fill_with_own_median)

    return out

# Run the base feature engineering on both splits
train_processed, test_processed = (
    process_base_features(split) for split in (train, test)
)

# Eyeball a few extracted surnames next to the raw names
print("Surname examples:")
print(train_processed[['Name', 'Surname']].head())

In [None]:
# Age imputation using median by Pclass, Sex, Title
def impute_age(train_df, test_df):
    """Fill missing ``Age`` values in both frames using training-set medians.

    A missing age is replaced by the median age of the training rows that
    share the passenger's ``(Pclass, Sex, Title)``. When that combination is
    absent from the training data, or every training row in it has a missing
    age, the overall training median is used instead. Only ``train_df``
    contributes statistics, so nothing is learned from ``test_df``.

    Args:
        train_df: Frame with ``Pclass``, ``Sex``, ``Title`` and ``Age`` columns.
        test_df: Frame with the same four columns.

    Returns:
        ``(train_df, test_df)`` as imputed copies; the inputs are not modified.
    """
    group_cols = ['Pclass', 'Sex', 'Title']
    train_df = train_df.copy()
    test_df = test_df.copy()

    fallback = train_df['Age'].median()
    # A group whose ages are all missing still appears in the groupby result,
    # with a NaN median. Dropping those entries routes such rows to the
    # fallback instead of silently leaving their Age as NaN.
    age_medians = train_df.groupby(group_cols)['Age'].median().dropna().to_dict()

    def fill_missing_age(df):
        # Positional boolean mask, so a non-unique index cannot break alignment.
        missing = df['Age'].isna().to_numpy()
        if not missing.any():
            return
        group_keys = zip(*(df.loc[missing, col] for col in group_cols))
        df.loc[missing, 'Age'] = [age_medians.get(key, fallback) for key in group_keys]

    fill_missing_age(train_df)
    fill_missing_age(test_df)

    return train_df, test_df

# Impute ages in both splits from statistics of the training split
train_processed, test_processed = impute_age(
    train_df=train_processed,
    test_df=test_processed,
)

# Age_Bin feature
def add_age_bin(df):
    """Return a copy of ``df`` with an integer ``Age_Bin`` column.

    Ages are bucketed into the right-closed intervals (0, 16], (16, 32],
    (32, 48] and (48, 100], coded 0 through 3.
    """
    edges = [0, 16, 32, 48, 100]
    codes = list(range(len(edges) - 1))
    binned = pd.cut(df['Age'], bins=edges, labels=codes)
    return df.assign(Age_Bin=binned.astype(int))

# Bucket the (now imputed) ages in both splits
train_processed, test_processed = map(add_age_bin, (train_processed, test_processed))

# Confirm that no missing ages remain
print(f"Age missing: Train={train_processed['Age'].isna().sum()}, Test={test_processed['Age'].isna().sum()}")

In [None]:
# Encode categorical features
def encode_categorical(train_df, test_df):
    """Replace the string categories in both frames with fixed integer codes.

    ``Sex``, ``Embarked``, ``Title`` and ``Deck`` are mapped through
    hard-coded code tables; a value missing from its table becomes NaN.
    Returns ``(train_df, test_df)`` as encoded copies, leaving the inputs
    untouched.
    """
    codebooks = {
        'Sex': {'female': 0, 'male': 1},
        'Embarked': {'S': 0, 'C': 1, 'Q': 2},
        'Title': {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5},
        'Deck': {'U': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4,
                 'E': 5, 'F': 6, 'G': 7, 'T': 8},
    }

    encoded = []
    for frame in (train_df, test_df):
        frame = frame.copy()
        for column, codes in codebooks.items():
            frame[column] = frame[column].map(codes)
        encoded.append(frame)

    return tuple(encoded)

# Integer-encode the categorical columns of both splits
train_encoded, test_encoded = encode_categorical(
    train_df=train_processed,
    test_df=test_processed,
)
print("Categorical encoding complete")

In [None]:
# CRITICAL: Family/Ticket Survival Rate Encoding
# Must be calculated within CV folds to avoid leakage!

def calculate_survival_rates(train_fold_df, default_rate=0.5):
    """Build the surname and ticket survival-rate lookup tables.

    Each table maps a ``Surname`` (respectively ``Ticket``) value to the mean
    of ``Survived`` over the rows of ``train_fold_df`` carrying that value.
    ``default_rate`` is accepted but not used here.

    Returns:
        ``(family_survival, ticket_survival)`` as two plain dicts.
    """
    def rate_table(key_column):
        return train_fold_df.groupby(key_column)['Survived'].mean().to_dict()

    return rate_table('Surname'), rate_table('Ticket')

def apply_survival_rates(df, family_survival, ticket_survival, default_rate=0.5):
    """Attach survival-rate features to a copy of ``df``.

    ``Surname`` and ``Ticket`` are looked up in the two rate tables. Keys
    missing from a table are flagged in ``Family_Survival_NA`` /
    ``Ticket_Survival_NA`` (1 = not found) and their rate is set to
    ``default_rate``. ``Survival_Rate`` is the plain average of the two
    filled rates.
    """
    encoded = df.copy()

    # Raw lookups: NaN wherever the key is absent from the table.
    looked_up_family = encoded['Surname'].map(family_survival)
    looked_up_ticket = encoded['Ticket'].map(ticket_survival)

    encoded['Family_Survival_Rate'] = looked_up_family.fillna(default_rate)
    encoded['Ticket_Survival_Rate'] = looked_up_ticket.fillna(default_rate)

    # The indicators come from the unfilled lookups, so they record real misses.
    encoded['Family_Survival_NA'] = looked_up_family.isna().astype(int)
    encoded['Ticket_Survival_NA'] = looked_up_ticket.isna().astype(int)

    rate_total = encoded['Family_Survival_Rate'] + encoded['Ticket_Survival_Rate']
    encoded['Survival_Rate'] = rate_total / 2

    return encoded

# Fit the rate tables on the complete training set and check how many test rows they cover
family_surv, ticket_surv = calculate_survival_rates(
    train_fold_df=train_processed.assign(Survived=train['Survived']),
)
test_with_rates = apply_survival_rates(
    df=test_processed,
    family_survival=family_surv,
    ticket_survival=ticket_surv,
)

# An NA flag of 0 means the surname/ticket was found in the training table
print("Test set coverage:")
print(f"  Family rate known: {(test_with_rates['Family_Survival_NA'] == 0).sum()}/{len(test_with_rates)} ({100*(test_with_rates['Family_Survival_NA'] == 0).mean():.1f}%)")
print(f"  Ticket rate known: {(test_with_rates['Ticket_Survival_NA'] == 0).sum()}/{len(test_with_rates)} ({100*(test_with_rates['Ticket_Survival_NA'] == 0).mean():.1f}%)")
print(f"  Either known: {((test_with_rates['Family_Survival_NA'] == 0) | (test_with_rates['Ticket_Survival_NA'] == 0)).sum()}/{len(test_with_rates)}")

In [None]:
# Label-independent columns; these are computed once, outside the CV loop
base_features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
    'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone',
    'Has_Cabin', 'Deck', 'FamilySize_Bin', 'Age_Bin', 'Name_Length',
]

# Target-encoded columns; these are rebuilt inside every CV fold
survival_features = [
    'Family_Survival_Rate',
    'Ticket_Survival_Rate',
    'Family_Survival_NA',
    'Ticket_Survival_NA',
    'Survival_Rate',
]

# Model input order: base columns first, then the survival-rate columns
all_features = [*base_features, *survival_features]

print(f"Base features: {len(base_features)}")
print(f"Survival rate features: {len(survival_features)}")
print(f"Total features: {len(all_features)}")

In [None]:
# Members of the soft-voting ensemble (same as exp_001), kept as
# (short name, unfitted estimator) pairs; each is cloned before fitting.
models = [
    ('lr', LogisticRegression(
        C=0.1,
        max_iter=1000,
        random_state=42,
    )),
    ('rf', RandomForestClassifier(
        n_estimators=200,
        max_depth=6,
        min_samples_leaf=4,
        random_state=42,
    )),
    ('gb', GradientBoostingClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        random_state=42,
    )),
    ('et', ExtraTreesClassifier(
        n_estimators=200,
        max_depth=6,
        min_samples_leaf=4,
        random_state=42,
    )),
    ('ada', AdaBoostClassifier(
        n_estimators=100,
        learning_rate=0.5,
        random_state=42,
    )),
    ('svc', SVC(
        kernel='rbf',
        C=1.0,
        probability=True,  # required so SVC exposes predict_proba for soft voting
        random_state=42,
    )),
    ('xgb', XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    )),
]

print(f"Models in ensemble: {len(models)}")
for name, model in models:
    print(f"  - {name}: {type(model).__name__}")

In [None]:
# 10-fold Stratified CV with proper survival rate encoding per fold
print("Running 10-fold CV with survival rate encoding...")
print("="*60)


def _leave_one_out_rate(keys, targets):
    """Return, for each row, the mean target of the OTHER rows sharing its key.

    Rows whose key occurs only once have no peers and get NaN.
    ``keys`` and ``targets`` must be Series on the same index.
    """
    grouped = targets.groupby(keys)
    peer_sum = grouped.transform('sum') - targets
    peer_count = grouped.transform('count') - 1
    return (peer_sum / peer_count).where(peer_count > 0)


def _encode_train_fold(fold_df, targets, default_rate=0.5):
    """Add the survival-rate columns to a training fold without self-leakage.

    Produces the same five columns as ``apply_survival_rates``, but every
    row's rate is computed from the other training rows only. A row with no
    peers is flagged in the ``*_NA`` column and gets ``default_rate``, which
    is how a validation/test row with an unseen surname or ticket is encoded.
    """
    fold_df = fold_df.copy()
    target_series = pd.Series(targets, index=fold_df.index)

    family_rate = _leave_one_out_rate(fold_df['Surname'], target_series)
    ticket_rate = _leave_one_out_rate(fold_df['Ticket'], target_series)

    fold_df['Family_Survival_Rate'] = family_rate.fillna(default_rate)
    fold_df['Ticket_Survival_Rate'] = ticket_rate.fillna(default_rate)
    fold_df['Family_Survival_NA'] = family_rate.isna().astype(int)
    fold_df['Ticket_Survival_NA'] = ticket_rate.isna().astype(int)
    fold_df['Survival_Rate'] = (fold_df['Family_Survival_Rate'] + fold_df['Ticket_Survival_Rate']) / 2

    return fold_df


kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

y = train['Survived'].values
cv_scores = []
oof_predictions = np.zeros(len(train))
test_predictions = np.zeros(len(test))

# Track individual model scores
individual_scores = {name: [] for name, _ in models}

for fold, (train_idx, val_idx) in enumerate(kfold.split(train_encoded, y)):
    # Get fold data
    train_fold = train_encoded.iloc[train_idx].copy()
    val_fold = train_encoded.iloc[val_idx].copy()
    y_train_fold = y[train_idx]
    y_val_fold = y[val_idx]

    # CRITICAL: rate tables for validation/test rows come from the TRAINING FOLD ONLY
    train_fold_with_target = train_fold.assign(Survived=y_train_fold)
    family_surv, ticket_surv = calculate_survival_rates(train_fold_with_target)

    # Training rows must not see their own label inside their own feature:
    # mapping the fold's table back onto the fold would hand a lone
    # surname/ticket its exact target, which validation and test rows never
    # get. Encode the training fold leave-one-out instead.
    train_fold = _encode_train_fold(train_fold, y_train_fold)
    val_fold = apply_survival_rates(val_fold, family_surv, ticket_surv)
    test_fold = apply_survival_rates(test_encoded, family_surv, ticket_surv)

    # Extract features
    X_train = train_fold[all_features].values
    X_val = val_fold[all_features].values
    X_test = test_fold[all_features].values

    # Scale for SVC and LR (scaler is fitted on the training fold only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Train models and collect predictions for soft voting
    fold_probs = np.zeros((len(X_val), 2))
    test_fold_probs = np.zeros((len(X_test), 2))

    for name, model_template in models:
        model = clone(model_template)

        # Only the scale-sensitive models receive the standardised matrices
        if name in ['lr', 'svc']:
            fit_X, val_X, test_X = X_train_scaled, X_val_scaled, X_test_scaled
        else:
            fit_X, val_X, test_X = X_train, X_val, X_test

        model.fit(fit_X, y_train_fold)
        val_prob = model.predict_proba(val_X)
        test_prob = model.predict_proba(test_X)
        val_pred = model.predict(val_X)

        fold_probs += val_prob
        test_fold_probs += test_prob
        individual_scores[name].append(accuracy_score(y_val_fold, val_pred))

    # Average probabilities (soft voting)
    fold_probs /= len(models)
    test_fold_probs /= len(models)

    # Ensemble predictions
    val_pred_ensemble = (fold_probs[:, 1] >= 0.5).astype(int)
    oof_predictions[val_idx] = val_pred_ensemble

    # Accumulate test predictions (each fold contributes 1/n_splits of the final average)
    test_predictions += test_fold_probs[:, 1] / kfold.n_splits

    # Calculate fold accuracy
    fold_acc = accuracy_score(y_val_fold, val_pred_ensemble)
    cv_scores.append(fold_acc)
    print(f"Fold {fold+1}: Accuracy = {fold_acc:.4f}")

print(f"\n{'='*60}")
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
print(f"Overall OOF Accuracy: {accuracy_score(y, oof_predictions):.4f}")

In [None]:
# Individual model performance
# Prints, for every base model, the mean and standard deviation of the
# per-fold validation accuracies collected in `individual_scores`, followed
# by the same statistics for the soft-voting ensemble (`cv_scores`).
print("\nIndividual Model Performance:")
print("="*50)
for name, scores in individual_scores.items():
    print(f"{name:5s}: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

print(f"\nEnsemble: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
# NOTE(review): the Baseline and exp_001 figures below are hard-coded
# literals, presumably copied from earlier experiment runs — confirm they
# are still current before relying on the printed improvement.
print(f"\nComparison:")
print(f"  Baseline (XGBoost):           0.8316 (+/- 0.0324)")
print(f"  Voting Ensemble (exp_001):    0.8372 (+/- 0.0239)")
print(f"  + Survival Rate Encoding:     {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
# 0.8372 is the hard-coded exp_001 voting-ensemble score printed above.
print(f"\nImprovement over voting: {np.mean(cv_scores) - 0.8372:+.4f}")

In [None]:
# Turn the fold-averaged survival probabilities into hard 0/1 labels
test_pred_binary = np.greater_equal(test_predictions, 0.5).astype(int)

# Submission frame: one row per test passenger, id first and label second
submission = test[['PassengerId']].assign(Survived=test_pred_binary)

submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved to /home/submission/submission.csv")

# Class balance of the predictions, next to the hard-coded earlier results
print(f"\nPrediction distribution:")
print(submission['Survived'].value_counts())
print(f"\nComparison:")
print(f"  Baseline:         267 died, 151 survived")
print(f"  Voting Ensemble:  255 died, 163 survived")
print(f"  + Survival Rate:  {(test_pred_binary == 0).sum()} died, {(test_pred_binary == 1).sum()} survived")

In [None]:
# Summary and expected LB
# Prints a recap of the feature counts and the mean CV accuracy, then
# converts the CV accuracy into a projected leaderboard score.
print("\n" + "="*60)
print("EXPERIMENT SUMMARY")
print("="*60)
print(f"\nFeatures: {len(all_features)} total")
print(f"  Base features: {len(base_features)}")
print(f"  NEW Survival rate features: {len(survival_features)}")
# NOTE(review): the two correlation figures are hard-coded text, not
# computed in this notebook — confirm where they come from.
print(f"    - Family_Survival_Rate (0.76 correlation)")
print(f"    - Ticket_Survival_Rate (0.81 correlation)")
print(f"    - NA indicators and combined rate")
print(f"\nCV Score: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
print(f"\nExpected LB (using calibration LB = 2.55*CV - 1.37):")
# Linear CV-to-leaderboard mapping with hard-coded coefficients.
# NOTE(review): presumably fitted on earlier (CV, LB) pairs — the fit is
# not shown here, so treat the projection as a rough estimate.
expected_lb = 2.55 * np.mean(cv_scores) - 1.37
print(f"  Predicted LB: {expected_lb:.4f}")
# 0.7727 is a hard-coded previous leaderboard score.
print(f"  Previous best LB: 0.7727")
print(f"  Expected improvement: {expected_lb - 0.7727:+.4f}")