# Voting Ensemble with Title Feature

## Goal: Improve the leaderboard (LB) score using ensemble diversity

Current best: LB 0.7775 (Simple RF, 7 features)

This experiment:
- Add Title feature back (captures sex + social status)
- Use 8 features: Pclass, Sex_Code, Age, SibSp, Parch, Fare, Embarked_Code, Title_Code
- Voting ensemble: RF, LogisticRegression, GradientBoosting, SVC
- Soft voting (probability averaging)
- Keep models simple to avoid overfitting

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Read the raw train and test CSVs into DataFrames
train, test = pd.read_csv('/home/data/train.csv'), pd.read_csv('/home/data/test.csv')

# Report (rows, columns) for each frame as a quick sanity check
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

In [None]:
# Feature engineering with Title
def preprocess_with_title(train_df, test_df):
    """Build model-ready train/test frames, including a grouped Title feature.

    Both inputs are copied, so the caller's frames are left untouched. All
    imputation statistics (Fare median, Age medians) are computed from the
    training frame only and then applied to both frames.

    Columns added to each frame: ``Title``, ``Title_Code``, ``Sex_Code`` and
    ``Embarked_Code``. Missing values in ``Embarked``, ``Fare`` and ``Age``
    are filled on the copies.

    Args:
        train_df: Training DataFrame with at least the ``Name``, ``Sex``,
            ``Pclass``, ``Age``, ``Fare`` and ``Embarked`` columns.
        test_df: Test DataFrame with the same columns.

    Returns:
        Tuple ``(train_data, test_data)`` of processed copies.
    """
    train_data = train_df.copy()
    test_data = test_df.copy()

    # 1. Title extraction from Name. Common titles are kept, French/alternate
    #    spellings are merged into them, and everything else (including
    #    titles not listed here or names with no title) becomes 'Rare'.
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
        'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare',
    }
    for df in (train_data, test_data):
        raw_title = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
        df['Title'] = raw_title.map(title_mapping).fillna('Rare')

    # 2. Title encoding (fixed order so codes are consistent across train/test)
    title_order = ['Mr', 'Miss', 'Mrs', 'Master', 'Rare']
    title_map = {t: i for i, t in enumerate(title_order)}
    train_data['Title_Code'] = train_data['Title'].map(title_map)
    test_data['Title_Code'] = test_data['Title'].map(title_map)

    # 3. Sex encoding: male -> 1, anything else -> 0
    train_data['Sex_Code'] = (train_data['Sex'] == 'male').astype(int)
    test_data['Sex_Code'] = (test_data['Sex'] == 'male').astype(int)

    # 4. Embarked - fill missing with 'S', then encode
    train_data['Embarked'] = train_data['Embarked'].fillna('S')
    test_data['Embarked'] = test_data['Embarked'].fillna('S')
    embarked_map = {'S': 0, 'C': 1, 'Q': 2}
    train_data['Embarked_Code'] = train_data['Embarked'].map(embarked_map)
    test_data['Embarked_Code'] = test_data['Embarked'].map(embarked_map)

    # 5. Fare - fill missing with median from TRAIN only
    train_fare_median = train_data['Fare'].median()
    train_data['Fare'] = train_data['Fare'].fillna(train_fare_median)
    test_data['Fare'] = test_data['Fare'].fillna(train_fare_median)

    # 6. Age - fill missing with median from TRAIN only (by Title and Pclass).
    #    A group whose training ages are all missing has a NaN median; drop
    #    those so they use the overall train median, exactly like groups that
    #    never occur in train. Otherwise NaN would leak into the features.
    age_medians = train_data.groupby(['Title', 'Pclass'])['Age'].median()
    median_lookup = age_medians.dropna().to_dict()  # keys: (Title, Pclass)
    train_age_median = train_data['Age'].median()

    def fill_age(df):
        """Return df['Age'] with gaps filled from the group/overall medians."""
        group_fill = pd.Series(
            [
                median_lookup.get(key, train_age_median)
                for key in zip(df['Title'], df['Pclass'])
            ],
            index=df.index,
            dtype=float,
        )
        return df['Age'].fillna(group_fill)

    train_data['Age'] = fill_age(train_data)
    test_data['Age'] = fill_age(test_data)

    return train_data, test_data

# Run the preprocessing on the raw frames; the originals are not modified
train_processed, test_processed = preprocess_with_title(train_df=train, test_df=test)

# Show how many training passengers fall under each grouped title
title_counts = train_processed['Title'].value_counts()
print("Title distribution:")
print(title_counts)

In [None]:
# The eight model inputs, in the column order used for every matrix below
feature_cols = [
    'Pclass', 'Sex_Code', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked_Code', 'Title_Code',
]

print(f"Features ({len(feature_cols)}): {feature_cols}")

# Target vector and test ids, then the train/test feature matrices as arrays
y = train_processed['Survived'].values
test_ids = test_processed['PassengerId'].values
X = train_processed[feature_cols].values
X_test = test_processed[feature_cols].values

print(f"\nX shape: {X.shape}")
print(f"X_test shape: {X_test.shape}")

In [None]:
# Base learners for the ensemble. Hyperparameters are collected in one dict
# per model so the configuration can be read at a glance; every model uses
# the same seed (42).
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=5,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)
lr_params = dict(C=1.0, max_iter=1000, random_state=42)
gb_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    min_samples_leaf=5,
    random_state=42,
)
# probability=True is needed so the SVC exposes predict_proba for soft voting
svc_params = dict(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)

rf = RandomForestClassifier(**rf_params)
lr = LogisticRegression(**lr_params)
gb = GradientBoostingClassifier(**gb_params)
svc = SVC(**svc_params)

print("Base models defined:")
print("- RandomForest (max_depth=5)")
print("- LogisticRegression (C=1.0)")
print("- GradientBoosting (max_depth=3)")
print("- SVC (rbf kernel, C=1.0)")

In [None]:
# Combine the four base learners; 'soft' voting averages their predicted
# class probabilities instead of counting hard votes
base_estimators = [('rf', rf), ('lr', lr), ('gb', gb), ('svc', svc)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft')

print("Voting Classifier created with soft voting")

In [None]:
# Stratified 5-fold cross-validation of the soft-voting ensemble
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold outputs for the training rows, plus a running fold-average of
# the positive-class probability for the test rows
n_train, n_test = len(X), len(X_test)
oof_preds = np.zeros(n_train)
oof_probs = np.zeros(n_train)
test_preds = np.zeros(n_test)
fold_scores = []

# Per-fold validation accuracy of each fitted base estimator
model_scores = {name: [] for name in ('rf', 'lr', 'gb', 'svc')}

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # Standardise with statistics from this fold's training rows only; the
    # same scaled matrices are passed to every estimator in the ensemble
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Fit the ensemble on the training part of the fold
    voting_clf.fit(X_train_scaled, y_train)

    # Hard labels and positive-class probabilities for the held-out rows
    val_pred = voting_clf.predict(X_val_scaled)
    val_prob = voting_clf.predict_proba(X_val_scaled)[:, 1]
    oof_preds[val_idx] = val_pred
    oof_probs[val_idx] = val_prob

    # Each fold contributes an equal share to the averaged test probability
    fold_test_prob = voting_clf.predict_proba(X_test_scaled)[:, 1]
    test_preds += fold_test_prob / kfold.n_splits

    # Ensemble accuracy on this fold
    fold_acc = accuracy_score(y_val, val_pred)
    fold_scores.append(fold_acc)

    # Accuracy of each fitted base estimator on the same held-out rows
    for name, model in voting_clf.named_estimators_.items():
        model_acc = accuracy_score(y_val, model.predict(X_val_scaled))
        model_scores[name].append(model_acc)

    print(f"Fold {fold+1}: Ensemble={fold_acc:.4f} | RF={model_scores['rf'][-1]:.4f} | LR={model_scores['lr'][-1]:.4f} | GB={model_scores['gb'][-1]:.4f} | SVC={model_scores['svc'][-1]:.4f}")

# Accuracy over all out-of-fold predictions, with the spread across folds
cv_score = accuracy_score(y, oof_preds)
print(f"\n{'='*70}")
print(f"Overall CV Accuracy: {cv_score:.4f} (+/- {np.std(fold_scores):.4f})")

In [None]:
# Mean and standard deviation of the per-fold accuracies for each base model,
# computed once and then printed next to the ensemble's out-of-fold accuracy
cv_stats = {
    name: (np.mean(scores), np.std(scores))
    for name, scores in model_scores.items()
}
ensemble_std = np.std(fold_scores)

print("\nModel Comparison (Mean CV Accuracy):")
print(f"  RandomForest:     {cv_stats['rf'][0]:.4f} (+/- {cv_stats['rf'][1]:.4f})")
print(f"  LogisticReg:      {cv_stats['lr'][0]:.4f} (+/- {cv_stats['lr'][1]:.4f})")
print(f"  GradientBoosting: {cv_stats['gb'][0]:.4f} (+/- {cv_stats['gb'][1]:.4f})")
print(f"  SVC:              {cv_stats['svc'][0]:.4f} (+/- {cv_stats['svc'][1]:.4f})")
print(f"  ENSEMBLE:         {cv_score:.4f} (+/- {ensemble_std:.4f})")

In [None]:
# Create submission
import os

# Threshold the fold-averaged positive-class probability at 0.5
test_preds_binary = (test_preds >= 0.5).astype(int)

submission = pd.DataFrame({
    'PassengerId': test_ids,
    'Survived': test_preds_binary
})

# Make sure the output directory exists so a missing folder does not throw
# away the results of the whole run at the final step
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Submission saved with {len(submission)} rows")
print(f"\nSurvived distribution:")
print(submission['Survived'].value_counts())
print(f"\nSurvival rate: {submission['Survived'].mean():.3f}")

In [None]:
# Summary of this experiment next to the earlier ones. The figures for
# exp_000 and exp_001 are hard-coded; only the exp_002 line is computed here.
banner = "="*70
cv_std = np.std(fold_scores)
# Projected leaderboard range, assuming the score drops 4-5 points from CV
lb_low, lb_high = cv_score - 0.05, cv_score - 0.04
survival_rate = submission['Survived'].mean()

print("\n" + banner)
print("EXPERIMENT SUMMARY")
print(banner)
print("Model: Voting Ensemble (RF + LR + GB + SVC) with soft voting")
print(f"Features: {len(feature_cols)} - {feature_cols}")
print(f"CV Accuracy: {cv_score:.4f} (+/- {cv_std:.4f})")
print("\nComparison to previous experiments:")
print("  exp_000 (XGBoost 13 features): CV=0.8316, LB=0.7584")
print("  exp_001 (Simple RF 7 features): CV=0.8238, LB=0.7775")
print(f"  exp_002 (Voting Ensemble 8 features): CV={cv_score:.4f}")
print(f"\nExpected LB (assuming 4-5% gap): {lb_low:.4f} to {lb_high:.4f}")
print(f"Submission survival rate: {survival_rate:.3f}")

In [None]:
# How decisive are the fold-averaged test probabilities?
# Boolean masks: far from 0.5 on either side vs. close to the 0.5 threshold
confident_mask = (test_preds < 0.2) | (test_preds > 0.8)
uncertain_mask = (test_preds > 0.4) & (test_preds < 0.6)

print("\nPrediction Confidence Analysis:")
print(f"Mean probability: {test_preds.mean():.3f}")
print(f"Std probability: {test_preds.std():.3f}")
print("\nConfidence distribution:")
print(f"  Very confident (p<0.2 or p>0.8): {confident_mask.sum()} ({confident_mask.mean()*100:.1f}%)")
print(f"  Uncertain (0.4 < p < 0.6): {uncertain_mask.sum()} ({uncertain_mask.mean()*100:.1f}%)")

# Predicted survival rate next to the previous best run (hard-coded 31.3%)
predicted_rate_pct = submission['Survived'].mean()*100
print("\nNote: Previous best (Simple RF) had survival rate 31.3%")
print(f"This model has survival rate {predicted_rate_pct:.1f}%")