# Titanic: Fixed Preprocessing Pipeline

This notebook fixes the data leakage issue by implementing proper sklearn pipelines that fit preprocessing on training data only.

In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from xgboost import XGBClassifier
# Silence all warnings for the remainder of the notebook
warnings.filterwarnings('ignore')

# Read the train and test CSV files into DataFrames
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

# Report the (rows, columns) shape of each split
for split_label, split_frame in (("Training", train_df), ("Test", test_df)):
    print(f"{split_label} data shape: {split_frame.shape}")

Training data shape: (891, 12)
Test data shape: (418, 11)


## Feature Engineering Functions

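Define a function that creates new features. Each feature is computed row by row with no fitted state, so the function is applied once to the train and test frames before they enter the pipeline.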

In [2]:
def engineer_features(df):
    """Derive model features from a raw passenger frame.

    Reads the columns ``Name``, ``SibSp``, ``Parch``, ``Age``, ``Fare`` and
    ``Cabin`` and adds the following columns to a copy of ``df``:

    - ``Title``: honorific extracted from ``Name`` and collapsed into
      ``Mr`` / ``Mrs`` / ``Miss`` / ``Master`` / ``Other``.
    - ``FamilySize``: ``SibSp + Parch + 1`` (the passenger counts as one).
    - ``IsAlone``: 1 when ``FamilySize`` is 1, else 0.
    - ``AgeBin``: categorical age band; NaN when ``Age`` is missing or falls
      outside the interval (0, 100].
    - ``FarePerPerson``: ``Fare`` divided by ``FamilySize``.
    - ``HasCabin``: 1 when ``Cabin`` is not null, else 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the engineered columns appended; the input
        frame is not modified.
    """
    df = df.copy()

    # The title is the alphabetic word that directly precedes a period,
    # e.g. "Braund, Mr. Owen Harris" -> "Mr"
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Collapse rare titles into the common groups
    title_mapping = {
        'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
        'Dr': 'Other', 'Rev': 'Other', 'Col': 'Other', 'Major': 'Other',
        'Mlle': 'Miss', 'Countess': 'Other', 'Ms': 'Miss', 'Lady': 'Other',
        'Jonkheer': 'Other', 'Don': 'Other', 'Dona': 'Other', 'Mme': 'Mrs',
        'Capt': 'Other', 'Sir': 'Other'
    }
    # .map() yields NaN for any title missing from the mapping (and for names
    # with no extractable title). Send those to 'Other' explicitly so they are
    # not later imputed as the most frequent title.
    df['Title'] = df['Title'].map(title_mapping).fillna('Other')

    # Family size counts the passenger plus siblings/spouses and parents/children
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Flag passengers travelling without any family
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Age bands; pd.cut intervals are right-closed, so the bands are
    # (0, 12], (12, 18], (18, 35], (35, 60], (60, 100]
    df['AgeBin'] = pd.cut(df['Age'],
                          bins=[0, 12, 18, 35, 60, 100],
                          labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior'])

    # FamilySize is always >= 1, so this division cannot hit zero
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # Indicator for whether a cabin value was recorded
    df['HasCabin'] = df['Cabin'].notna().astype(int)

    return df

# Run the same feature engineering over the train and test frames
train_fe, test_fe = (engineer_features(raw_frame) for raw_frame in (train_df, test_df))

# Preview the engineered columns on the first training rows
engineered_columns = ['Title', 'FamilySize', 'IsAlone', 'AgeBin', 'FarePerPerson', 'HasCabin']
print("New features created:")
print(train_fe[engineered_columns].head())

New features created:
  Title  FamilySize  IsAlone      AgeBin  FarePerPerson  HasCabin
0    Mr           2        0  YoungAdult        3.62500         0
1   Mrs           2        0       Adult       35.64165         1
2  Miss           1        1  YoungAdult        7.92500         0
3   Mrs           2        0  YoungAdult       26.55000         1
4    Mr           1        1  YoungAdult        8.05000         0


## Define Preprocessing Pipeline

Create separate preprocessing pipelines for numeric and categorical features.

In [3]:
# Pull out the label column, then drop it (and PassengerId) from the model inputs
target = train_fe['Survived']
train_features = train_fe.drop(columns=['Survived', 'PassengerId'])
test_features = test_fe.drop(columns=['PassengerId'])

# Columns routed to each branch of the preprocessor
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson', 'HasCabin']
categorical_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'AgeBin']

# Numeric branch: fill gaps with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode;
# categories unseen at fit time are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {categorical_features}")

Numeric features: ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson', 'HasCabin']
Categorical features: ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'AgeBin']


## Create Full Pipeline with XGBoost

Combine preprocessing with XGBoost model.

In [None]:
# Hyperparameters for the gradient-boosted classifier
xgb_params = dict(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=4,
    eval_metric='logloss',
)
xgb_model = XGBClassifier(**xgb_params)

# Chain preprocessing and the classifier into a single estimator so that
# fitting the pipeline fits both stages on the same rows
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_model),
])

print("Pipeline created successfully!")

## Cross-Validation with Proper Preprocessing

The key improvement: preprocessing happens INSIDE the CV loop, preventing leakage.

In [None]:
# Stratified 5-fold split with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Validation accuracy of each fold, in fold order
cv_scores = []

print("Training with 5-fold cross-validation (preprocessing inside loop)...")
for fold, (train_idx, val_idx) in enumerate(skf.split(train_features, target), start=1):
    X_train, X_val = train_features.iloc[train_idx], train_features.iloc[val_idx]
    y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

    # clone() builds a fresh, unfitted copy of the whole pipeline.
    # Pipeline(clf.steps) would wrap the *same* estimator objects, so each
    # fold would refit and mutate the preprocessor and classifier held by `clf`.
    fold_clf = clone(clf)

    # Imputers, scaler, encoder and classifier are all fitted on this fold's
    # training rows only
    fold_clf.fit(X_train, y_train)

    # Score on the held-out rows
    y_pred = fold_clf.predict(X_val)
    score = accuracy_score(y_val, y_pred)
    cv_scores.append(score)

    print(f"Fold {fold}: Accuracy = {score:.4f}")

print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

## Train on Full Data and Generate Predictions

In [None]:
# Fit the pipeline on all training rows, then predict the test set with it
y_pred_test = clf.fit(train_features, target).predict(test_features)

# Summarise the predictions: array shape and the count of each predicted label
predicted_class_counts = np.bincount(y_pred_test)
print(f"Test predictions shape: {y_pred_test.shape}")
print(f"Prediction distribution: {predicted_class_counts}")

## Create Submission File

In [None]:
# Pair each test PassengerId with its predicted label
submission = test_df[['PassengerId']].assign(Survived=y_pred_test)

print("Submission preview:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Write the submission CSV without the index column
submission_path = '/home/submission/submission.csv'
submission.to_csv(submission_path, index=False)
print(f"\nSubmission saved to {submission_path}")

# Print the facts needed to check the submission format by eye
passenger_ids = submission['PassengerId']
print("\nVerifying submission format...")
print(f"Columns: {list(submission.columns)}")
print(f"Number of rows: {len(submission)}")
print("Expected rows: 418")
print(f"PassengerId range: {passenger_ids.min()} to {passenger_ids.max()}")

# Feature importance analysis
print("\n" + "="*50)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*50)

# Rebuild the post-preprocessing column names. ColumnTransformer emits its
# outputs in transformer order, so the numeric columns come first, followed by
# the one-hot columns of the categorical branch.
numeric_features_processed = list(numeric_features)
categorical_features_processed = list(clf.named_steps['preprocessor']
                                    .named_transformers_['cat']
                                    .named_steps['onehot']
                                    .get_feature_names_out(categorical_features))

all_features = numeric_features_processed + categorical_features_processed

# One importance value per column seen by the classifier
importances = clf.named_steps['classifier'].feature_importances_

# Truncating the name list to fit would silently pair importances with the
# wrong names, so a length mismatch is reported instead of hidden.
if len(all_features) != len(importances):
    raise ValueError(
        f"Feature name count ({len(all_features)}) does not match "
        f"importance count ({len(importances)}); names cannot be aligned."
    )

# Rank features from most to least important
feature_importance_df = pd.DataFrame({
    'feature': all_features,
    'importance': importances
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance_df.head(10))