# Titanic Experiment 001: Validated Features

This experiment implements all of the features validated in the analysis notebooks:
- TicketFreq (family/group patterns)
- CabinSide (odd/even cabin location)
- NameLength (social status)
- FareBin5 (granular fare binning)
- Interaction features (Pclass_Sex, AgeGroup_Sex, FareBin5_Sex)

Expected CV improvement: 0.817 → 0.8305 (+0.0135)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

# Read the train and test CSV files into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Quick sanity check on row/column counts
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (891, 12)
Test shape: (418, 11)


In [2]:
# Feature engineering with ALL validated features
def create_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Passenger table. The columns read are ``Name``, ``SibSp``, ``Parch``,
        ``Age``, ``Fare``, ``Cabin`` and ``Ticket``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with these columns added, in
        order: ``Title``, ``FamilySize``, ``IsAlone``, ``AgeGroup``,
        ``FarePerPerson``, ``Deck``, ``TicketFreq``, ``CabinNumber``,
        ``CabinSide``, ``NameLength``, ``FareBin5``.

    Notes
    -----
    ``TicketFreq`` counts tickets within ``df`` only, and ``FareBin5`` uses the
    quintiles of ``df['Fare']``; both therefore depend on which rows are passed
    in, and values from separate calls are not directly comparable.
    """
    df = df.copy()

    # 1. Extract title from name: the word directly before the first '.'
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title_mapping = {
        'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss', 'Master': 'Master',
        'Dr': 'Other', 'Rev': 'Other', 'Col': 'Other', 'Major': 'Other',
        'Mlle': 'Miss', 'Countess': 'Other', 'Ms': 'Miss', 'Lady': 'Other',
        'Jonkheer': 'Other', 'Don': 'Other', 'Dona': 'Other', 'Mme': 'Mrs',
        'Capt': 'Other', 'Sir': 'Other'
    }
    # Titles absent from the mapping (or names with no title at all) would
    # otherwise become NaN; route them to the existing 'Other' bucket instead.
    df['Title'] = df['Title'].map(title_mapping).fillna('Other')

    # 2. Family size (passenger included) and IsAlone
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # 3. Age groups; a missing Age stays missing
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100],
                            labels=['Child', 'Teen', 'Adult', 'MiddleAge', 'Senior'])

    # 4. Fare per person (FamilySize is at least 1, so no division by zero)
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']

    # 5. Deck from cabin: first character of the cabin string
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].fillna('Unknown')

    # 6. Ticket frequency (validated feature - captures family/group patterns)
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # 7. Cabin side (validated feature - odd/even cabin location)
    #    Uses the first run of digits in the cabin string.
    df['CabinNumber'] = df['Cabin'].str.extract('([0-9]+)', expand=False).astype(float)
    cabin_parity = df['CabinNumber'] % 2
    # Missing cabins/numbers are filled explicitly rather than relying on a
    # NaN dictionary key matching NaN values inside ``map``.
    df['CabinSide'] = cabin_parity.map({0.0: 'Even', 1.0: 'Odd'}).fillna('Unknown')

    # 8. Name length (validated feature - social status)
    df['NameLength'] = df['Name'].str.len()

    # 9. Fare binning with 5 categories (validated feature - granular wealth effects)
    #    Quintile edges come from this frame's own Fare values.
    df['FareBin5'] = pd.qcut(df['Fare'], q=5, labels=['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh'])

    return df

# Create features for both datasets
train_feat = create_features(train_df)
test_feat = create_features(test_df)

# create_features derives the FareBin5 quintile edges from whichever frame it is
# given, so the same label would cover different fare ranges in train and test.
# Re-bin the test fares with the training edges so each category means the same
# thing at fit time and at predict time.
fare_labels = list(train_feat['FareBin5'].cat.categories)
_, fare_edges = pd.qcut(train_df['Fare'], q=len(fare_labels), retbins=True)
# Open the outer edges so test fares outside the training range still get a bin.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf
test_feat['FareBin5'] = pd.cut(test_df['Fare'], bins=fare_edges, labels=fare_labels)

print("Features created successfully")
print("New columns:", [col for col in train_feat.columns if col not in train_df.columns])

Features created successfully
New columns: ['Title', 'FamilySize', 'IsAlone', 'AgeGroup', 'FarePerPerson', 'Deck', 'TicketFreq', 'CabinNumber', 'CabinSide', 'NameLength', 'FareBin5']


In [3]:
# Create interaction features (validated - address class-gender misclassifications)
def create_interactions(df):
    """Return a copy of ``df`` with ``<column>_Sex`` interaction columns added.

    For each of ``Pclass``, ``AgeGroup`` and ``FareBin5`` the value is cast to
    a string and joined to ``Sex`` with an underscore, producing
    ``Pclass_Sex``, ``AgeGroup_Sex`` and ``FareBin5_Sex`` (appended in that
    order). The input frame is not modified.
    """
    out = df.copy()

    for base_col in ('Pclass', 'AgeGroup', 'FareBin5'):
        as_text = out[base_col].astype(str)
        out[base_col + '_Sex'] = as_text + '_' + out['Sex']

    return out

# Add the interaction columns to both splits
train_feat, test_feat = (
    create_interactions(frame) for frame in (train_feat, test_feat)
)

interaction_cols = ['Pclass_Sex', 'AgeGroup_Sex', 'FareBin5_Sex']
print("Interaction features created")
print("Interaction columns:", interaction_cols)

Interaction features created
Interaction columns: ['Pclass_Sex', 'AgeGroup_Sex', 'FareBin5_Sex']


In [4]:
# Columns handled as numbers by the model pipeline
numeric_features = [
    'Age', 'SibSp', 'Parch', 'Fare',
    'FamilySize', 'FarePerPerson', 'NameLength', 'TicketFreq',
]

# Columns handled as categories (includes the three *_Sex interactions)
categorical_features = [
    'Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'AgeGroup', 'Deck',
    'CabinSide', 'FareBin5', 'Pclass_Sex', 'AgeGroup_Sex', 'FareBin5_Sex',
]

# Model inputs: numeric columns first, then categorical, identically for both splits
feature_columns = [*numeric_features, *categorical_features]
X = train_feat[feature_columns]
X_test = test_feat[feature_columns]
y = train_feat['Survived']

print("Training features shape:", X.shape)
print("Test features shape:", X_test.shape)
print("\nNumeric features:", numeric_features)
print("\nCategorical features:", categorical_features)

Training features shape: (891, 20)
Test features shape: (418, 20)

Numeric features: ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'FarePerPerson', 'NameLength', 'TicketFreq']

Categorical features: ['Pclass', 'Sex', 'Embarked', 'Title', 'IsAlone', 'AgeGroup', 'Deck', 'CabinSide', 'FareBin5', 'Pclass_Sex', 'AgeGroup_Sex', 'FareBin5_Sex']


In [5]:
# Numeric branch: fill gaps with the column median
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical branch: label gaps as 'Missing', then one-hot encode;
# categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a seeded 100-tree random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])

print("Pipeline created")

Pipeline created


In [6]:
# Cross-validation: 5 stratified, shuffled folds with a fixed seed so the
# score is reproducible between runs.
BASELINE_CV = 0.8170  # baseline CV accuracy this experiment is compared against
EXPECTED_CV = 0.8305  # CV accuracy expected from the added features

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

print(f"CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
print(f"Individual fold scores: {cv_scores}")
# The ':+' format flag supplies the sign, so a score below baseline prints as
# "-0.0100" instead of the malformed "+-0.0100" a hard-coded '+' would give.
print(f"\nImprovement over baseline: {cv_scores.mean() - BASELINE_CV:+.4f}")
print(f"Expected: {EXPECTED_CV - BASELINE_CV:+.4f} (to reach {EXPECTED_CV:.4f})")

CV Accuracy: 0.8283 ± 0.0160
Individual fold scores: [0.8547486  0.82022472 0.80898876 0.83707865 0.82022472]

Improvement over baseline: +0.0113
Expected: +0.0135 (to reach 0.8305)


In [7]:
# Refit the pipeline on every training row, then label the test passengers
test_predictions = clf.fit(X, y).predict(X_test)

# Submission table: passenger id alongside the predicted label
submission = test_df[['PassengerId']].assign(Survived=test_predictions)

print("Submission shape:", submission.shape)
print("\nFirst 10 predictions:")
print(submission.head(10))

# Write the submission without the index column
submission.to_csv('/home/submission/submission.csv', index=False)
print("\nSubmission saved to /home/submission/submission.csv")

Submission shape: (418, 2)

First 10 predictions:
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
5          897         0
6          898         0
7          899         0
8          900         1
9          901         0

Submission saved to /home/submission/submission.csv


In [8]:
# Check prediction distribution and feature importance
print("Prediction distribution:")
print(submission['Survived'].value_counts())
print(f"Survival rate in predictions: {submission['Survived'].mean():.3f}")
print(f"Survival rate in training: {y.mean():.3f}")

# Get feature importance
print("\n" + "="*50)
print("TOP 15 FEATURE IMPORTANCES")
print("="*50)

# Use the preprocessor that was fitted as part of `clf` rather than refitting
# one here: the names then come from the exact transformer the forest was
# trained behind, and the fitted pipeline is not mutated by this cell.
fitted_preprocessor = clf.named_steps['preprocessor']
X_preprocessed = fitted_preprocessor.transform(X)

# Get feature names (older scikit-learn releases expose get_feature_names instead)
encoder = fitted_preprocessor.named_transformers_['cat']['encoder']
if hasattr(encoder, 'get_feature_names_out'):
    cat_feature_names = encoder.get_feature_names_out(categorical_features)
else:
    cat_feature_names = encoder.get_feature_names(categorical_features)

# Same order as the ColumnTransformer output: numeric columns, then one-hot columns
all_feature_names = np.concatenate([numeric_features, cat_feature_names])

# Get importances
importances = clf.named_steps['classifier'].feature_importances_
assert len(all_feature_names) == len(importances), "feature names and importances are misaligned"
top_indices = np.argsort(importances)[::-1][:15]

for idx in top_indices:
    print(f"{all_feature_names[idx]:<30} {importances[idx]:.4f}")

Prediction distribution:
Survived
0    270
1    148
Name: count, dtype: int64
Survival rate in predictions: 0.354
Survival rate in training: 0.384

TOP 15 FEATURE IMPORTANCES
NameLength                     0.1052
Age                            0.0891
FarePerPerson                  0.0751
Fare                           0.0706
Sex_female                     0.0680
Title_Mr                       0.0599
Sex_male                       0.0541
TicketFreq                     0.0273
Pclass_Sex_1_female            0.0254
FamilySize                     0.0248
FareBin5_Sex_VeryHigh_female   0.0234
Pclass_Sex_2_female            0.0230
Pclass_Sex_3_male              0.0192
Pclass_3                       0.0168
SibSp                          0.0144
