# Experiment 001: Baseline with Feature Engineering

Following the seed prompt strategy:
- Comprehensive feature engineering (Title, FamilySize, AgeBand, FareBand, Has_Cabin)
- RandomForest baseline model
- StratifiedKFold cross-validation (k=5)

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')

# Read the training and test splits from disk
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Sanity report: size of each split, then the class balance of the target column
print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")
print(f"\nTarget distribution:")
target_share = train['Survived'].value_counts(normalize=True)
print(target_share)

Train shape: (891, 12)
Test shape: (418, 11)

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
def feature_engineering(df, is_train=True):
    """Derive the row-wise model features from a raw passenger frame.

    Works on a copy, so the caller's frame is left untouched. Every step is
    stateless (nothing is fitted on the data), so the same code is applied to
    the training and the test split.

    Columns added or rewritten:
      * ``Title``      - honorific parsed from ``Name`` and collapsed to one of
                         ``Mr``, ``Miss``, ``Mrs``, ``Master`` or ``Rare``.
      * ``FamilySize`` - ``SibSp + Parch + 1``.
      * ``IsAlone``    - 1 when ``FamilySize`` equals 1, else 0.
      * ``Has_Cabin``  - 1 when ``Cabin`` is not missing, else 0.
      * ``Sex``        - ``'male'`` -> 0, ``'female'`` -> 1.
      * ``Embarked``   - missing values replaced by ``'S'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Name``, ``SibSp``, ``Parch``, ``Cabin``, ``Sex`` and
        ``Embarked``.
    is_train : bool, default True
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns listed above added or replaced.
    """
    df = df.copy()

    # 1. Title: the word directly followed by a period, e.g. "Mr" in "Braund, Mr. Owen"
    title = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Fold spelling variants into their common form, then group the uncommon honorifics
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    title = title.replace(['Mlle', 'Ms'], 'Miss').replace('Mme', 'Mrs')
    title = title.replace(rare_titles, 'Rare')

    # Anything still outside the known vocabulary (an honorific not listed above, or a
    # name with no parsable title) also becomes 'Rare'. This keeps the category set
    # closed, so a split can never introduce a title level the other split lacks.
    known_titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Rare']
    df['Title'] = title.where(title.isin(known_titles), 'Rare')

    # 2. Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # 3. Has_Cabin: only whether a cabin is recorded, not which one
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)

    # 4. Sex encoding
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

    # 5. Embarked - fill missing with mode 'S'
    df['Embarked'] = df['Embarked'].fillna('S')

    return df

# Run the same feature pipeline over both splits
train_fe, test_fe = (
    feature_engineering(split, is_train=flag)
    for split, flag in ((train, True), (test, False))
)

# Show how many training passengers fall under each grouped title
title_counts = train_fe['Title'].value_counts()
print("Title distribution:", title_counts, sep="\n")

Title distribution:
Title
Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: count, dtype: int64


In [3]:
def impute_age(train_df, test_df):
    """Fill missing ``Age`` with the median age of the row's (Sex, Pclass) group.

    The group medians are computed over the rows of both frames pooled
    together. NOTE(review): this means test rows influence the values imputed
    into the training split - confirm that is intended.

    Neither input frame is modified; imputed copies are returned. A row whose
    group has no usable median (its ``Sex``/``Pclass`` key is missing, or every
    age in the group is missing) keeps a missing ``Age``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames providing ``Sex``, ``Pclass`` and ``Age`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Copies of ``train_df`` and ``test_df`` with ``Age`` imputed.
    """
    combined = pd.concat([train_df, test_df], ignore_index=True)

    # Median age of each row's own (Sex, Pclass) group, in the row order of `combined`:
    # the first len(train_df) entries belong to the training rows, the rest to test.
    group_median = (
        combined.groupby(['Sex', 'Pclass'])['Age'].transform('median').to_numpy()
    )
    n_train = len(train_df)

    train_out = train_df.copy()
    test_out = test_df.copy()

    # Replacement is positional (plain array), so it does not depend on the frames'
    # index labels; observed ages are kept as they are.
    train_out['Age'] = train_out['Age'].where(train_out['Age'].notna(), group_median[:n_train])
    test_out['Age'] = test_out['Age'].where(test_out['Age'].notna(), group_median[n_train:])

    return train_out, test_out

# Impute Age
train_fe, test_fe = impute_age(train_fe, test_fe)

# Impute Fare (1 missing in test).
# The fill value is computed once from the training split and reused for the test
# split, so both splits go through the same transformation and the test rows do not
# determine their own fill value.
fare_fill = train_fe['Fare'].median()
train_fe['Fare'] = train_fe['Fare'].fillna(fare_fill)
test_fe['Fare'] = test_fe['Fare'].fillna(fare_fill)

print(f"Missing values after imputation:")
print(f"Train Age missing: {train_fe['Age'].isna().sum()}")
print(f"Test Age missing: {test_fe['Age'].isna().sum()}")
print(f"Test Fare missing: {test_fe['Fare'].isna().sum()}")

Missing values after imputation:
Train Age missing: 0
Test Age missing: 0
Test Fare missing: 0


In [4]:
def create_bins(train_df, test_df):
    """Add ``AgeBand``, ``FareBand`` and ``IsChild`` columns to both splits.

    The same fixed cut points are applied to each frame, so the bands mean the
    same thing in train and test. Neither input frame is modified; binned
    copies are returned.

    * ``AgeBand``  - labels 0-4 for ages in [0, 16], (16, 32], (32, 48],
                     (48, 64] and above 64.
    * ``FareBand`` - labels 0-3 for fares in (-1, 7.91], (7.91, 14.454],
                     (14.454, 31.0] and above 31.0.
    * ``IsChild``  - 1 when ``Age`` is at most 16, else 0.

    The top band of each feature is open-ended and the lowest age edge is
    inclusive, so an age of exactly 0 or an unusually large age/fare still
    receives a band instead of a missing value.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames providing ``Age`` and ``Fare`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Copies of ``train_df`` and ``test_df`` with the three columns added.
    """
    upper = float('inf')

    # Age edges in 16-year steps; the last band has no upper limit
    age_bins = [0, 16, 32, 48, 64, upper]
    age_labels = [0, 1, 2, 3, 4]

    # Fixed fare cut points (the original notebook comment describes them as 4 quantiles
    # based on training data); -1 keeps a fare of 0 inside the first band
    fare_bins = [-1, 7.91, 14.454, 31.0, upper]
    fare_labels = [0, 1, 2, 3]

    binned = []
    for frame in (train_df, test_df):
        frame = frame.copy()
        frame['AgeBand'] = pd.cut(frame['Age'], bins=age_bins, labels=age_labels, include_lowest=True)
        frame['FareBand'] = pd.cut(frame['Fare'], bins=fare_bins, labels=fare_labels)
        # A missing Age compares False here and therefore yields 0
        frame['IsChild'] = (frame['Age'] <= 16).astype(int)
        binned.append(frame)

    return binned[0], binned[1]

# Add the banded age/fare columns and the child flag to both splits
train_fe, test_fe = create_bins(train_fe, test_fe)

# Passenger count per age band in the training split, in band order
age_band_counts = train_fe['AgeBand'].value_counts().sort_index()
print("AgeBand distribution:", age_band_counts, sep="\n")

AgeBand distribution:
AgeBand
0    100
1    493
2    218
3     69
4     11
Name: count, dtype: int64


In [5]:
# Columns fed to the model directly (cast to float when the matrices are assembled)
feature_cols = [
    'Pclass', 'Sex', 'AgeBand', 'FareBand',
    'FamilySize', 'IsAlone', 'Has_Cabin', 'IsChild',
]

# Nominal columns expanded into one indicator column per level
nominal_cols = ['Title', 'Embarked']
train_encoded = pd.get_dummies(train_fe[nominal_cols], columns=nominal_cols)
test_encoded = pd.get_dummies(test_fe[nominal_cols], columns=nominal_cols)

# Give both indicator frames the union of their columns; a level that one split
# lacks becomes an all-zero column there
train_encoded, test_encoded = train_encoded.align(
    test_encoded, join='outer', axis=1, fill_value=0
)

# Final design matrices and target
X_train = pd.concat(
    [train_fe[feature_cols].astype(float), train_encoded], axis='columns'
)
X_test = pd.concat(
    [test_fe[feature_cols].astype(float), test_encoded], axis='columns'
)
y_train = train_fe['Survived']

print(f"X_train shape: {X_train.shape}", f"X_test shape: {X_test.shape}", sep="\n")
print(f"\nFeatures: {X_train.columns.tolist()}")

X_train shape: (891, 16)
X_test shape: (418, 16)

Features: ['Pclass', 'Sex', 'AgeBand', 'FareBand', 'FamilySize', 'IsAlone', 'Has_Cabin', 'IsChild', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']


In [6]:
# Stratified 5-fold splitter; shuffling is seeded so the folds are repeatable
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Baseline model: 100 entropy-criterion trees with depth capped at 6
rf_params = dict(
    n_estimators=100,
    max_depth=6,
    criterion='entropy',
    random_state=42,
)
rf = RandomForestClassifier(**rf_params)

# One accuracy score per fold
cv_scores = cross_val_score(rf, X_train, y_train, cv=kfold, scoring='accuracy')

mean_acc, std_acc = cv_scores.mean(), cv_scores.std()
print(f"CV Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {mean_acc:.5f} (+/- {std_acc:.5f})")

CV Accuracy Scores: [0.83798883 0.81460674 0.8258427  0.83146067 0.83707865]
Mean CV Accuracy: 0.82940 (+/- 0.00858)


In [7]:
# Refit on every training row, then score the test split
predictions = rf.fit(X_train, y_train).predict(X_test)

# Submission frame: one row per test passenger with the predicted label as an integer
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions.astype(int)
})

# Write the same file to the submission folder and to this experiment's folder
for out_path in (
    '/home/submission/submission.csv',
    '/home/code/experiments/001_baseline/submission.csv',
):
    submission.to_csv(out_path, index=False)

print(f"Submission shape: {submission.shape}")
print(f"\nPrediction distribution:")
print(submission['Survived'].value_counts())
print(f"\nFirst 10 rows:")
print(submission.head(10))

Submission shape: (418, 2)

Prediction distribution:
Survived
0    255
1    163
Name: count, dtype: int64

First 10 rows:
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0


In [8]:
# Pair each column of the design matrix with the fitted forest's importance score
importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_
})
# Most influential features first
importance = importance.sort_values(by='importance', ascending=False)

print("Feature Importance:", importance, sep="\n")

Feature Importance:
         feature  importance
10      Title_Mr    0.231436
1            Sex    0.176498
0         Pclass    0.119680
4     FamilySize    0.090180
3       FareBand    0.068441
6      Has_Cabin    0.062281
9     Title_Miss    0.054574
11     Title_Mrs    0.046749
2        AgeBand    0.042069
8   Title_Master    0.021684
13    Embarked_C    0.018069
15    Embarked_S    0.016684
5        IsAlone    0.016123
12    Title_Rare    0.013822
7        IsChild    0.013196
14    Embarked_Q    0.008515
