# Experiment 001: Baseline Random Forest with Feature Engineering

Following the strategy:
1. Feature engineering: Title extraction, FamilySize, IsAlone, Has_Cabin
2. Missing value handling: Age by Title median, Embarked by mode, Fare by Pclass median
3. Model: Random Forest with Stratified 5-Fold CV

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTarget distribution:")
print(train['Survived'].value_counts(normalize=True))

Train shape: (891, 12)
Test shape: (418, 11)

Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [2]:
def engineer_features(df, train_df=None):
    """Apply feature engineering based on strategy"""
    df = df.copy()
    
    # 1. Title extraction (MOST IMPORTANT)
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    # 2. Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # 3. Cabin features
    df['Has_Cabin'] = df['Cabin'].notna().astype(int)
    
    return df

# Apply feature engineering
train = engineer_features(train)
test = engineer_features(test)

print("Titles in train:", train['Title'].value_counts().to_dict())

Titles in train: {'Mr': 517, 'Miss': 185, 'Mrs': 126, 'Master': 40, 'Rare': 23}


In [3]:
def fill_missing_values(train_df, test_df):
    """Fill missing values using training data statistics"""
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # Combine for consistent imputation
    combined = pd.concat([train_df, test_df], sort=False)
    
    # Age: Fill with median by Title (calculated from combined data)
    title_age_median = combined.groupby('Title')['Age'].median()
    
    for df in [train_df, test_df]:
        for title in df['Title'].unique():
            mask = (df['Title'] == title) & (df['Age'].isna())
            if title in title_age_median:
                df.loc[mask, 'Age'] = title_age_median[title]
            else:
                df.loc[mask, 'Age'] = combined['Age'].median()
    
    # Embarked: Fill with mode from training data
    embarked_mode = train_df['Embarked'].mode()[0]
    train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)
    test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)
    
    # Fare: Fill with median by Pclass from training data
    for pclass in [1, 2, 3]:
        fare_median = train_df[train_df['Pclass'] == pclass]['Fare'].median()
        train_df.loc[(train_df['Pclass'] == pclass) & (train_df['Fare'].isna()), 'Fare'] = fare_median
        test_df.loc[(test_df['Pclass'] == pclass) & (test_df['Fare'].isna()), 'Fare'] = fare_median
    
    return train_df, test_df

train, test = fill_missing_values(train, test)

print("Missing values after imputation:")
print("Train:", train[['Age', 'Embarked', 'Fare']].isna().sum().to_dict())
print("Test:", test[['Age', 'Embarked', 'Fare']].isna().sum().to_dict())

Missing values after imputation:
Train: {'Age': 0, 'Embarked': 0, 'Fare': 0}
Test: {'Age': 0, 'Embarked': 0, 'Fare': 0}


In [4]:
# Prepare features for modeling
def prepare_features(train_df, test_df):
    """Encode categorical features and select final feature set"""
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # Encode categorical variables
    le_sex = LabelEncoder()
    le_embarked = LabelEncoder()
    le_title = LabelEncoder()
    
    # Fit on combined data to handle all categories
    combined_sex = pd.concat([train_df['Sex'], test_df['Sex']])
    combined_embarked = pd.concat([train_df['Embarked'], test_df['Embarked']])
    combined_title = pd.concat([train_df['Title'], test_df['Title']])
    
    le_sex.fit(combined_sex)
    le_embarked.fit(combined_embarked)
    le_title.fit(combined_title)
    
    train_df['Sex'] = le_sex.transform(train_df['Sex'])
    test_df['Sex'] = le_sex.transform(test_df['Sex'])
    
    train_df['Embarked'] = le_embarked.transform(train_df['Embarked'])
    test_df['Embarked'] = le_embarked.transform(test_df['Embarked'])
    
    train_df['Title'] = le_title.transform(train_df['Title'])
    test_df['Title'] = le_title.transform(test_df['Title'])
    
    # Select features
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
                'Title', 'FamilySize', 'IsAlone', 'Has_Cabin']
    
    X_train = train_df[features]
    y_train = train_df['Survived']
    X_test = test_df[features]
    
    return X_train, y_train, X_test, features

X_train, y_train, X_test, features = prepare_features(train, test)

print(f"Features: {features}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

Features: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Has_Cabin']
X_train shape: (891, 11)
X_test shape: (418, 11)


In [5]:
# Train Random Forest with Stratified 5-Fold CV
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(rf, X_train, y_train, cv=skf, scoring='accuracy')

print(f"CV Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.5f} (+/- {cv_scores.std():.5f})")
print(f"Min: {cv_scores.min():.5f}, Max: {cv_scores.max():.5f}")

CV Scores: [0.84357542 0.82022472 0.8258427  0.83707865 0.84269663]
Mean CV Accuracy: 0.83388 (+/- 0.00931)
Min: 0.82022, Max: 0.84358


In [6]:
# Train on full training data and make predictions
rf.fit(X_train, y_train)

# Feature importance
importance_df = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importance:")
print(importance_df.to_string(index=False))

Feature Importance:
   feature  importance
       Sex    0.297529
     Title    0.192762
      Fare    0.125950
    Pclass    0.087177
       Age    0.083613
 Has_Cabin    0.066232
FamilySize    0.053793
     SibSp    0.042855
  Embarked    0.021727
     Parch    0.017052
   IsAlone    0.011311


In [7]:
# Make predictions on test set
test_predictions = rf.predict(X_test)

# Load original test data for PassengerId
test_original = pd.read_csv('/home/data/test.csv')

# Create submission
submission = pd.DataFrame({
    'PassengerId': test_original['PassengerId'],
    'Survived': test_predictions
})

# Save submission
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"Submission saved with {len(submission)} predictions")
print(submission.head())
print(f"\nPrediction distribution: {pd.Series(test_predictions).value_counts().to_dict()}")

Submission saved with 418 predictions
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1

Prediction distribution: {0: 264, 1: 154}
