In [1]:
# TODO: Do a better job imputing missing values.
# https://www.kaggle.com/code/allohvk/titanic-missing-age-imputation-tutorial-advanced
# https://scikit-learn.org/stable/modules/impute.html
# $0 Fares could be for crew members.
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from typing import Union
from xgboost import XGBClassifier

In [2]:
# Loading CSVs into Pandas DataFrame
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
X_test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Preprocessing
## Creating Training and Validation Sets

In [3]:
y_train_val = train_df.Survived
X_train_val = train_df.drop('Survived', axis=1)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, stratify=y_train_val)

## Creating Features

In [4]:
import re

def get_title(row):
    # TODO: docstring
    # the title is the first word before the period
    pattern = r'\b(\w+)\.'
    match = re.search(pattern, row['Name'])
    if match:
        return match.group(1)
    return None

In [5]:
def get_title_group(row):
    # TODO: docstring.  The point of the title group is to infer age.
    title_group_dict = {
        'Mr':       'grown_man',
        'Miss':     'unmarried_female',
        'Mrs':      'married_female',
        'Master':   'child_male',
        'Rev': 	    'nobility',
        'Dr': 	    'nobility',
        'Col':      'grown_man',
        'Mlle':     'unmarried_female',
        'Capt':     'nobility',
        'Countess': 'unmarried_female',
        'Don': 	    'nobility',
        'Jonkheer': 'nobility',
        'Major': 	'nobility',
        'Mme': 	    'married_female',
        'Ms':       'unmarried_female',
        'Lady':     'married_female',
        'Sir':      'grown_man',
        'Dona':     'unmarried_female'
    }
    return title_group_dict[row['title']]

In [6]:
def impute_age(row, age_groups):
    # TODO: harden and include title
    if pd.isnull(row['Age']):
        # Get the mean age of passengers from the same gender and Pclass
        return age_groups.loc[(
            row['Sex'], row['Pclass'], row['title_group']),
        'Age']
    else:
        return row['Age']

In [7]:
def preprocess(X: pd.DataFrame) -> pd.DataFrame:
    """
    Generates relevant features to predict whether a passenger survived the
    Titanic disaster.
    
    Args:
        X (pd.DataFrame): Features.

    Returns:
        pd.DataFrame: Subset of X with engineered columns defined by features
            where multiclass variables are binarized.
    """
    # Making a deep copy of X to not alter our original training data.
    X_preprocessed = X.copy()

    X_preprocessed['is_male'] = X_preprocessed['Sex'] == 'male'
    X_preprocessed['is_child'] = X_preprocessed['Age'] < 18

    # TODO: comments
    X_preprocessed['title'] = X_preprocessed.apply(get_title, axis=1)
    X_preprocessed['title_group'] = X_preprocessed.apply(get_title_group, axis=1)

    # TODO pass age_groups to impute_age
    age_groups = X_preprocessed.groupby(['Sex', 'Pclass', 'title_group']).agg({
        'Age': 'mean'
    })
    
    # TODO: Impute age based on Pclass, Title (as it contains info on age)
    X_preprocessed['Age_not_null'] = X_preprocessed.apply(
        lambda row: impute_age(row, age_groups), axis=1)

    # Assuming all passengers will null Fares paid $0.
    X_preprocessed['Fare_not_null'] = X_preprocessed['Fare'].fillna(0)

    # Creating features based on Cabin.
    X_preprocessed['no_cabin'] = X_preprocessed.apply(
        lambda row: pd.isnull(row['Cabin']), axis=1)

    # The size of a passenger's family is the number of siblings plus the number of
    # parents plus themselves.
    X_preprocessed['num_family_members'] = X_preprocessed['SibSp']+X_preprocessed['Parch']+1
    X_preprocessed['family_size'] = pd.cut(
        x=X_preprocessed['num_family_members'],
        bins=[0, 2, 5, float('inf')],
        labels=['alone', 'small', 'big'],
        right=False
    )

    # Creating one hot encodings.
    X_preprocessed = pd.get_dummies(X_preprocessed, columns=[
        'Embarked', 'family_size', 'Pclass'])
    return X_preprocessed

# Models

In [8]:
def evaluate_model(model: BaseEstimator,
                   X_train: pd.DataFrame,
                   y_train: pd.Series,
                   X_val: pd.DataFrame,
                   y_val: pd.Series) -> dict[str: Union[str, float]]:
    """
    Returns accuracy on training and validation sets. 
    
    Args:
        X_train (pd.DataFrame): Training features.
        X_val (pd.DataFrame): Validation features.
    
    Returns:
        (dict[str: Union[str, float]]): dictionary of training and validation
            accuracies for model.
    """
    y_pred_train = model.predict(X_train).round()
    y_pred_val = model.predict(X_val).round()

    train_accuracy = accuracy_score(
        y_true=y_train, y_pred=y_pred_train)
    val_accuracy = accuracy_score(
        y_true=y_val, y_pred=y_pred_val)

    return {
        'model': model.estimator,
        'training_accuracy': train_accuracy,
        'validation_accuracy': val_accuracy
    }

## Gender Model

In [9]:
class GenderModel(BaseEstimator):
    estimator = "GenderModel"
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "GenderModel":
        """
        Fits a GenderModel for the given training data.

        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.
            y (pd.Series): Boolean target variable.

        Retruns:
            GenderModel: A fit GenderModel to the training data.
        """
        return self
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predicts whether a passenger survives the Titanic solely based on gender.
        Women are predicted to survive, men are not.
    
        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.
    
        Returns:
            pd.Series: True if female, False otherwise.
        """
        return np.where(X['Sex'] == 'female', 1, 0)
    def score(self, X: pd.DataFrame, y_true: pd.DataFrame) -> float:
        """
        Returns the accuracy of predicting a GenderModel on X against y.

        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.

        Returns:
            float: accuracy of predictinging a GenderModel on X against y.
        """
        y_pred = self.predict(X)
        return (y_pred == y_true).mean()

In [10]:
gender_model = GridSearchCV(GenderModel(), param_grid={})

gender_model.fit(X_train, y_train)

## Random Forest
### Feature Selection

In [11]:
features_rf = ["is_male", "Age_not_null", "Fare_not_null", "Pclass_1", "Pclass_2",
               "Pclass_3"]

X_train_preprocessed_rf = preprocess(X_train).reindex(columns=features_rf)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_rf = preprocess(X_val).reindex(columns=features_rf)

### Hyperparameter Tuning

In [12]:
# Trying ~10 combinations takes about 10 seconds.
param_grid_rf = [{
    'max_depth': np.arange(start=2, stop=3, step=1),
    'max_features': np.arange(start=2, stop=4, step=1),
    'n_estimators': [150, 200, 250]
}]

start_time = datetime.datetime.now()
random_forest_model = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid_rf,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
random_forest_model.fit(X_train_preprocessed_rf, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"Random Forest:\tGridSearchCV took {gridsearch_time}")

Random Forest:	GridSearchCV took 0:00:10.042719


In [13]:
evaluate_model(
    model=random_forest_model, X_train=X_train_preprocessed_rf,
    y_train=y_train, X_val=X_val_preprocessed_rf, y_val=y_val)

{'model': RandomForestClassifier(),
 'training_accuracy': 0.7724550898203593,
 'validation_accuracy': 0.8251121076233184}

## XGBoost
### Feature Selection

In [14]:
features_xgb = ['Age', 'Fare', 'Pclass_2', 'Pclass_1', 'Pclass_3',
                'family_size_alone', 'family_size_big', 'family_size_small',
                'is_male', 'no_cabin']

X_train_preprocessed_xgb = preprocess(X_train).reindex(columns=features_xgb)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_xgb = preprocess(X_val).reindex(columns=features_xgb)

In [15]:
# Trying ~100 combinations takes about 1 minute.
param_grid_xgb = [{
    'n_estimators': [150],
    'max_depth': [3],
    'subsample': [0.6],
    'colsample_bytree': [0.5],
    'gamma': [2, 3],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [10]
}]

start_time = datetime.datetime.now()
xgboost_model = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid_xgb,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
xgboost_model.fit(X_train_preprocessed_xgb, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"XGBoost:\tGridSearchCV took {gridsearch_time}")

XGBoost:	GridSearchCV took 0:00:00.988455


In [16]:
evaluate_model(model=xgboost_model, X_train=X_train_preprocessed_xgb,
               y_train=y_train, X_val=X_val_preprocessed_xgb, y_val=y_val)

{'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
 'training_accuracy': 0.8278443113772455,
 'validation_accuracy': 0.8071748878923767}

## Neural Net
### Feature Selection

In [17]:
features_nn = ['Age_not_null', 'Fare_not_null', 'Pclass_2', 'Pclass_1',
               'Pclass_3', 'family_size_alone', 'family_size_big',
               'family_size_small', 'is_child', 'is_male', 'no_cabin']

X_train_preprocessed_nn = preprocess(X_train).reindex(columns=features_nn)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_nn = preprocess(X_val).reindex(columns=features_nn)

In [18]:
# Scale Features.
numeric_features_nn = ['Age_not_null', 'Fare_not_null']

# Initialize scaler.
scaler_nn = StandardScaler()
# Fit scaler on training data.
scaler_nn.fit(X_train_preprocessed_nn[numeric_features_nn])

# Scale only specified columns.
X_train_preprocessed_nn[numeric_features_nn] = scaler_nn.transform(
    X_train_preprocessed_nn[numeric_features_nn])
X_val_preprocessed_nn[numeric_features_nn] = scaler_nn.transform(
    X_val_preprocessed_nn[numeric_features_nn])

### Training

In [19]:
# Sequential model with 5 layers
nn_model = Sequential(
    [
        layers.Input(shape=((len(features_nn),))),
        layers.Dense(512, activation="relu", name="layer1"),
        layers.Dropout(0.2),
        layers.Dense(512, activation="relu", name="layer2"),
        layers.Dropout(0.3),
        layers.Dense(256, activation="relu", name="layer3"),
        layers.Dropout(0.3),
        layers.Dense(128, activation="relu", name="layer4"),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid', name="layer5"),
    ]
)

nn_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-5),
    metrics=['accuracy']
)

# Setting estimator attribute for Leaderboard
nn_model.estimator = "NeuralNet"

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=15,
    restore_best_weights=True
)

# Fit the model with early stopping
history = nn_model.fit(
    X_train_preprocessed_nn, y_train,
    validation_data=(X_val_preprocessed_nn, y_val),
    epochs=100,
    batch_size=16,
    callbacks=[early_stopping]
)

Epoch 1/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.4940 - loss: 0.6892 - val_accuracy: 0.6996 - val_loss: 0.6744
Epoch 2/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5615 - loss: 0.6827 - val_accuracy: 0.6906 - val_loss: 0.6638
Epoch 3/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6461 - loss: 0.6716 - val_accuracy: 0.6951 - val_loss: 0.6544
Epoch 4/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6860 - loss: 0.6568 - val_accuracy: 0.6951 - val_loss: 0.6458
Epoch 5/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6753 - loss: 0.6537 - val_accuracy: 0.6861 - val_loss: 0.6374
Epoch 6/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6563 - loss: 0.6507 - val_accuracy: 0.6816 - val_loss: 0.6291
Epoch 7/100
[1m42/42[0m [32m━━

In [20]:
evaluate_model(model=nn_model, X_train=X_train_preprocessed_nn, y_train=y_train,
               X_val=X_val_preprocessed_nn, y_val=y_val)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


{'model': 'NeuralNet',
 'training_accuracy': 0.8188622754491018,
 'validation_accuracy': 0.8295964125560538}

## Logistic Regression
### Feature Selection

In [21]:
features_lr = ["Age_not_null", "is_male", "Fare_not_null", "Pclass_1",
               "Pclass_2", "Pclass_3", "no_cabin", "family_size_alone",
               "family_size_small", "family_size_big"]

X_train_preprocessed_lr = preprocess(X_train).reindex(columns=features_lr)
X_val_preprocessed_lr = preprocess(X_val).reindex(columns=features_lr)

In [22]:
# Scale Features.
numeric_features_lr = ['Age_not_null', 'Fare_not_null']

# Initialize scaler.
scaler_lr = StandardScaler()
# Fit scaler on training data.
scaler_lr.fit(X_train_preprocessed_lr[numeric_features_lr])

# Scale only specified columns.
X_train_preprocessed_lr[numeric_features_lr] = scaler_lr.transform(
    X_train_preprocessed_lr[numeric_features_lr])
X_val_preprocessed_lr[numeric_features_lr] = scaler_lr.transform(
    X_val_preprocessed_lr[numeric_features_lr])

### Hyperparameter Tuning

In [23]:
# Trying ~15 combinations takes about 1 second.
param_grid_lr = [{
    'penalty': ['l2'],
    'solver': ['newton-cholesky', 'liblinear'],
    'C': np.arange(start=.2, stop=.9, step=.1)
}]

start_time = datetime.datetime.now()
lr_model = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid_lr,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
lr_model.fit(X_train_preprocessed_lr, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"Logistic Regression:\tGridSearchCV took {gridsearch_time}")

Logistic Regression:	GridSearchCV took 0:00:00.526888


In [24]:
evaluate_model(model=lr_model, X_train=X_train_preprocessed_lr, y_train=y_train,
               X_val=X_val_preprocessed_lr, y_val=y_val)

{'model': LogisticRegression(),
 'training_accuracy': 0.8263473053892215,
 'validation_accuracy': 0.8161434977578476}

# Leaderboard

In [25]:
# Adding models to the Leaderboard.
gender_model_evaluation = evaluate_model(
    model=gender_model, X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val)

random_forest_evaluation = evaluate_model(
    model=random_forest_model, X_train=X_train_preprocessed_rf, y_train=y_train,
    X_val=X_val_preprocessed_rf, y_val=y_val)

xgb_evaluation = evaluate_model(
    model=xgboost_model, X_train=X_train_preprocessed_xgb,
    y_train=y_train, X_val=X_val_preprocessed_xgb, y_val=y_val)

nn_evaluation = evaluate_model(
    model=nn_model, X_train=X_train_preprocessed_nn,
    y_train=y_train, X_val=X_val_preprocessed_nn, y_val=y_val)

lr_evaluation = evaluate_model(
    model=lr_model, X_train=X_train_preprocessed_lr,
    y_train=y_train, X_val=X_val_preprocessed_lr, y_val=y_val)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [26]:
leaderboard = pd.DataFrame([
    gender_model_evaluation, random_forest_evaluation, xgb_evaluation,
    nn_evaluation, lr_evaluation])

leaderboard.sort_values(by='validation_accuracy', ascending=False, inplace=True)

leaderboard.head(len(leaderboard))

Unnamed: 0,model,training_accuracy,validation_accuracy
3,NeuralNet,0.818862,0.829596
1,RandomForestClassifier(),0.772455,0.825112
4,LogisticRegression(),0.826347,0.816143
0,GenderModel(),0.77994,0.807175
2,"XGBClassifier(base_score=None, booster=None, c...",0.827844,0.807175


# Submission

In [27]:
# Predicting on X_test.
X_test_preprocessed_lr = preprocess(X_test).reindex(columns=features_lr)
X_test_preprocessed_lr[numeric_features_lr] = scaler_lr.transform(
    X_test_preprocessed_lr[numeric_features_lr])
y_pred_test = lr_model.predict(X_test_preprocessed_lr)

In [28]:
submission_df = pd.DataFrame({
    "PassengerId": X_test.PassengerId,
    "Survived": y_pred_test
})

submission_df.head(50)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [29]:
submission_df.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")

Submission saved to submission.csv
