In [1]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from typing import Union
from xgboost import XGBClassifier

In [2]:
# Loading CSVs into Pandas DataFrame
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
X_test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Preprocessing

In [3]:
def get_is_male(row: pd.Series) -> bool:
    """
    Creates boolen indicator value for men in Titanic data.

    Args:
        row (pd.Series): A row of Titanic data containing a 'Sex' column.

    Returns:
        Bool: True if Sex=male, False otherwise.
    """
    return row['Sex'] == 'male'

In [4]:
def get_is_child(row: pd.Series) -> bool:
    """
    Creates boolen indicator value for childrean in Titanic data.  Defines a child
    to be <18.

    Args:
        row (pd.Series): A row of Titanic data containing a 'Age' column.

    Returns:
        Bool: True if Age<18, False otherwise.
    """
    return row['Age'] < 18

In [5]:
def get_is_male_is_child(row: pd.Series) -> str:
    """
    Creates compositie indicator variable for validation set creation.  For this
    problem, we have the features for the test set.  The proportions of women and
    children in the training set is different than that in the test set.  This
    creates lower model preformance on the test features than the validation
    features.  We use this indicator to create a custom validation set based on the
    proportions of women and children in the test set.
    
    Args:
        row (pd.Series): A row of Titanic data containing 'is_male', and 'is_child'
        columns.
        
    Returns:
        str: Concatenated is_male and is_child columns from row.
    """
    return f"{row['is_male']}_{row['is_child']}"

## Creating Training and Validation Sets

In [6]:
train_df['is_male'] = train_df.apply(get_is_male, axis=1)
X_test['is_male'] = X_test.apply(get_is_male, axis=1)

train_df['is_child'] = train_df.apply(get_is_child, axis=1)
X_test['is_child'] = X_test.apply(get_is_child, axis=1)

train_df['is_male_is_child'] = train_df.apply(get_is_male_is_child, axis=1)
X_test['is_male_is_child'] = X_test.apply(get_is_male_is_child, axis=1)

In [7]:
# Creating X_val based on the proportion of women and children in X_test
val_df = pd.DataFrame()

test_proxy_proportions = X_test["is_male_is_child"].value_counts(normalize=True)
for proxy_group, proportion in test_proxy_proportions.items():
    group_data = train_df[train_df['is_male_is_child'] == proxy_group]
    n_samples = int(proportion * len(train_df) * 0.20)  # 20% for validation set
    # Sample without replacement from the group
    sampled_group_data = group_data.sample(n=n_samples, replace=False)
    val_df = pd.concat([val_df, sampled_group_data])

# Drop validation samples from training data to avoid data leakage.
train_df = train_df.drop(val_df.index)

In [8]:
# Separating features and targets in training and validation sets.
y_train = train_df.Survived
X_train = train_df.drop(columns=['Survived'])

y_val = val_df.Survived
X_val = val_df.drop(columns=['Survived'])

## Creating Features

In [9]:
def get_deck(row: pd.Series) -> str:
    """
    Gets the Deck of a Titanic Ticket.  The Deck of a Titanic Ticket is the first
    Character of the Ticket if there's a space in it.  Else 'Numeric'.

    Args:
        row (pd.Series): Row of titanic training data.

    Returns:
        str: Deck of a Titanic Ticket.
    """
    ticket = row['Ticket']
    if len(ticket.split()) > 1:
        return ticket[0]
    else:
        return 'Numeric'

In [10]:
def preprocess(X: pd.DataFrame) -> pd.DataFrame:
    """
    Generates relevant features to predict whether a passenger survived the
    Titanic disaster.
    
    Args:
        X (pd.DataFrame): Features.

    Returns:
        pd.DataFrame: Subset of X with engineered columns defined by features
            where multiclass variables are binarized.
    """
    # Making a deep copy of X to not alter our original training data.
    X_preprocessed = X.copy()

    X_preprocessed['Deck'] = X_preprocessed.apply(get_deck, axis=1)

    # Assuming all passengers with null ages are 40.
    X_preprocessed['Age_not_null'] = X_preprocessed['Age'].fillna(40)
    # Assuming all passengers will null Fares paid $0.
    X_preprocessed['Fare_not_null'] = X_preprocessed['Fare'].fillna(0)

    # Creating features based on Cabin.
    X_preprocessed['no_cabin'] = X_preprocessed.apply(
        lambda row: pd.isnull(row['Cabin']), axis=1)
    X_preprocessed['cabin_first_char'] = X_preprocessed['Cabin'].apply(
        lambda row: row[0] if pd.notnull(row) else row)

    # The size of a passenger's family is the number of siblings plus the number of
    # parents plus themselves.
    X_preprocessed['num_family_members'] = X_preprocessed['SibSp']+X_preprocessed['Parch']+1
    X_preprocessed['family_size'] = pd.cut(
        x=X_preprocessed['num_family_members'],
        bins=[0, 2, 5, float('inf')],
        labels=['alone', 'small', 'big'],
        right=False
    )

    # One hot encoding gender, embarkment location, Deck, cabin_first_char, and
    # family_size.
    X_preprocessed = pd.get_dummies(X_preprocessed, columns=[
        'cabin_first_char', 'Deck', 'Embarked', 'family_size', 'Pclass'])
    return X_preprocessed

# Models

In [11]:
def evaluate_model(model: BaseEstimator,
                   X_train: pd.DataFrame,
                   y_train: pd.Series,
                   X_val: pd.DataFrame,
                   y_val: pd.Series) -> dict[str: Union[str, float]]:
    """
    Returns accuracy on training and validation sets. 
    
    Args:
        X_train (pd.DataFrame): Training features.
        X_val (pd.DataFrame): Validation features.
    
    Returns:
        (dict[str: Union[str, float]]): dictionary of training and validation
            accuracies for model.
    """
    y_pred_train = model.predict(X_train).round()
    y_pred_val = model.predict(X_val).round()

    train_accuracy = accuracy_score(
        y_true=y_train, y_pred=y_pred_train)
    val_accuracy = accuracy_score(
        y_true=y_val, y_pred=y_pred_val)

    return {
        'model': model.estimator,
        'training_accuracy': train_accuracy,
        'validation_accuracy': val_accuracy
    }

## Gender Model

In [12]:
class GenderModel(BaseEstimator):
    estimator = "GenderModel"
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "GenderModel":
        """
        Fits a GenderModel for the given training data.

        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.
            y (pd.Series): Boolean target variable.

        Retruns:
            GenderModel: A fit GenderModel to the training data.
        """
        return self
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predicts whether a passenger survives the Titanic solely based on gender.
        Women are predicted to survive, men are not.
    
        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.
    
        Returns:
            pd.Series: True if female, False otherwise.
        """
        return np.where(X['Sex'] == 'female', 1, 0)
    def score(self, X: pd.DataFrame, y_true: pd.DataFrame) -> float:
        """
        Returns the accuracy of predicting a GenderModel on X against y.

        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.

        Returns:
            float: accuracy of predictinging a GenderModel on X against y.
        """
        y_pred = self.predict(X)
        return (y_pred == y_true).mean()

In [13]:
gender_model = GridSearchCV(GenderModel(), param_grid={})

gender_model.fit(X_train, y_train)

## Random Forest
### Feature Selection

In [14]:
features_rf = ["is_male", "Age_not_null", "Fare_not_null", "Pclass_1", "Pclass_2",
               "Pclass_3"]

X_train_preprocessed_rf = preprocess(X_train).reindex(columns=features_rf)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_rf = preprocess(X_val).reindex(columns=features_rf)

### Hyperparameter Tuning

In [15]:
# Trying ~10 combinations takes about 10 seconds.
param_grid_rf = [{
    'max_depth': np.arange(start=2, stop=3, step=1),
    'max_features': np.arange(start=2, stop=4, step=1),
    'n_estimators': [150, 200, 250]
}]

start_time = datetime.datetime.now()
random_forest_model = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid_rf,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
random_forest_model.fit(X_train_preprocessed_rf, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"Random Forest:\tGridSearchCV took {gridsearch_time}")

Random Forest:	GridSearchCV took 0:00:09.829292


In [16]:
evaluate_model(
    model=random_forest_model, X_train=X_train_preprocessed_rf,
    y_train=y_train, X_val=X_val_preprocessed_rf, y_val=y_val)

{'model': RandomForestClassifier(),
 'training_accuracy': 0.773109243697479,
 'validation_accuracy': 0.7909604519774012}

## XGBoost
### Feature Selection

In [17]:
features_xgb = ['Age', 'Fare', 'Pclass_2', 'Pclass_1', 'Pclass_3',
                'family_size_alone', 'family_size_big', 'family_size_small',
                'is_child', 'is_male', 'no_cabin']

X_train_preprocessed_xgb = preprocess(X_train).reindex(columns=features_xgb)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_xgb = preprocess(X_val).reindex(columns=features_xgb)

In [18]:
# Trying ~100 combinations takes about 1 minute.
param_grid_xgb = [{
    'n_estimators': [150],
    'max_depth': [3],
    'subsample': [0.6],
    'colsample_bytree': [0.5],
    'gamma': [2, 3],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [10]
}]

start_time = datetime.datetime.now()
xgboost_model = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid_xgb,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
xgboost_model.fit(X_train_preprocessed_xgb, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"XGBoost:\tGridSearchCV took {gridsearch_time}")

XGBoost:	GridSearchCV took 0:00:01.020474


In [19]:
evaluate_model(model=xgboost_model, X_train=X_train_preprocessed_xgb,
               y_train=y_train, X_val=X_val_preprocessed_xgb, y_val=y_val)

{'model': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
 'training_accuracy': 0.8403361344537815,
 'validation_accuracy': 0.8248587570621468}

## Neural Net

In [20]:
features_nn = ['Age_not_null', 'Fare_not_null', 'Pclass_2', 'Pclass_1',
               'Pclass_3', 'family_size_alone', 'family_size_big',
               'family_size_small', 'is_child', 'is_male', 'no_cabin']

X_train_preprocessed_nn = preprocess(X_train).reindex(columns=features_nn)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_nn = preprocess(X_val).reindex(columns=features_nn)

In [21]:
# Scale Features.
numeric_features_nn = ['Age_not_null', 'Fare_not_null']

# Initialize scaler.
scaler_nn = StandardScaler()
# Fit scaler on training data.
scaler_nn.fit(X_train_preprocessed_nn[numeric_features_nn])

# Scale only specified columns.
X_train_preprocessed_nn[numeric_features_nn] = scaler_nn.transform(
    X_train_preprocessed_nn[numeric_features_nn])
X_val_preprocessed_nn[numeric_features_nn] = scaler_nn.transform(
    X_val_preprocessed_nn[numeric_features_nn])

In [22]:
# Sequential model with 5 layers
nn_model = Sequential(
    [
        layers.Input(shape=((len(features_nn),))),
        layers.Dense(512, activation="relu", name="layer1"),
        layers.Dropout(0.2),
        layers.Dense(512, activation="relu", name="layer2"),
        layers.Dropout(0.3),
        layers.Dense(256, activation="relu", name="layer3"),
        layers.Dropout(0.3),
        layers.Dense(128, activation="relu", name="layer4"),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid', name="layer5"),
    ]
)

nn_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=1e-5),
    metrics=['accuracy']
)

# Setting estimator attribute for Leaderboard
nn_model.estimator = "NeuralNet"

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=15,
    restore_best_weights=True
)

# Fit the model with early stopping
history = nn_model.fit(
    X_train_preprocessed_nn, y_train,
    validation_data=(X_val_preprocessed_nn, y_val),
    epochs=100,
    batch_size=16,
    callbacks=[early_stopping]
)

Epoch 1/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5650 - loss: 0.6814 - val_accuracy: 0.6610 - val_loss: 0.6724
Epoch 2/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5963 - loss: 0.6712 - val_accuracy: 0.6610 - val_loss: 0.6603
Epoch 3/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6250 - loss: 0.6675 - val_accuracy: 0.6610 - val_loss: 0.6494
Epoch 4/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6171 - loss: 0.6592 - val_accuracy: 0.6723 - val_loss: 0.6388
Epoch 5/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5980 - loss: 0.6559 - val_accuracy: 0.6723 - val_loss: 0.6286
Epoch 6/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6480 - loss: 0.6377 - val_accuracy: 0.6780 - val_loss: 0.6182
Epoch 7/100
[1m45/45[0m [32m━━

## Logistic Regression

In [23]:
features_lr = ["Age_not_null", "is_male", "Fare_not_null", "Pclass_1",
               "Pclass_2", "Pclass_3", "no_cabin", "family_size_alone",
               "family_size_small", "family_size_big"]

X_train_preprocessed_lr = preprocess(X_train).reindex(columns=features_lr)
X_val_preprocessed_lr = preprocess(X_val).reindex(columns=features_lr)

In [24]:
# Scale Features.
numeric_features_lr = ['Age_not_null', 'Fare_not_null']

# Initialize scaler.
scaler_lr = StandardScaler()
# Fit scaler on training data.
scaler_lr.fit(X_train_preprocessed_lr[numeric_features_lr])

# Scale only specified columns.
X_train_preprocessed_lr[numeric_features_lr] = scaler_lr.transform(
    X_train_preprocessed_lr[numeric_features_lr])
X_val_preprocessed_lr[numeric_features_lr] = scaler_lr.transform(
    X_val_preprocessed_lr[numeric_features_lr])

In [25]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train_preprocessed_lr, y_train)

lr_model.estimator = "LogisticRegression"

In [26]:
evaluate_model(model=lr_model, X_train=X_train_preprocessed_lr, y_train=y_train,
               X_val=X_val_preprocessed_lr, y_val=y_val)

{'model': 'LogisticRegression',
 'training_accuracy': 0.8137254901960784,
 'validation_accuracy': 0.8135593220338984}

# Leaderboard

In [27]:
# Adding models to the Leaderboard.
gender_model_evaluation = evaluate_model(
    model=gender_model, X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val)

random_forest_evaluation = evaluate_model(
    model=random_forest_model, X_train=X_train_preprocessed_rf, y_train=y_train,
    X_val=X_val_preprocessed_rf, y_val=y_val)

xgb_evaluation = evaluate_model(
    model=xgboost_model, X_train=X_train_preprocessed_xgb,
    y_train=y_train, X_val=X_val_preprocessed_xgb, y_val=y_val)

nn_evaluation = evaluate_model(
    model=nn_model, X_train=X_train_preprocessed_nn,
    y_train=y_train, X_val=X_val_preprocessed_nn, y_val=y_val)

lr_evaluation = evaluate_model(
    model=lr_model, X_train=X_train_preprocessed_lr,
    y_train=y_train, X_val=X_val_preprocessed_lr, y_val=y_val)

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [28]:
leaderboard = pd.DataFrame([
    gender_model_evaluation, random_forest_evaluation, xgb_evaluation,
    nn_evaluation, lr_evaluation])

leaderboard.sort_values(by='validation_accuracy', ascending=False, inplace=True)

leaderboard.head(len(leaderboard))

Unnamed: 0,model,training_accuracy,validation_accuracy
2,"XGBClassifier(base_score=None, booster=None, c...",0.840336,0.824859
3,NeuralNet,0.827731,0.819209
4,LogisticRegression,0.813725,0.813559
1,RandomForestClassifier(),0.773109,0.79096
0,GenderModel(),0.787115,0.785311


# Submission

In [29]:
# Predicting on X_test.
X_test_preprocessed_xgb = preprocess(X_test).reindex(
    columns=features_xgb, fill_value=0)
y_pred_test = xgboost_model.predict(X_test_preprocessed_xgb)

submission_df = pd.DataFrame({
    "PassengerId": X_test.PassengerId,
    "Survived": y_pred_test
})

submission_df.head(50)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [30]:
submission_df.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")

Submission saved to submission.csv
