In [10]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from typing import Union
from xgboost import XGBRFClassifier

In [11]:
# Loading CSVs into Pandas DataFrame
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
X_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [12]:
# Splitting train_df into Training and Validation sets.
X_train_val = train_df.drop(columns=['Survived'])
y_train_val = train_df['Survived']

# Stratifying to ensure proportion of survivors is same in training and validation
# sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.20, stratify=y_train_val)

# Shared Functions

In [13]:
def evaluate_model(model: BaseEstimator,
                   X_train: pd.DataFrame,
                   y_train: pd.Series,
                   X_val: pd.DataFrame,
                   y_val: pd.Series) -> dict[str: Union[str, float]]:
    """
    Returns accuracy on training and validation sets. 
    
    Args:
        X_train (pd.DataFrame): Training features.
        X_val (pd.DataFrame): Validation features.
    
    Returns:
        (dict[str: Union[str, float]]): dictionary of training and validation
            accuracies for model.
    """
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    
    train_accuracy = accuracy_score(
        y_true=y_train, y_pred=y_pred_train)
    val_accuracy = accuracy_score(
        y_true=y_val, y_pred=y_pred_val)

    return {
        'model': model.estimator,
        'training_accuracy': train_accuracy,
        'validation_accuracy': val_accuracy
    }

# Preprocessing

In [14]:
def get_deck(row: pd.Series) -> str:
    """
    Gets the Deck of a Titanic Ticket.  The Deck of a Titanic Ticket is the first
    Character of the Ticket if there's a space in it.  Else 'Numeric'.

    Args:
        row (pd.Series): Row of titanic training data.

    Returns:
        str: Deck of a Titanic Ticket.
    """
    ticket = row['Ticket']
    if len(ticket.split()) > 1:
        return ticket[0]
    else:
        return 'Numeric'

In [15]:
def get_clean_name_words(row) -> list[str]:
    """
    Listifies, lowercases, and removes punctuation from row['Name'].

    Args:
        row (pd.Series): Row of titanic training data.

    Retruns:
        list[str]: A list of strings.  Each list is each token of row['Name']
        lowercased and stripped of punctuation.
    """
    name = row['Name']
    name_lower = name.lower()
    name_lower_no_punct = name_lower.translate(
        str.maketrans('', '', string.punctuation))
    return name_lower_no_punct.split()

In [16]:
def preprocess(X: pd.DataFrame) -> pd.DataFrame:
    """
    Generates relevant features to predict whether a passenger survived the
    Titanic disaster.
    
    Args:
        X (pd.DataFrame): Features.

    Returns:
        pd.DataFrame: Subset of X with engineered columns defined by features
            where multiclass variables are binarized.
    """  
    X['Deck'] = X.apply(get_deck, axis=1)
    X['Clean_Name_Words'] = X.apply(get_clean_name_words, axis=1)
    
    # Assuming all passengers with null ages are 40.
    X['Age_not_null'] = X['Age'].fillna(40)
    # Assuming all passengers will null Fares paid $0.
    X['Fare_not_null'] = X['Fare'].fillna(0)

    # Creating features based on Cabin.
    X['no_cabin'] = X.apply(lambda row: pd.isnull(row['Cabin']), axis=1)
    X['cabin_first_char'] = X['Cabin'].apply(
        lambda row: row[0] if pd.notnull(row) else row)

    # One hot encoding gender, embarkment location, Deck, and cabin_first_char.
    X = pd.get_dummies(X, columns=['Sex', 'Embarked', 'Deck',
                                     'cabin_first_char'])
    X.columns = ['is_male' if x == 'Sex_male' else x for x in X.columns]

    # Creating indicator features based on Name.
    X['is_married_adult_woman'] = X.apply(
        lambda row: not row['is_male']
        and 'mrs' in row['Clean_Name_Words']
        and row['Age_not_null'] >= 21, axis=1)
    X['is_unmarried_adult_woman'] = X.apply(
        lambda row: not row['is_male']
        and 'miss' in row['Clean_Name_Words']
        and row['Age_not_null'] >= 21, axis=1)
    X['is_master'] = X.apply(lambda row: 'master' in row['Clean_Name_Words'], axis=1)
    X['is_dr']     = X.apply(lambda row: 'dr' in row['Clean_Name_Words'], axis=1)
    X['is_rev']    = X.apply(lambda row: 'rev' in row['Clean_Name_Words'], axis=1)
    return X

# Models
## Gender Model

In [17]:
class GenderModel(BaseEstimator):
    estimator = "GenderModel"
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "GenderModel":
        """
        Fits a GenderModel for the given training data.

        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.
            y (pd.Series): Boolean target variable.

        Retruns:
            GenderModel: A fit GenderModel to the training data.
        """
        return self
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predicts whether a passenger survives the Titanic solely based on gender.
        Women are predicted to survive, men are not.
    
        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.
    
        Returns:
            pd.Series: True if female, False otherwise.
        """
        return np.where(X['Sex'] == 'female', 1, 0)
    def score(self, X: pd.DataFrame, y_true: pd.DataFrame) -> float:
        """
        Returns the accuracy of predicting a GenderModel on X against y.

        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.

        Returns:
            float: accuracy of predictinging a GenderModel on X against y.
        """
        y_pred = self.predict(X)
        return (y_pred == y_true).mean()

In [18]:
gender_model = GridSearchCV(GenderModel(), param_grid={})

gender_model.fit(X_train, y_train)

## Random Forest
### Feature Selection

In [19]:
features_rf = ["Pclass", "SibSp", "Parch", "is_male", "Embarked_C", "Embarked_Q",
               "Deck_A", "Deck_F", "Deck_P", "Deck_W", "no_cabin",
               "cabin_first_char_A", "cabin_first_char_B", "cabin_first_char_C",
               "cabin_first_char_D", "cabin_first_char_E", "cabin_first_char_F",
               "cabin_first_char_G", "cabin_first_char_T", "Age_not_null",
               "Fare_not_null", "is_married_adult_woman",
               "is_unmarried_adult_woman", "is_master", "is_dr", "is_rev"]

X_train_preprocessed_rf = preprocess(X_train).reindex(
    columns=features_rf, fill_value=0)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_rf = preprocess(X_val).reindex(
    columns=features_rf, fill_value=0)

### Hyperparameter Tuning

In [20]:
# Trying ~80 combinations takes about 2 minutes.
param_grid_rf = [{
    'max_depth': np.arange(start=1, stop=4, step=1),
    'max_features': np.arange(start=1, stop=10, step=1),
    'n_estimators': [100, 150, 200]
}]

start_time = datetime.datetime.now()
random_forest_model = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid_rf,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
random_forest_model.fit(X_train_preprocessed_rf, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"Random Forest:\tGridSearchCV took {gridsearch_time}")

Random Forest:	GridSearchCV took 0:01:39.369523


In [21]:
evaluate_model(
    model=random_forest_model, X_train=X_train_preprocessed_rf,
    y_train=y_train, X_val=X_val_preprocessed_rf, y_val=y_val)

{'model': RandomForestClassifier(),
 'training_accuracy': 0.824438202247191,
 'validation_accuracy': 0.8491620111731844}

## XGBoost
### Feature Selection

In [22]:
features_xgb = ["Pclass", "Age", "SibSp", "Parch", "Fare", "is_male",
                "Embarked_C", "Embarked_Q", "Deck_A", "Deck_F", "Deck_P", "Deck_W",
                "no_cabin", "cabin_first_char_A", "cabin_first_char_B",
                "cabin_first_char_C", "cabin_first_char_D", "cabin_first_char_E",
                "cabin_first_char_F", "cabin_first_char_G", "cabin_first_char_T",
                "is_married_adult_woman", "is_unmarried_adult_woman", "is_master",
                "is_dr", "is_rev"]

X_train_preprocessed_xgb = preprocess(X_train).reindex(
    columns=features_xgb, fill_value=0)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_xgb = preprocess(X_val).reindex(
    columns=features_xgb, fill_value=0)

In [35]:
1e-5

1e-05

In [45]:
# TODO: XGBoost is overfitting my training data.
# https://xgboost.readthedocs.io/en/stable/tutorials/categorical.html

# Trying ~100 combinations takes about 1 minute.
param_grid_xgb = [{
    'n_estimators': [200],
    'subsample': np.arange(start=0, stop=0.9, step=.1),
    'colsample_bynode': np.arange(start=0.3, stop=0.7, step=.1),
    'reg_lambda': [1e-1]
}]

start_time = datetime.datetime.now()
xgboost_model = GridSearchCV(
    estimator=XGBRFClassifier(),
    param_grid=param_grid_xgb,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
xgboost_model.fit(X_train_preprocessed_xgb, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"XGBoost:\tGridSearchCV took {gridsearch_time}")

XGBoost:	GridSearchCV took 0:00:20.271069


In [46]:
xgboost_model.best_params_

{'colsample_bynode': 0.5,
 'n_estimators': 200,
 'reg_lambda': 0.1,
 'subsample': 0.8}

In [47]:
evaluate_model(model=xgboost_model, X_train=X_train_preprocessed_xgb,
               y_train=y_train, X_val=X_val_preprocessed_xgb, y_val=y_val)

{'model': XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bytree=None, device=None,
                 early_stopping_rounds=None, enable_categorical=False,
                 eval_metric=None, feature_types=None, gamma=None,
                 grow_policy=None, importance_type=None,
                 interaction_constraints=None, max_bin=None,
                 max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 multi_strategy=None, n_estimators=None, n_jobs=None,
                 num_parallel_tree=None, objective='binary:logistic',
                 random_state=None, reg_alpha=None, ...),
 'training_accuracy': 0.8792134831460674,
 'validation_accuracy': 0.8491620111731844}

### Leaderboard

In [None]:
# Adding models to the Leaderboard.
gender_model_evaluation = evaluate_model(
    model=gender_model, X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val)

random_forest_evaluation = evaluate_model(
    model=random_forest_model, X_train=X_train_preprocessed_rf, y_train=y_train,
    X_val=X_val_preprocessed_rf, y_val=y_val)

xgb_evaluation = evaluate_model(
    model=xgboost_model, X_train=X_train_preprocessed_xgb,
    y_train=y_train, X_val=X_val_preprocessed_xgb, y_val=y_val)

In [None]:
leaderboard = pd.DataFrame([
gender_model_evaluation, random_forest_evaluation, xgb_evaluation])

leaderboard.sort_values(by='validation_accuracy', ascending=False, inplace=True)

leaderboard.head(len(leaderboard))

# Submission

In [None]:
# Predicting on X_test.
X_test_preprocessed_xgb = preprocess(X_test).reindex(
    columns=features_xgb, fill_value=0)
y_pred_test = xgboost_model.predict(X_test_preprocessed_xgb)

submission_df = pd.DataFrame({
    "PassengerId": X_test.PassengerId,
    "Survived": y_pred_test
})

submission_df.head()

In [None]:
submission_df.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")