In [24]:
# TODO: https://datascience.stackexchange.com/questions/35504/titanic-kaggle-data-why-am-i-getting-lower-accuracy-on-kaggle-submissions-than
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from typing import Union
from xgboost import XGBRFClassifier

In [25]:
# Loading CSVs into Pandas DataFrame
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
X_test = pd.read_csv('/kaggle/input/titanic/test.csv')

# Shared Functions

In [26]:
def evaluate_model(model: BaseEstimator,
                   X_train: pd.DataFrame,
                   y_train: pd.Series,
                   X_val: pd.DataFrame,
                   y_val: pd.Series) -> dict[str: Union[str, float]]:
    """
    Returns accuracy on training and validation sets. 
    
    Args:
        X_train (pd.DataFrame): Training features.
        X_val (pd.DataFrame): Validation features.
    
    Returns:
        (dict[str: Union[str, float]]): dictionary of training and validation
            accuracies for model.
    """
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    
    train_accuracy = accuracy_score(
        y_true=y_train, y_pred=y_pred_train)
    val_accuracy = accuracy_score(
        y_true=y_val, y_pred=y_pred_val)

    return {
        'model': model.estimator,
        'training_accuracy': train_accuracy,
        'validation_accuracy': val_accuracy
    }

# Preprocessing

In [27]:
def get_is_male(row: pd.Series) -> int:
    # TODO: docstring
    return row['Sex'] == 'male'

In [28]:
def get_is_child(row: pd.Series) -> int:
    # TODO: docstring
    # Defining a child to be < 18.
    return row['Age'] < 18

In [29]:
# Creating X_train and X_val based on the proportion of women and children in
# X_test
train_df['is_male'] = train_df.apply(get_is_male, axis=1)
X_test['is_male'] = X_test.apply(get_is_male, axis=1)
# Defining a child to be < 18.
train_df['is_child'] = train_df.apply(get_is_child, axis=1)
X_test['is_child'] = X_test.apply(get_is_child, axis=1)

train_df['is_male_is_child'] = train_df['is_male'].astype(str) + '_' + train_df['is_child'].astype(str)
X_test['is_male_is_child'] = X_test['is_male'].astype(str) + '_' + X_test['is_child'].astype(str)

In [30]:
# Initialize an empty DataFrame for the validation set
val_df = pd.DataFrame()

test_proxy_proportions = X_test['is_male_is_child'].value_counts(normalize=True)
# Iterate over each proxy group in the test set proportions
for proxy_group, proportion in test_proxy_proportions.items():
    # Filter training data for the current proxy group
    group_data = train_df[train_df['is_male_is_child'] == proxy_group]
    # Calculate the number of samples needed for this group in the validation set
    n_samples = int(proportion * len(train_df) * 0.25)  # 25% for validation set
    # Sample without replacement from the group
    sampled_group_data = group_data.sample(n=n_samples, random_state=42, replace=False)
    # Append sampled data to the validation set
    val_df = pd.concat([val_df, sampled_group_data])

# Drop validation samples from training data to avoid data leakage
train_df = train_df.drop(val_df.index)

In [31]:
y_train = train_df.Survived
X_train = train_df.drop(columns=['Survived'])
y_val = val_df.Survived
X_val = val_df.drop(columns=['Survived'])

In [32]:
def get_deck(row: pd.Series) -> str:
    """
    Gets the Deck of a Titanic Ticket.  The Deck of a Titanic Ticket is the first
    Character of the Ticket if there's a space in it.  Else 'Numeric'.

    Args:
        row (pd.Series): Row of titanic training data.

    Returns:
        str: Deck of a Titanic Ticket.
    """
    ticket = row['Ticket']
    if len(ticket.split()) > 1:
        return ticket[0]
    else:
        return 'Numeric'

In [33]:
def preprocess(X: pd.DataFrame) -> pd.DataFrame:
    """
    Generates relevant features to predict whether a passenger survived the
    Titanic disaster.
    
    Args:
        X (pd.DataFrame): Features.

    Returns:
        pd.DataFrame: Subset of X with engineered columns defined by features
            where multiclass variables are binarized.
    """
    # Making a deep copy of X to not alter our original training data.
    X_preprocessed = X.copy()

    X_preprocessed['Deck'] = X_preprocessed.apply(get_deck, axis=1)

    # Assuming all passengers with null ages are 40.
    X_preprocessed['Age_not_null'] = X_preprocessed['Age'].fillna(40)
    # Assuming all passengers will null Fares paid $0.
    X_preprocessed['Fare_not_null'] = X_preprocessed['Fare'].fillna(0)

    # Creating features based on Cabin.
    X_preprocessed['no_cabin'] = X_preprocessed.apply(
        lambda row: pd.isnull(row['Cabin']), axis=1)
    X_preprocessed['cabin_first_char'] = X_preprocessed['Cabin'].apply(
        lambda row: row[0] if pd.notnull(row) else row)

    # One hot encoding gender, embarkment location, Deck, and cabin_first_char.
    X_preprocessed = pd.get_dummies(X_preprocessed, columns=[
        'Embarked', 'Deck', 'cabin_first_char'])
    return X_preprocessed

# Models
## Gender Model

In [34]:
class GenderModel(BaseEstimator):
    estimator = "GenderModel"
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> "GenderModel":
        """
        Fits a GenderModel for the given training data.

        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.
            y (pd.Series): Boolean target variable.

        Retruns:
            GenderModel: A fit GenderModel to the training data.
        """
        return self
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predicts whether a passenger survives the Titanic solely based on gender.
        Women are predicted to survive, men are not.
    
        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.
    
        Returns:
            pd.Series: True if female, False otherwise.
        """
        return np.where(X['Sex'] == 'female', 1, 0)
    def score(self, X: pd.DataFrame, y_true: pd.DataFrame) -> float:
        """
        Returns the accuracy of predicting a GenderModel on X against y.

        Args:
            X (pd.DataFrame): Pandas Dataframe containing 'Sex' column.

        Returns:
            float: accuracy of predictinging a GenderModel on X against y.
        """
        y_pred = self.predict(X)
        return (y_pred == y_true).mean()

In [35]:
gender_model = GridSearchCV(GenderModel(), param_grid={})

gender_model.fit(X_train, y_train)

## Random Forest
### Feature Selection

In [36]:
features_rf = ["is_male", "Fare_not_null", "no_cabin", "Pclass", "Age_not_null"]

X_train_preprocessed_rf = preprocess(X_train).reindex(
    columns=features_rf, fill_value=0)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_rf = preprocess(X_val).reindex(
    columns=features_rf, fill_value=0)

### Hyperparameter Tuning

In [37]:
# Trying ~30 combinations takes about 45 seconds.
param_grid_rf = [{
    'max_depth': np.arange(start=1, stop=4, step=1),
    'max_features': np.arange(start=1, stop=4, step=1),
    'n_estimators': [100, 150, 200, 250]
}]

start_time = datetime.datetime.now()
random_forest_model = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid_rf,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
random_forest_model.fit(X_train_preprocessed_rf, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"Random Forest:\tGridSearchCV took {gridsearch_time}")

Random Forest:	GridSearchCV took 0:00:53.696880


In [38]:
evaluate_model(
    model=random_forest_model, X_train=X_train_preprocessed_rf,
    y_train=y_train, X_val=X_val_preprocessed_rf, y_val=y_val)

{'model': RandomForestClassifier(),
 'training_accuracy': 0.8256333830104322,
 'validation_accuracy': 0.8272727272727273}

## XGBoost
### Feature Selection

In [39]:
features_xgb = ["is_male", "Fare", "no_cabin", "Pclass", "Age"]

X_train_preprocessed_xgb = preprocess(X_train).reindex(
    columns=features_xgb, fill_value=0)

# Ensuring columns in X_val_preprocessed_rf are the same as the columns in
# X_train_preprocessed_rf since we have one hot encodings
X_val_preprocessed_xgb = preprocess(X_val).reindex(
    columns=features_xgb, fill_value=0)

In [40]:
# TODO: XGBoost is overfitting my training data.
# https://xgboost.readthedocs.io/en/stable/tutorials/categorical.html

# Trying ~100 combinations takes about 1 minute.
param_grid_xgb = [{
    'n_estimators': [200],
    'subsample': np.arange(start=0, stop=0.9, step=.1),
    'colsample_bynode': np.arange(start=0.3, stop=0.7, step=.1),
    'reg_lambda': [0.1, 1, 5]
}]

start_time = datetime.datetime.now()
xgboost_model = GridSearchCV(
    estimator=XGBRFClassifier(),
    param_grid=param_grid_xgb,
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    # Using roc_auc as we have a class imbalance.
    scoring='roc_auc'
)
xgboost_model.fit(X_train_preprocessed_xgb, y_train)
end_time = datetime.datetime.now()

gridsearch_time = end_time-start_time
print(f"XGBoost:\tGridSearchCV took {gridsearch_time}")

XGBoost:	GridSearchCV took 0:00:43.478958


In [41]:
xgboost_model.best_params_

{'colsample_bynode': 0.4,
 'n_estimators': 200,
 'reg_lambda': 0.1,
 'subsample': 0.8}

In [42]:
evaluate_model(model=xgboost_model, X_train=X_train_preprocessed_xgb,
               y_train=y_train, X_val=X_val_preprocessed_xgb, y_val=y_val)

{'model': XGBRFClassifier(base_score=None, booster=None, callbacks=None,
                 colsample_bylevel=None, colsample_bytree=None, device=None,
                 early_stopping_rounds=None, enable_categorical=False,
                 eval_metric=None, feature_types=None, gamma=None,
                 grow_policy=None, importance_type=None,
                 interaction_constraints=None, max_bin=None,
                 max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 multi_strategy=None, n_estimators=None, n_jobs=None,
                 num_parallel_tree=None, objective='binary:logistic',
                 random_state=None, reg_alpha=None, ...),
 'training_accuracy': 0.8584202682563339,
 'validation_accuracy': 0.8363636363636363}

### Leaderboard

In [43]:
# Adding models to the Leaderboard.
gender_model_evaluation = evaluate_model(
    model=gender_model, X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val)

random_forest_evaluation = evaluate_model(
    model=random_forest_model, X_train=X_train_preprocessed_rf, y_train=y_train,
    X_val=X_val_preprocessed_rf, y_val=y_val)

xgb_evaluation = evaluate_model(
    model=xgboost_model, X_train=X_train_preprocessed_xgb,
    y_train=y_train, X_val=X_val_preprocessed_xgb, y_val=y_val)

In [44]:
leaderboard = pd.DataFrame([
gender_model_evaluation, random_forest_evaluation, xgb_evaluation])

leaderboard.sort_values(by='validation_accuracy', ascending=False, inplace=True)

leaderboard.head(len(leaderboard))

Unnamed: 0,model,training_accuracy,validation_accuracy
2,"XGBRFClassifier(base_score=None, booster=None,...",0.85842,0.836364
1,RandomForestClassifier(),0.825633,0.827273
0,GenderModel(),0.780924,0.804545


# Submission

In [45]:
# Predicting on X_test.
X_test_preprocessed_rf = preprocess(X_test).reindex(
    columns=features_rf, fill_value=0)
y_pred_test = random_forest_model.predict(X_test_preprocessed_rf)

submission_df = pd.DataFrame({
    "PassengerId": X_test.PassengerId,
    "Survived": y_pred_test
})

submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [46]:
submission_df.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")

Submission saved to submission.csv
