In [1]:
import numpy as np
import pandas as pd
# Let pandas use up to 1500 characters per line before wrapping wide frames in text output.
pd.options.display.width = 1500

from titanic.titanic_data import load_titanic_data, split_data, get_data_preprocessor

# Read and split the data

In [2]:
# load_titanic_data() yields three objects: the full training features and their labels
# (split and cross-validated below), and X_pred, the frame that the final cell generates
# predictions for.
X_train_full, y_train_full, X_pred = load_titanic_data()

In [3]:
# Carve a validation split out of the full training data. test_size=0.2 and random_state=42
# are forwarded to split_data.
# NOTE(review): presumably a 20% hold-out with a fixed seed - confirm in titanic.titanic_data.
X_train, X_valid, y_train, y_valid = split_data(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define pre-processing of the data

In [4]:
# get_data_preprocessor() returns the transformer plus the names of the columns it produces;
# the names are used in the next cell to label the transformed array. This `preprocessor`
# object is shared by every model-building function further down.
preprocessor, preprocessed_column_names = get_data_preprocessor()

In [5]:
# Preview the transformed training features as a labelled frame.
# Note that fit_transform() also fits the shared `preprocessor` on X_train as a side effect.
pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    index=X_train.index,
    columns=preprocessed_column_names,
).head()

PassengerId,Pclass,Female,Male,C,Q,S,Master,Miss,Mr,Mrs,Others,Fare,Family,Age_Missing,Age
332,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.38439,0.0,0.0,3.0
734,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.639057,0.0,0.0,2.0
383,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.188856,0.0,0.0,3.0
705,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.180892,1.0,0.0,2.0
814,3.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,3.474293,6.0,0.0,1.0


# Define the model and the pipeline

In [6]:
from utils.scoring import score_model, score_cross_val
from utils.training import fit_model

# Random Forest-based model

In [7]:
from sklearn.ensemble import RandomForestClassifier

def get_random_forest_model(X_train, y_train, X_valid, y_valid, n_estimators=100, random_state=42):
    """Fit a random-forest pipeline on the training split and score it on the validation split.

    The classifier is handed to ``fit_model`` together with the notebook-level
    ``preprocessor`` and the result is evaluated with ``score_model``. Both
    helpers are imported from ``utils``; their internals are not visible here.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, forwarded to ``fit_model``.
    X_valid, y_valid
        Held-out features and labels, forwarded to ``score_model``.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed passed to ``RandomForestClassifier``. The default is the value
        that used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    tuple
        ``(model, score)``: the object returned by ``fit_model`` and the value
        returned by ``score_model`` for the validation split.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    # `preprocessor` is the notebook-level object created by get_data_preprocessor().
    fitted = fit_model(forest, preprocessor, X_train, y_train)
    validation_score = score_model(fitted, X_valid, y_valid)
    return fitted, validation_score

# Keep the validation score under a real name: binding it to `_` would shadow IPython's
# last-output variable and throw the value away.
rf_model, rf_valid_score = get_random_forest_model(X_train, y_train, X_valid, y_valid)

Accuracy: 0.8435754189944135


In [8]:
# Cross-validate the random-forest pipeline on the full training data (before the
# train/validation split). The value returned by score_cross_val is the cell's last
# expression, so it is echoed in addition to whatever the helper prints.
# NOTE(review): presumably the pipeline is re-fitted per fold rather than reused as fitted -
# confirm in utils.scoring.
score_cross_val(rf_model, X_train_full, y_train_full)

Scores: [0.79888268 0.79213483 0.83707865 0.7752809  0.81460674]
Avg score: 0.8035967610319503


0.8035967610319503

# XGBoost-based model

In [9]:
from xgboost import XGBClassifier

def get_xgb_model(X_train, y_train, X_valid, y_valid, n_estimators=1000, learning_rate=0.05,
                  early_stopping_rounds=5, n_jobs=4):
    """Fit an XGBoost pipeline with early stopping and score it on the validation split.

    The classifier is handed to ``fit_model`` together with the notebook-level
    ``preprocessor``; the validation split is supplied as the early-stopping
    evaluation set. Both ``fit_model`` and ``score_model`` are imported from
    ``utils`` and their internals are not visible here.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, forwarded to ``fit_model``.
    X_valid, y_valid
        Held-out features and labels. They are used twice: as the
        early-stopping evaluation set and as the data scored by
        ``score_model``, so the returned score may be somewhat optimistic.
    n_estimators : int, default 1000
        Upper bound on the number of boosting rounds.
    learning_rate : float, default 0.05
        Boosting learning rate.
    early_stopping_rounds : int, default 5
        Forwarded as ``model__early_stopping_rounds``. The default is the value
        that used to be hard-coded.
    n_jobs : int, default 4
        Passed to ``XGBClassifier``. The default is the value that used to be
        hard-coded.

    Returns
    -------
    tuple
        ``(model, score)``: the object returned by ``fit_model`` and the value
        returned by ``score_model`` for the validation split.
    """
    booster = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, n_jobs=n_jobs)

    # The evaluation set is transformed by hand before being passed along.
    # NOTE(review): presumably fit_model routes the `model__*` keywords straight to the
    # estimator step, bypassing the preprocessor - confirm in utils.training.
    # This also (re)fits the shared notebook-level `preprocessor` on X_train.
    X_valid_transformed = preprocessor.fit(X_train).transform(X_valid)

    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the estimator
    # rather than in fit(); check against the installed version before upgrading.
    fitted = fit_model(
        booster,
        preprocessor,
        X_train,
        y_train,
        model__early_stopping_rounds=early_stopping_rounds,
        model__eval_set=[(X_valid_transformed, y_valid)],
        model__verbose=False,
    )
    validation_score = score_model(fitted, X_valid, y_valid)
    return fitted, validation_score

# Keep the validation score under a real name: binding it to `_` would shadow IPython's
# last-output variable and throw the value away.
xgb_model, xgb_valid_score = get_xgb_model(X_train, y_train, X_valid, y_valid)

Accuracy: 0.8491620111731844


In [10]:
# Cross-validate the XGBoost pipeline on the full training data (before the
# train/validation split). The value returned by score_cross_val is the cell's last
# expression, so it is echoed in addition to whatever the helper prints.
# NOTE(review): no eval_set / early-stopping keywords are passed here, so presumably each
# fold trains without early stopping - confirm in utils.scoring.
score_cross_val(xgb_model, X_train_full, y_train_full)

Scores: [0.80446927 0.79775281 0.85393258 0.81460674 0.84831461]
Avg score: 0.8238152030632101


0.8238152030632101

# Predict

In [11]:
from utils.predicting import store_predictions

# Run each fitted pipeline on X_pred, passing X_pred's index alongside. The last argument
# is a short label per model; the labels appear as "rf:" / "xgb:" in the cell output.
# NOTE(review): presumably store_predictions also persists the predictions somewhere keyed
# by that label - confirm in utils.predicting.
store_predictions(rf_model, X_pred, X_pred.index, "rf")
store_predictions(xgb_model, X_pred, X_pred.index, "xgb")

rf:
[0 0 0 0 1 0 1 0 1 0 0 1 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 0 1 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0]...
xgb:
[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0
 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 0]...
