In [1]:
import numpy as np
import pandas as pd

# Widen pandas' text display so wide frames are not wrapped across lines.
pd.set_option("display.width", 1500)
# Register pandas' converters with matplotlib (called once, before any plotting).
pd.plotting.register_matplotlib_converters()

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Read and split the data

In [2]:
# Load the Kaggle Titanic CSVs; both frames use the passenger id as their row index.
csv_options = {"index_col": "PassengerId"}
train_data = pd.read_csv("/kaggle/input/titanic/train.csv", **csv_options)
test_data = pd.read_csv("/kaggle/input/titanic/test.csv", **csv_options)

In [3]:
# Features: every column except the label, copied so later edits never touch `train_data`.
X_train_full = train_data.drop(columns="Survived").copy()
# Target: the survival label.
Y_train_full = train_data["Survived"]

# The test frame is used as-is (an alias of `test_data`, not a copy).
X_test = test_data

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_full,
    Y_train_full,
    test_size=0.2,
    random_state=42,
)

# Define pre-processing of the data

In [5]:
import re

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# `handle_unknown="ignore"` keeps transform from failing on categories not seen in fit.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Title pattern: the words between ", " and the first literal "." that follows them,
# e.g. "Braund, Mr. Owen Harris" -> "Mr". A raw string avoids the invalid "\w" escape,
# and the dot is escaped so it cannot match an arbitrary character.
_TITLE_PATTERN = re.compile(r", ([\w ]+)\.")

# Titles kept as their own category; everything else is bucketed into "Others".
_COMMON_TITLES = ("Mr", "Mrs", "Miss", "Master")


def _extract_title(name):
    """Return the title embedded in a passenger name, or "" when none can be found.

    Non-string values (e.g. NaN) and names without a ", Title." part yield "" instead
    of raising; the next pipeline step maps that to "Others".
    """
    match = _TITLE_PATTERN.search(name) if isinstance(name, str) else None
    return match.group(1) if match else ""


def _collapse_rare_title(title):
    """Keep the common titles and replace any other value with "Others"."""
    return title if title in _COMMON_TITLES else "Others"


def _names_to_titles(df):
    """Apply `_extract_title` to every cell of the frame."""
    # Column-wise Series.map is the element-wise equivalent of the deprecated
    # DataFrame.applymap and works across pandas versions.
    return df.apply(lambda col: col.map(_extract_title))


def _rare_titles_to_others(df):
    """Apply `_collapse_rare_title` to every cell of the frame."""
    return df.apply(lambda col: col.map(_collapse_rare_title))


# Name column: extract the title, bucket rare titles, then one-hot encode the result.
name_transformer = Pipeline(steps=[
    ("convert_to_title", FunctionTransformer(_names_to_titles)),
    ("rare_to_others", FunctionTransformer(_rare_titles_to_others)),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Fare column: replace missing values with the fitted mean, then apply log(1 + x).
fare_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("log", FunctionTransformer(func=np.log1p)),
])

def _row_totals(df):
    """Sum the selected columns row-wise and return the totals as a one-column frame."""
    totals = df.sum(axis=1)
    return totals.to_frame()


# Combines the family-related count columns it is given into a single total per row.
family_transformer = FunctionTransformer(_row_totals)

def _flag_missing(df):
    """Return a frame of the same shape holding 1 where a value is missing and 0 elsewhere."""
    return df.apply(lambda column: np.where(column.isnull(), 1, 0))


# Records which entries were missing before any imputation replaces them.
remember_missing_transformer = FunctionTransformer(_flag_missing)

# A custom transformer for age group-mean, so that mean is from train set always (the one fitted to).
class GroupbyMeanTransformer(TransformerMixin, BaseEstimator):
    """Fill missing values of one column with per-group means learned during ``fit``.

    The means are computed only from the data passed to ``fit`` and reused unchanged
    in ``transform``, so validation/test rows are imputed with training statistics.

    Inheriting from ``BaseEstimator`` provides ``get_params``/``set_params``, which
    lets ``sklearn.base.clone`` (and therefore cross-validation) build a fresh,
    unfitted copy instead of deep-copying fitted state.

    Parameters
    ----------
    group_by_labels : str or list of str
        Column label(s) that define the groups.
    target_label : str
        Column whose missing values are filled.

    Attributes
    ----------
    group_mean : pandas.Series
        Mean of ``target_label`` per group, indexed by the group key(s).
    overall_mean : float
        Mean of ``target_label`` over all fitted rows; used as a fallback.
    """

    def __init__(self, group_by_labels, target_label):
        self.group_by_labels = group_by_labels
        self.target_label = target_label

    def fit(self, X, y=None):
        """Learn the per-group means and the overall mean from ``X``; returns ``self``."""
        self.group_mean = self._get_grouped(X).mean()
        self.overall_mean = X[self.target_label].mean()
        return self

    def transform(self, X):
        """Return a one-column frame (named ``target_label``) with missing values filled.

        Each missing value receives the fitted mean of its group. Rows whose group was
        not seen during ``fit``, whose group key is missing, or whose group had no
        observed values fall back to the overall fitted mean instead of raising a
        ``KeyError`` or staying NaN.
        """
        labels = (
            [self.group_by_labels]
            if isinstance(self.group_by_labels, str)
            else list(self.group_by_labels)
        )
        # A left join on the key columns looks up each row's fitted group mean while
        # preserving the row index and order of X.
        lookup = self.group_mean.rename("_group_mean")
        per_row_mean = X[labels].join(lookup, on=labels)["_group_mean"]
        filled = X[self.target_label].fillna(per_row_mean).fillna(self.overall_mean)
        return filled.to_frame()

    def _get_grouped(self, X):
        """Return the target column of ``X`` grouped by the configured label(s)."""
        return X.groupby(self.group_by_labels)[self.target_label]


# Age bin edges. The lowest edge is inclusive and the top bin is open-ended so that an
# age of exactly 0 or above 80 still gets a bin instead of silently becoming NaN.
# Ages in (0, 80] land in the same bins as with the edges [0, 16, 30, 50, 80].
_AGE_BIN_EDGES = [0, 16, 30, 50, np.inf]


def _bin_age(df):
    """Replace the ``Age`` column with its 1-based bin number (1..4) as a one-column frame."""
    bin_codes = pd.cut(df.Age, bins=_AGE_BIN_EDGES, labels=False, include_lowest=True)
    # `labels=False` yields 0-based integer codes; shift them to start at 1.
    return bin_codes.to_frame() + 1


# Age column: impute by (Pclass, Sex) group mean learned at fit time, then bin.
age_transformer = Pipeline(steps=[
    ("imputer", GroupbyMeanTransformer(group_by_labels="Pclass Sex".split(), target_label="Age")),
    ("bin", FunctionTransformer(_bin_age))
])

# Full feature pipeline. Each entry is (name, transformer, input columns); the outputs
# are concatenated in the order listed here, which `preprocessed_column_names` mirrors.
preprocessor = ColumnTransformer([
    # Identity transform: the column is forwarded unchanged.
    ("keep", FunctionTransformer(), ["Pclass"]),
    # Impute + one-hot encode the plain categorical columns.
    ("onehot", categorical_transformer, "Sex Embarked".split()),
    # Title extracted from the name, one-hot encoded.
    ("name", name_transformer, ["Name"]),
    # Mean-imputed, log-scaled fare.
    ("fare", fare_transformer, ["Fare"]),
    # Row-wise total of the two relative-count columns.
    ("family", family_transformer, ["Parch", "SibSp"]),
    # 0/1 indicator for a missing age (taken before imputation).
    ("missing", remember_missing_transformer, ["Age"]),
    # Group-mean imputed, binned age; Pclass and Sex are passed in as grouping keys.
    ("age", age_transformer, ["Age", "Pclass", "Sex"]),
])

# Hand-written labels for the preprocessor's output columns, in output order.
# Each one-hot group is sorted alphabetically.
preprocessed_column_names = [
    "Pclass",
    *sorted("Female Male".split()),
    *sorted("C Q S".split()),
    *sorted("Master Miss Mr Mrs Others".split()),
    "Fare",
    "Family",
    *sorted("Age_Missing".split()),
    "Age",
]

# Pre-processed data sample

In [6]:
# The preprocessor is fitted on the training split only and then applied, without
# refitting, to the validation and test data.
# This keeps information from valid/test from leaking into what is learned from train.
X_train_preproc = preprocessor.fit_transform(X_train)
X_valid_preproc = preprocessor.transform(X_valid)
X_test_preproc = preprocessor.transform(X_test)


In [7]:
# Preview the first preprocessed training rows, labelled with the hand-written column names.
pd.DataFrame(X_train_preproc, index=X_train.index, columns=preprocessed_column_names).head()

Unnamed: 0_level_0,Pclass,Female,Male,C,Q,S,Master,Miss,Mr,Mrs,Others,Fare,Family,Age_Missing,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
332,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.38439,0.0,0.0,3.0
734,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.639057,0.0,0.0,2.0
383,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.188856,0.0,0.0,3.0
705,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.180892,1.0,0.0,2.0
814,3.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,3.474293,6.0,0.0,1.0


# Define the model and the pipeline

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

def fit_model(model, X, y, **fit_kwargs):
    """Fit a preprocessing + model pipeline on ``X``/``y`` and return it.

    Parameters
    ----------
    model : estimator
        Final estimator placed after the preprocessing step (fitted in place).
    X, y : training features and target.
    **fit_kwargs :
        Forwarded to ``Pipeline.fit``; use the ``model__<name>`` form to reach the
        final estimator's ``fit``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``"preprocessor"`` and ``"model"``.
    """
    pipeline = Pipeline(steps=[
        # Each pipeline gets its own copy of the preprocessor. Embedding the shared
        # module-level instance would let a later fit silently refit the preprocessor
        # inside every pipeline built earlier.
        ("preprocessor", clone(preprocessor)),
        ("model", model)
    ])
    pipeline.fit(X, y, **fit_kwargs)
    return pipeline

def score_model(model, X, y):
    """Print and return the accuracy of ``model``'s predictions on ``X`` against ``y``."""
    accuracy = accuracy_score(y, model.predict(X))
    print("Accuracy:", accuracy)
    return accuracy

def score_cross_val(model, X, y):
    """Run 5-fold cross-validated accuracy, print per-fold and mean scores, return the mean."""
    fold_scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    print("Scores:", fold_scores)

    mean_score = fold_scores.mean()
    print("Avg score:", mean_score)
    return mean_score

# Random Forest based

In [9]:
from sklearn.ensemble import RandomForestClassifier

def get_random_forest_model(X_train, Y_train, X_valid, Y_valid, n_estimators=100):
    """Fit a seeded random-forest pipeline on the training data and score it on validation.

    Returns a ``(fitted_pipeline, validation_accuracy)`` tuple.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    fitted_pipeline = fit_model(forest, X_train, Y_train)
    validation_accuracy = score_model(fitted_pipeline, X_valid, Y_valid)
    return fitted_pipeline, validation_accuracy

# Fit the random-forest pipeline; the validation accuracy is printed, so the returned score is discarded.
rf_model, _ = get_random_forest_model(X_train, Y_train, X_valid, Y_valid)

Accuracy: 0.8435754189944135


In [10]:
# Cross-validate the random-forest pipeline on the full labelled set.
score_cross_val(rf_model, X_train_full, Y_train_full)

Scores: [0.79888268 0.79213483 0.83707865 0.7752809  0.81460674]
Avg score: 0.8035967610319503


0.8035967610319503

# XGBoost based

In [11]:
from xgboost import XGBClassifier

def get_xgb_model(X_train, Y_train, X_valid, Y_valid, n_estimators=1000, learning_rate=0.05):
    """Fit an XGBoost pipeline with early stopping and score it on the validation data.

    Parameters
    ----------
    X_train, Y_train : training features and target.
    X_valid, Y_valid : validation features and target; used both as the early-stopping
        evaluation set and for the reported accuracy, so that accuracy is optimistic.
    n_estimators : int
        Upper bound on boosting rounds (early stopping may use fewer).
    learning_rate : float
        Boosting learning rate.

    Returns
    -------
    tuple
        ``(fitted_pipeline, validation_accuracy)``.
    """
    # The booster is trained on preprocessed features, so the evaluation set must be
    # preprocessed the same way. It is derived from the X_train/X_valid arguments
    # instead of a notebook global, so features and labels always correspond to the
    # data actually passed in.
    eval_features = clone(preprocessor).fit(X_train, Y_train).transform(X_valid)

    model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, n_jobs=4)
    # NOTE(review): newer xgboost releases expect `early_stopping_rounds` on the
    # constructor rather than in fit() - confirm against the installed version.
    model = fit_model(
        model,
        X_train,
        Y_train,
        model__early_stopping_rounds=5,
        model__eval_set=[(eval_features, Y_valid)],
        model__verbose=False,
    )
    score = score_model(model, X_valid, Y_valid)
    return model, score

# Fit the XGBoost pipeline; the validation accuracy is printed, so the returned score is discarded.
xgb_model, _ = get_xgb_model(X_train, Y_train, X_valid, Y_valid)

Accuracy: 0.8491620111731844


In [12]:
# Cross-validate the XGBoost pipeline on the full labelled set.
score_cross_val(xgb_model, X_train_full, Y_train_full)

Scores: [0.80446927 0.79775281 0.85393258 0.81460674 0.84831461]
Avg score: 0.8238152030632101


0.8238152030632101

# Predict

In [13]:
def store_predictions(model, submission_name):
    """Predict on the notebook-level ``X_test`` and write a submission CSV.

    Prints the first 100 predictions, then writes a ``Survived`` column indexed like
    ``test_data`` to ``/kaggle/working/<submission_name>_submission.csv``.
    """
    predicted = model.predict(X_test)
    print(f"{submission_name}:\n{predicted[:100]}...")

    submission = pd.DataFrame({"Survived": predicted}, index=test_data.index)
    submission.to_csv(f"/kaggle/working/{submission_name}_submission.csv", index=True)

In [14]:
# Write one submission file per fitted model.
store_predictions(rf_model, "rf")
store_predictions(xgb_model, "xgb")

rf:
[0 0 0 0 1 0 1 0 1 0 0 1 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 0 1 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0]...
xgb:
[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0
 0 0 1 0 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 0 1 0 0 0]...
