# Setup

In [1]:
# Some code taken from https://github.com/ageron/handson-ml2

import sys
assert sys.version_info >= (3, 8)
import os
import re


def _version_tuple(version):
    """Return the leading dotted-numeric part of a version string as a tuple of ints.

    Anything after the numeric release segment (e.g. "rc1", ".dev0", "+local")
    is ignored, so "1.19.1" -> (1, 19, 1) and "0.24.dev0" -> (0, 24).
    An empty tuple is returned when the string does not start with a digit.
    """
    release = re.match(r"\d+(?:\.\d+)*", version)
    if release is None:
        return ()
    return tuple(int(part) for part in release.group().split("."))


# Versions are compared as integer tuples, not strings: compared as text,
# "0.9.0" >= "0.10.1" is True (because "9" > "1"), which would let an
# outdated library slip through the check.
import sklearn
assert _version_tuple(sklearn.__version__) >= (0, 23, 1)

import numpy as np
assert _version_tuple(np.__version__) >= (1, 19, 1)

import seaborn as sns
assert _version_tuple(sns.__version__) >= (0, 10, 1)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Larger fonts for axis labels and tick labels on every figure in the notebook.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Figures are written to <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>.
PROJECT_ROOT_DIR, CHAPTER_ID = ".", "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the folder up front (no error if it already exists).
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    fig_id        -- file name without extension (also echoed to stdout)
    tight_layout  -- when true, call plt.tight_layout() before saving
    fig_extension -- file extension, also passed to savefig as the format
    resolution    -- dpi passed to savefig
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Load data

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')  

In [3]:
import pandas as pd
# Compare (major, minor) as integers: compared as plain strings,
# "1.10.0" >= "1.9.0" is False and "0.9.0" >= "0.10.0" is True.
PANDAS_VERSION = tuple(int(part) for part in pd.__version__.split(".")[:2])
assert PANDAS_VERSION >= (1, 1)

def load_training_data():
    """Read the Titanic CSV that the model is trained on and return it as a DataFrame."""
    return pd.read_csv('../data/titanic/train.csv')

def load_testing_data():
    """Read the Titanic CSV used only for the final submission.

    Unlike the training file, this one does not contain the 'Survived' column.
    """
    return pd.read_csv('../data/titanic/test.csv')

In [4]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical from run to run.
train_set, test_set = train_test_split(
    load_training_data(),
    random_state=42,
    test_size=0.2,
)
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
331,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
733,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
382,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
704,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
813,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [5]:
# Separate the target column from the predictors for the training split.
y_train = train_set['Survived']
X_train = train_set.drop(columns='Survived')

In [6]:
# Same separation for the held-out split.
y_test = test_set['Survived']
X_test = test_set.drop(columns='Survived')

In [7]:
def fare_per_person(df):
    """Split each ticket's fare evenly among the rows that share that ticket.

    Expects a DataFrame with 'Ticket' and 'Fare' columns. Rows are grouped by
    'Ticket' within ``df`` itself, and each row's 'Fare' is divided by the
    number of rows in its group. Returns the result as a one-column DataFrame
    with the same index as ``df``.
    """
    fares_by_ticket = df.groupby('Ticket')['Fare']
    shared_fare = fares_by_ticket.transform(lambda fares: fares / len(fares))
    return shared_fare.to_frame()

In [8]:
# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
cat_mode_transformer = Pipeline([
    ('missing_to_mode', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder()),
])

# Numeric columns: fill gaps with the median, then apply an unstandardized
# Box-Cox power transform.
num_median_transformer = Pipeline([
    ('missing_to_median', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(standardize=False, method='box-cox')),
])

# Fare: convert to a per-person fare, then replace first NaNs and then exact
# zeros with the median before the Box-Cox transform is applied.
fare_per_person_transformer = Pipeline([
    ('per_person', FunctionTransformer(fare_per_person)),
    ('nan_to_median', SimpleImputer(strategy='median')),
    ('zero_to_median', SimpleImputer(strategy='median', missing_values=0.0)),
    ('power', PowerTransformer(standardize=False, method='box-cox')),
])

In [9]:
# Route each raw column (or column pair) to its preprocessing pipeline.
# 'PassengerId' is deliberately not routed, so it is not passed to the model.
preprocessor = ColumnTransformer([
    ('pclass', cat_mode_transformer, ['Pclass']),
    ('sex', cat_mode_transformer, ['Sex']),
    ('age', num_median_transformer, ['Age']),
    ('fare', fare_per_person_transformer, ['Ticket', 'Fare']),
    ('embarked', cat_mode_transformer, ['Embarked']),
])

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [33]:
# Full model: preprocessing followed by a random forest. The forest is seeded
# (same seed as the train/test split) so that the fit, and therefore the
# reported score and the submitted predictions, are repeatable across runs.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [34]:
# Fit the preprocessing steps and the classifier on the training split.
clf.fit(X_train, y_train)

In [35]:
# Score the fitted pipeline on the held-out split (rows it was not fitted on).
clf.score(X_test, y_test)

0.8212290502793296

In [39]:
# Train on all the training data before predicting the final testing data
train_set_final = load_training_data()
X_train_final = train_set_final.drop('Survived', axis=1)
y_train_final = train_set_final.Survived
# Refit the pipeline on every labelled row. Without this call the final
# predictions would still come from the model fitted on the 80% training
# split, and X_train_final / y_train_final would never be used.
clf.fit(X_train_final, y_train_final)

In [40]:
# Load the file to predict on for the final submission.
final_test_set = load_testing_data()

In [41]:
# Build the submission table: one row per passenger in final_test_set with the
# predicted 'Survived' label as an integer. Building the frame from a dict
# attaches the predictions to the PassengerId rows by position, so the result
# does not depend on two separately constructed frames sharing the same index.
res = pd.DataFrame({
    'PassengerId': final_test_set['PassengerId'],
    'Survived': clf.predict(final_test_set).astype('int'),
})
res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [42]:
# Write the predictions to CSV without the DataFrame index column.
res.to_csv('titanic_out.csv', index=False)