# Setup

In [1]:
import sklearn
assert sklearn.__version__ >= "0.23.1"

import numpy as np
assert np.__version__ >= "1.19.1"

import seaborn as sns
assert sns.__version__ >= "0.10.1"


# Load data

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(display='diagram')  

In [3]:
import pandas as pd
assert pd.__version__ >= "1.1.0"

# Model will be trained using this data.
def load_training_data():
    data = pd.read_csv('../data/titanic/train.csv')
    return data

# This data will only be used when submitting the final analysis.
# It also doesn't contain the Survived column.
def load_testing_data():
    data = pd.read_csv('../data/titanic/test.csv')
    return data

In [4]:
train_set, test_set = train_test_split(load_training_data(), test_size=0.2, random_state=1)
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
301,302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q
309,310,1,1,"Francatelli, Miss. Laura Mabel",female,30.0,0,0,PC 17485,56.9292,E36,C
516,517,1,2,"Lemore, Mrs. (Amelia Milley)",female,34.0,0,0,C.A. 34260,10.5,F33,S
120,121,0,2,"Hickman, Mr. Stanley George",male,21.0,2,0,S.O.C. 14879,73.5,,S
570,571,1,2,"Harris, Mr. George",male,62.0,0,0,S.W./PP 752,10.5,,S


In [5]:
X_train = train_set.drop('Survived', axis=1)
y_train = train_set.Survived
X_test = test_set.drop('Survived', axis=1)
y_test = test_set.Survived

In [6]:
# The output of this doesn't have any missing values
preprocessor1 = ColumnTransformer(
    transformers=[
        ('pclass',          'passthrough',                                    ['Pclass']),
        ('sex',             'passthrough',                                    ['Sex']),
        ('age',             SimpleImputer(strategy='median'),                 ['Age']),
        ('sibsp',           'passthrough',                                    ['SibSp']),
        ('parch',           'passthrough',                                    ['Parch']),
        ('ticket',          SimpleImputer(strategy='constant',fill_value=''), ['Ticket']),
        ('fare',            SimpleImputer(strategy='median'),                 ['Fare']),
        ('embarked',        SimpleImputer(strategy='most_frequent'),          ['Embarked']),
    ])
preprocessor1

In [7]:
# Some functions originally written for Pandas that are now getting an ndarray as input..

def age_class(arr):
    df = pd.DataFrame(arr)
    return pd.DataFrame(pd.cut(df.iloc[:,0], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
       labels=['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89',  '90-99']))

def is_alone_np(arr):
    return pd.DataFrame((arr[:,0]+arr[:,1]) == 0)

def fare_per_person(arr):
    df = pd.DataFrame(arr, columns=['Ticket','Fare'])
    return pd.DataFrame(df.groupby('Ticket')['Fare'].transform(lambda x: x / x.size))

In [8]:
# This calculates new variables
preprocessor2 = ColumnTransformer(
    transformers=[
        ('pclass',          'passthrough',                             [0]),
        ('sex',             'passthrough',                             [1]),
        ('age_class',       FunctionTransformer(func=age_class),       [2]),
        ('age',             'passthrough',                             [2]),
        ('is_alone',        FunctionTransformer(func=is_alone_np),     [3,4]),
        ('fare',            'passthrough',                             [6]),
        ('fare_per_person', FunctionTransformer(func=fare_per_person), [5,6]),
        ('embarked',        'passthrough',                             [7]),
    ])
preprocessor2

In [9]:
# This calculates new variables
preprocessor3 = ColumnTransformer(
    transformers=[
        ('pclass',          'passthrough',       [0]),
        ('sex',             OneHotEncoder(),     [1]),
        ('age_class',       OneHotEncoder(),     [2]),
        ('age',             'passthrough',       [3]),
        ('is_alone',        OneHotEncoder(),     [4]),
        ('fare',            'passthrough',       [5]),
        ('fare_per_person', 'passthrough',       [6]),
        ('embarked',        OneHotEncoder(),     [7]),
    ])
preprocessor3

In [10]:
# Just testing
pd.DataFrame(Pipeline(steps=[
    ('pp1', preprocessor1),
]).fit_transform(X_train)).sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7
462,3,female,28.0,0,0,349245,7.8958,S
56,3,female,29.0,1,0,2689,14.4583,C
393,2,female,30.0,0,0,250648,13.0,S
670,2,male,25.0,1,0,236853,26.0,S
151,2,female,2.0,1,1,26360,26.0,S
198,3,male,36.0,0,0,LINE,0.0,S
600,3,male,24.0,0,0,349209,7.4958,S
213,2,male,0.67,1,1,250649,14.5,S
353,2,male,36.0,0,0,SC/Paris 2163,12.875,C
609,3,male,27.0,0,0,349219,7.8958,S


In [11]:
# Just testing
pd.DataFrame(Pipeline(steps=[
    ('pp1', preprocessor1),
    ('pp2', preprocessor2),
]).fit_transform(X_train)).sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7
655,2,female,0-9,3,False,41.5792,13.8597,C
252,3,male,10-19,20,True,8.05,8.05,S
471,3,male,20-29,24,True,7.7958,7.7958,S
625,2,male,40-49,42,True,13.0,13.0,S
673,2,female,20-29,24,False,26.0,13.0,S
451,3,male,30-39,40,False,15.5,7.75,Q
431,3,male,10-19,16,True,9.2167,4.60835,S
528,1,male,40-49,45,True,26.55,26.55,S
376,3,male,20-29,28,True,7.7958,7.7958,S
169,1,female,20-29,30,True,31.0,15.5,C


In [12]:
# Just testing
pd.DataFrame(Pipeline(steps=[
    ('pp1', preprocessor1),
    ('pp2', preprocessor2),
    ('pp3', preprocessor3),
]).fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,3,0,1,0,0,1,0,0,0,0,0,29,1,0,23.25,11.625,0,1,0
1,1,1,0,0,0,1,0,0,0,0,0,30,0,1,56.9292,56.9292,1,0,0
2,2,1,0,0,0,0,1,0,0,0,0,34,0,1,10.5,10.5,0,0,1
3,2,0,1,0,0,1,0,0,0,0,0,21,1,0,73.5,18.375,0,0,1
4,2,0,1,0,0,0,0,0,0,1,0,62,0,1,10.5,10.5,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,3,0,1,0,1,0,0,0,0,0,0,19,0,1,7.65,7.65,0,0,1
708,3,1,0,0,0,0,1,0,0,0,0,30.5,0,1,7.75,7.75,0,1,0
709,2,0,1,0,0,1,0,0,0,0,0,21,0,1,73.5,18.375,0,0,1
710,3,1,0,0,0,1,0,0,0,0,0,29,0,1,7.55,7.55,0,0,1


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [14]:
clf = Pipeline(steps=[
    ('pp1', preprocessor1),
    ('pp2', preprocessor2),
    ('pp3', preprocessor3),
    ('classifier', RandomForestClassifier(n_estimators=500))
]) 

In [15]:
clf.fit(X_train, y_train)

In [16]:
clf.score(X_test, y_test)

0.7821229050279329

In [17]:
final_test_set = load_testing_data()

In [18]:
res = pd.concat([
    pd.DataFrame(final_test_set.PassengerId, columns=['PassengerId']),
    pd.DataFrame(clf.predict(final_test_set), columns=['Survived'])
    ], axis=1).astype({'Survived':'int'})
res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [19]:
res.to_csv('titanic_out.csv', index=False)