In [142]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%run models.py

In [157]:
train_in = './data/titanic_train_clean.csv'
train_out = './data/titanic_train_engineered.csv'
test_in = './data/titanic_test_clean.csv'
test_out = './data/titanic_test_engineered.csv'
log = './logs/log.csv'

In [158]:
title_codes = {
    'Mr': 1,       # General adult male
    'Mrs': 2,      # General adult female
    'Miss': 3,     # General young female
    'Master': 4,   # General young male
    'Don': 5,      # Noble male
    'Rev': 6,      # Professional
    'Dr': 6,       # Professional
    'Mme': 2,      # General adult female
    'Ms': 2,       # General adult female
    'Major': 6,    # Professional
    'Lady': 7,     # Noble female
    'Sir' : 5,     # Noble male
    'Mlle': 3,     # General young female
    'Col': 6,      # Professional
    'Capt': 6,     # Professional
    'Countess': 7, # Noble female
    'Jonkheer': 5, # Noble male
    'Dona': 7      # Noble Female
}

In [159]:
# Check the df

infile = train_in
outfile = train_out

df = pd.read_csv(infile)
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,2.0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,1.0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,1.0


In [160]:
# Basic variable creation

df['family_size'] = df['sibsp'] + df['parch'] + 1
df.drop('cabin', axis=1, inplace=True)

In [161]:
# Creation of title variable

def title(row):
    title_search = re.search(' ([A-Za-z]+)\.', row['name'])
    # If a title exists, extract and return it
    if title_search:
        return title_search.group(1)
    return ""

df['title'] = df.apply(title, axis=1)
df['title'].replace(title_codes, inplace=True)

In [162]:
# Inferring missing age values from title

def infer_age(row):
    '''
    Infers the age for nan values
    '''
    if(pd.isnull(row['age'])):
        
        if row['title'] == 1:    # Mr
            return 30
        elif row['title']  == 2:  # Mrs
            return 35
        elif row['title']  == 3:  # Miss
            return 21
        elif row['title']  == 4:  # Master
            return 4
        elif row['title']  == 5:  # Noble male
            return 40
        elif row['title']  == 6:  # Professional
            return 50
        elif row['title']  == 7:  # Noble female
            return 40

    else:
        return row['age']

df['age'] = df.apply(infer_age, axis=1)

In [163]:
# Exploring Tickets

# Only 27% of fares are unique
# Some ticket values are also grouped
# Fare value tends to identify homogeneous groups some of which may share ethnicity based upon their surname
# Language may have been a barrier to lower class passengers being directed to and getting aboard lifeboats
# Could be potential in treating it as a categorical variable and clustering to reduce the dimensions.

# Getting the true cost of individual tickets
ticket_split = df['ticket'].str.split().tolist()
ticket_number = [x[-1] for x in ticket_split]
df['ticket_number'] = ticket_number
tb_ticket_counts = df[['passengerid', 'ticket_number']].groupby('ticket_number').count()
tb_ticket_counts = tb_ticket_counts.reset_index().rename(columns={'passengerid':'group_size'})
df = pd.merge(
    left=df,
    right=tb_ticket_counts,
    how='left'
)

In [164]:
# Building a categorical variable for family size

def family_sizer(row):
    if row['family_size'] == 1:
        return 1
    elif row['family_size'] < 5:
        return 2
    else:
        return 3
    
df['family_size_cat'] = df.apply(family_sizer, axis=1)

In [199]:
# Family size survival
df['passenger'] = 1
tab_fam = df[['family_size', 'survived', 'passenger']].groupby('family_size').sum()
tab_fam['rate'] = tab_fam['survived'] / tab_fam['passenger']
tab_fam

Unnamed: 0_level_0,survived,passenger,rate
family_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,163,537,0.303538
2,89,161,0.552795
3,59,102,0.578431
4,21,29,0.724138
5,3,15,0.2
6,3,22,0.136364
7,4,12,0.333333
8,0,6,0.0
11,0,7,0.0


In [200]:
# Group size survival

df['passenger'] = 1
tab_group = df[['group_size', 'survived', 'passenger']].groupby('group_size').sum()
tab_group['rate'] = tab_group['survived'] / tab_group['passenger']
tab_group

Unnamed: 0_level_0,survived,passenger,rate
group_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,161,544,0.295956
2,107,188,0.569149
3,47,66,0.712121
4,22,44,0.5
5,0,10,0.0
6,0,18,0.0
7,5,21,0.238095


In [165]:
# Building a categorical variable for group size

def group_sizer(row):
    if row['group_size'] == 1:
        return 1
    elif row['group_size'] < 5:
        return 2
    else:
        return 3
    
df['group_size_cat'] = df.apply(group_sizer, axis=1)

In [166]:
# One hot encoding sex, title & Embarked

df_sex = pd.get_dummies(df['sex'])
df_sex.columns = ['sex_{}'.format(x) for x in df_sex.columns]

df_title = pd.get_dummies(df['title'])
df_title.columns = ['title_{}'.format(x) for x in df_title.columns]

df_embarked = pd.get_dummies(df['embarked'])
df_embarked.columns = ['embarked_{}'.format(int(x)) for x in df_embarked.columns]

df_family = pd.get_dummies(df['family_size_cat'])
df_family.columns = ['family_size_{}'.format(x) for x in df_family.columns]

df_group = pd.get_dummies(df['group_size_cat'])
df_group.columns = ['group_size_{}'.format(x) for x in df_group.columns]

df = pd.concat([df, df_sex, df_title, df_embarked, df_family, df_group], axis=1)
# df.drop(['sex', 'title', 'embarked', 'family_size_cat', 'group_size_cat'], axis=1, inplace=True)
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,...,title_7,embarked_1,embarked_2,embarked_3,family_size_1,family_size_2,family_size_3,group_size_1,group_size_2,group_size_3
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,...,0,1,0,0,0,1,0,1,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,...,0,0,1,0,0,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,...,0,1,0,0,1,0,0,1,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,...,0,1,0,0,0,1,0,0,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,...,0,1,0,0,1,0,0,1,0,0


In [167]:
# Scaling
from sklearn.preprocessing import MinMaxScaler

df['fare'].fillna(df['fare'].median(), inplace=True)

df['true_fare'] = round(df['fare'] / df['group_size'],2)
scaler = MinMaxScaler()
fare_array = df['true_fare'].as_matrix()
age_array = df['age'].as_matrix()
fare_scaled = scaler.fit_transform(fare_array.reshape(-1, 1))
age_scaled = scaler.fit_transform(age_array.reshape(-1, 1))
df['true_fare_scaled'] = fare_scaled
df['age_scaled'] = age_scaled

Unnamed: 0_level_0,survived,passenger,rate
family_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,163,537,0.303538
2,89,161,0.552795
3,59,102,0.578431
4,21,29,0.724138
5,3,15,0.2
6,3,22,0.136364
7,4,12,0.333333
8,0,6,0.0
11,0,7,0.0


In [197]:
df['passenger'] = 1
tab_group = df[['group_size', 'survived', 'passenger']].groupby('group_size').sum()
tab_group['rate'] = tab_group['survived'] / tab_group['passenger']
tab_group

Unnamed: 0_level_0,survived,passenger,rate
group_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,161,544,0.295956
2,107,188,0.569149
3,47,66,0.712121
4,22,44,0.5
5,0,10,0.0
6,0,18,0.0
7,5,21,0.238095


In [169]:
# Model Parameters

models = [{
    'name': 'Logistic Regression',
    'model': LogisticRegression()
}, {
    'name': 'SVC',
    'model': SVC()
}, {
    'name': 'Linear SVC',
    'model': LinearSVC()
}, {
    'name': 'Naive Bayes',
    'model': GaussianNB()
}, {
    'name': 'Random Forest',
    'model': RandomForestClassifier()
}, {
    'name': 'Gradient Boosting',
    'model': GradientBoostingClassifier(max_depth=15)
}, {
    'name': 'Decision Tree',
    'model': DecisionTreeClassifier()
}, {
    'name': 'K Nearest Neighbours',
    'model': KNeighborsClassifier(n_neighbors=4),
}, {
    'name': 'MLP Classifier',
    'model': MLPClassifier()
}]

In [183]:
pd.set_option('display.max_colwidth', -1)

In [186]:
# Testing the Features
feature_names = [
    'pclass', 'age_scaled', 'true_fare_scaled', 
#     'embarked_1', 'embarked_2', 'embarked_3',
    'family_size', 'group_size',
    'family_size_1', 'family_size_2', 'family_size_3',
    'group_size_1', 'group_size_2', 'group_size_3',
    'sex', 
    'sex_0', 'sex_1',
    'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7'
]

features = df[feature_names]
label = df[['survived']]
%run models.py
test_models(models, feature_names, features, label, 5, './logs/log.csv')
log = pd.read_csv('./logs/log.csv')
log.sort_values('performance', ascending=False)

Logistic Regression Performance: 82.38%
SVC Performance: 83.62%
Linear SVC Performance: 82.38%
Naive Bayes Performance: 81.7%
Random Forest Performance: 79.68%
Gradient Boosting Performance: 79.12%
Decision Tree Performance: 77.32%
K Nearest Neighbours Performance: 81.27%
MPL Classifier Performance: 82.83%


Unnamed: 0,cv,features,model,performance
1,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",SVC,83.62
8,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",MPL Classifier,82.83
0,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",Logistic Regression,82.38
2,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",Linear SVC,82.38
3,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",Naive Bayes,81.7
7,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",K Nearest Neighbours,81.27
4,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",Random Forest,79.68
5,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",Gradient Boosting,79.12
6,5,"['pclass', 'age_scaled', 'true_fare_scaled', 'family_size', 'group_size', 'family_size_1', 'family_size_2', 'family_size_3', 'group_size_1', 'group_size_2', 'group_size_3', 'sex', 'sex_0', 'sex_1', 'title_1', 'title_2', 'title_3', 'title_4', 'title_5', 'title_6', 'title_7']",Decision Tree,77.32
