## Load Modules/Data and Show Initial Tables

In [1]:
# Imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

sns.set(style='ticks', color_codes=True) 

from os.path import isfile

from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Model Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the datasets
root_path = './datasets/titanic/'

train_path = root_path + 'train.csv'
test_path = root_path + 'test.csv'

paths = [train_path, test_path]

for path in paths:
    if not isfile(path):
        print(f"Dataset {path} not found. Please check that the dataset exists and the path is correct.")
        
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
    
def show_tables(n=10):
    display(train.head(n))
    display(test.head(n))
    
# Show the tables
show_tables()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


## Cleanup Data

### Drop non-essential and sparse data features

I had some duplicate checking here earlier, but I removed it because there don't appear to be duplicates in the data. That could be from clerical errors, but my results are pretty good.

In [2]:
# Name and ID are just useless identifiers. Name gives us some social class info so let's engineer it a bit.
# Ticket and Cabin might be useful if there was a correlation with cabin position and 
# survivability but they're pretty sparse. Fare should also correlate extremely well with these, 
# so I don't think they're necessary at all.
# Embarked is superfluous as all three classes of passenger may embark from any port.

useless = ['PassengerId', 'Ticket', 'Cabin']

# Extract Passenger IDs
train_ids = pd.DataFrame(train['PassengerId'])
test_ids = pd.DataFrame(test['PassengerId'])

train.drop(columns=useless, inplace=True)
test.drop(columns=useless, inplace=True)

# Show the adjusted tables
show_tables()

# Show types
display(train.dtypes)
display(test.dtypes)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S
5,0,3,"Moran, Mr. James",male,,0,0,8.4583,Q
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,S
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.075,S
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,11.1333,S
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,30.0708,C


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S
5,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,9.225,S
6,3,"Connolly, Miss. Kate",female,30.0,0,0,7.6292,Q
7,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,29.0,S
8,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,7.2292,C
9,3,"Davies, Mr. John Samuel",male,21.0,2,0,24.15,S


Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

### Reduce Name Data

Gonna get the titles out of the name data (e.g. 'Mr.' 'Miss.', 'Col.', etc.)

In [3]:
french_dict = {'Mme.': 'Mrs.', 'Mlle.': 'Miss.'}

regal_titles = ['Don.', 'Countess.', 'Capt.', 'Jonkheer.', 'Dona.', 'Col.', 'Lady.', 'Major.', 'Sir.']

def get_titles(name):
    title_exp = r'[A-Za-z \-\']+,[A-Za-z\s]*\s(?P<Title>[A-Za-z]+\.)\s+.+'
    title_exp = re.compile(title_exp)

    match = title_exp.match(name)
    if match:
        title = match.group('Title')

        # Apply French dictionary for those passengers
        title = title if title not in french_dict else french_dict[title]
        
        # Collapse Lordship/Military into one class
        title = title if title not in regal_titles else 'Upper.'
        
        return title
    
    else:
        print(name)
        return ''

# Extract titles into
train['Title'] = train['Name'].apply(get_titles)
test['Title'] = test['Name'].apply(get_titles)

train.drop(columns=['Name'], inplace=True)
test.drop(columns=['Name'], inplace=True)

### Check for Missing Data

In [4]:
# Check for nulls
display("Train Nulls?", train.isnull().any())
display("Test Nulls?", test.isnull().any())

'Train Nulls?'

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Embarked     True
Title       False
dtype: bool

'Test Nulls?'

Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare         True
Embarked    False
Title       False
dtype: bool

In [5]:
# Found nulls in train['Age'], test['Age'], and test['Fare']
display("Train Nulls")
display(train[train['Age'].isnull()].head())

display("Test Nulls")
display(test[test['Age'].isnull()].head())
display(test[test['Fare'].isnull()].head())

'Train Nulls'

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
5,0,3,male,,0,0,8.4583,Q,Mr.
17,1,2,male,,0,0,13.0,S,Mr.
19,1,3,female,,0,0,7.225,C,Mrs.
26,0,3,male,,0,0,7.225,C,Mr.
28,1,3,female,,0,0,7.8792,Q,Miss.


'Test Nulls'

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
10,3,male,,0,0,7.8958,S,Mr.
22,1,female,,0,0,31.6833,S,Mrs.
29,3,male,,2,0,21.6792,C,Mr.
33,3,female,,1,2,23.45,S,Mrs.
36,3,female,,0,0,8.05,S,Miss.


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
152,3,male,60.5,0,0,,S,Mr.


### Impute

In [6]:
# Impute by Title

def get_val_by_title(df, feat):
    title_val = {}
    for title in df['Title']:
        if title not in title_val:
            title_val[title] = df[df['Title'] == title][feat].mean()
            
    return title_val

def impute_val_by_title(df, feat):
    for title in df['Title']:
        df[feat] = df[feat].fillna(get_val_by_title(df, feat)[title])
    
impute_val_by_title(train, 'Age')
impute_val_by_title(test, 'Age')
impute_val_by_title(test, 'Fare')


### Discretize data

Only thing that needs to be discretized after dropping columns is 'Sex'

In [7]:
# Taken from module 3 notebook
def encode_onehot(df, f):
    df2 = pd.get_dummies(df[f], prefix='', prefix_sep='').max(level=0, axis=1).add_prefix(f+' - ')
    df3 = pd.concat([df, df2], axis=1)
    df3 = df3.drop([f], axis=1)
    return df3

discretize = ['Sex', 'Title', 'Embarked']

for dis in discretize:
    if dis in train:
        train = encode_onehot(train, dis)
        test = encode_onehot(test, dis)

In [8]:
# Test df now is missing columns, so let's cleanup the data a bit.

# Get the title columns
titles = list(train.columns) + list(test.columns)
titles = list(set(titles))
titles = [title for title in titles if title.startswith('Title')]
print(titles)

def add_titles(df, titles=titles):
    for title in titles:
        if title not in df.columns:
            df[title] = [0]*len(df)
            
    return df.reindex(sorted(df.columns), axis=1)

            
train = add_titles(train)
test = add_titles(test)
show_tables()

['Title - Mr.', 'Title - Mrs.', 'Title - Miss.', 'Title - Master.', 'Title - Dr.', 'Title - Ms.', 'Title - Upper.', 'Title - Rev.']


Unnamed: 0,Age,Embarked - C,Embarked - Q,Embarked - S,Fare,Parch,Pclass,Sex - female,Sex - male,SibSp,Survived,Title - Dr.,Title - Master.,Title - Miss.,Title - Mr.,Title - Mrs.,Title - Ms.,Title - Rev.,Title - Upper.
0,22.0,0,0,1,7.25,0,3,0,1,1,0,0,0,0,1,0,0,0,0
1,38.0,1,0,0,71.2833,0,1,1,0,1,1,0,0,0,0,1,0,0,0
2,26.0,0,0,1,7.925,0,3,1,0,0,1,0,0,1,0,0,0,0,0
3,35.0,0,0,1,53.1,0,1,1,0,1,1,0,0,0,0,1,0,0,0
4,35.0,0,0,1,8.05,0,3,0,1,0,0,0,0,0,1,0,0,0,0
5,32.36809,0,1,0,8.4583,0,3,0,1,0,0,0,0,0,1,0,0,0,0
6,54.0,0,0,1,51.8625,0,1,0,1,0,0,0,0,0,1,0,0,0,0
7,2.0,0,0,1,21.075,1,3,0,1,3,0,0,1,0,0,0,0,0,0
8,27.0,0,0,1,11.1333,2,3,1,0,0,1,0,0,0,0,1,0,0,0
9,14.0,1,0,0,30.0708,0,2,1,0,1,1,0,0,0,0,1,0,0,0


Unnamed: 0,Age,Embarked - C,Embarked - Q,Embarked - S,Fare,Parch,Pclass,Sex - female,Sex - male,SibSp,Title - Dr.,Title - Master.,Title - Miss.,Title - Mr.,Title - Mrs.,Title - Ms.,Title - Rev.,Title - Upper.
0,34.5,0,1,0,7.8292,0,3,0,1,0,0,0,0,1,0,0,0,0
1,47.0,0,0,1,7.0,0,3,1,0,1,0,0,0,0,1,0,0,0
2,62.0,0,1,0,9.6875,0,2,0,1,0,0,0,0,1,0,0,0,0
3,27.0,0,0,1,8.6625,0,3,0,1,0,0,0,0,1,0,0,0,0
4,22.0,0,0,1,12.2875,1,3,1,0,1,0,0,0,0,1,0,0,0
5,14.0,0,0,1,9.225,0,3,0,1,0,0,0,0,1,0,0,0,0
6,30.0,0,1,0,7.6292,0,3,1,0,0,0,0,1,0,0,0,0,0
7,26.0,0,0,1,29.0,1,2,0,1,1,0,0,0,1,0,0,0,0
8,18.0,1,0,0,7.2292,0,3,1,0,0,0,0,0,0,1,0,0,0
9,21.0,0,0,1,24.15,0,3,0,1,2,0,0,0,1,0,0,0,0


## Construct and Run Pipeline

Of the models tested, RBF SVC with default parameters runs best overall with a mean cross validated accuracy of around 83%

In [9]:
# Prepare training set
x = train.loc[:, train.columns != 'Survived'].values
y = train.loc[:, train.columns == 'Survived'].values.ravel()

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Model Declaration
models = [KNeighborsClassifier(), LogisticRegression(), GaussianNB(), 
          DecisionTreeClassifier(), RandomForestClassifier(), SVC()]
accuracies = []
p_lines = {}

# Construct Pipeline and Train/Test
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

for model in models:
    pipe_lr = make_pipeline(StandardScaler(), model)
    pipe_lr.fit(x, y)
    
    score = cross_val_score(pipe_lr, x, y, cv=kfold, scoring='accuracy')

    accuracies.append((type(model), np.mean(score)))
       
    p_lines[type(model)] = pipe_lr

# Sort by the mean of the accuracies
accuracies.sort(key=lambda x: np.mean(x[1]), reverse=True)
display(accuracies)

[(sklearn.svm._classes.SVC, 0.8271660424469414),
 (sklearn.linear_model._logistic.LogisticRegression, 0.8260424469413234),
 (sklearn.ensemble._forest.RandomForestClassifier, 0.8215355805243446),
 (sklearn.tree._classes.DecisionTreeClassifier, 0.8103121098626715),
 (sklearn.neighbors._classification.KNeighborsClassifier, 0.810287141073658),
 (sklearn.naive_bayes.GaussianNB, 0.7575280898876404)]

## Export for Kaggle

In [10]:
def save_preds(_fn, _y_pred, _df):
    import csv
    with open(_fn, 'w') as fout:
        writer = csv.writer(fout, delimiter=',', lineterminator='\n')
        writer.writerow(['PassengerId', 'Survived'])
        for y, passengerId in zip(_y_pred, _df['PassengerId']):
            writer.writerow([passengerId, y])
            
# Get the predicition with the best model
y_pred = p_lines[accuracies[0][0]].predict(test).astype(np.int)
# Save predictions
save_preds('predictions_mcelhenney.csv', y_pred, test_ids)