## Load Modules/Data and Show Initial Tables

In [1]:
# Imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import re

sns.set(style='ticks', color_codes=True) 

from os.path import isfile

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import KNNImputer

# Model Imports
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Load the datasets
root_path = './datasets/titanic/'

train_path = root_path + 'train.csv'
test_path = root_path + 'test.csv'

paths = [train_path, test_path]

for path in paths:
    if not isfile(path):
        print(f"Dataset {path} not found. Please check that the dataset exists and the path is correct.")
        
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
    
def show_tables(n=10):
    display(train.head(n))
    display(test.head(n))
    
# Show the tables
show_tables()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


## Cleanup Data

### Drop non-essential and sparse data features

I had some duplicate checking here earlier, but I removed it because there don't appear to be duplicates in the data. That could be from clerical errors, but my results are pretty good.

In [2]:
# Name and ID are just useless identifiers. Name gives us some social class info so let's engineer it a bit.
# Ticket and Cabin might be useful if there was a correlation with cabin position and 
# survivability but they're pretty sparse. Fare should also correlate extremely well with these, 
# so I don't think they're necessary at all.
# Embarked is superfluous as all three classes of passenger may embark from any port.

useless = ['PassengerId', 'Ticket', 'Cabin', 'Embarked']

# Extract Passenger IDs
train_ids = pd.DataFrame(train['PassengerId'])
test_ids = pd.DataFrame(test['PassengerId'])

print(train_ids)

train.drop(columns=useless, inplace=True)
test.drop(columns=useless, inplace=True)

# Show the adjusted tables
show_tables()

# Show types
display(train.dtypes)
display(test.dtypes)

     PassengerId
0              1
1              2
2              3
3              4
4              5
..           ...
886          887
887          888
888          889
889          890
890          891

[891 rows x 1 columns]


Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05
5,0,3,"Moran, Mr. James",male,,0,0,8.4583
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,21.075
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,11.1333
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,30.0708


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875
5,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,9.225
6,3,"Connolly, Miss. Kate",female,30.0,0,0,7.6292
7,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,29.0
8,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,7.2292
9,3,"Davies, Mr. John Samuel",male,21.0,2,0,24.15


Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
dtype: object

Pclass      int64
Name       object
Sex        object
Age       float64
SibSp       int64
Parch       int64
Fare      float64
dtype: object

### Engineer Name Data

Gonna get the titles out of the name data (e.g. 'Mr.' 'Miss.', 'Col.', etc.)

In [3]:
def get_titles(name):
    title_exp = r'[A-Za-z \-\']+,[A-Za-z\s]*\s(?P<Title>[A-Za-z]+\.)\s+.+'
    title_exp = re.compile(title_exp)

    match = title_exp.match(name)
    if match:
        title = match.group('Title')        
        return title
    
    else:
        print(name)
        return ''

# Extract titles into
train['Title'] = train['Name'].apply(get_titles)
test['Title'] = test['Name'].apply(get_titles)

train.drop(columns=['Name'], inplace=True)
test.drop(columns=['Name'], inplace=True)

In [4]:
# Check for nulls
display("Train Nulls?", train.isnull().any())
display("Test Nulls?", test.isnull().any())

'Train Nulls?'

Survived    False
Pclass      False
Sex         False
Age          True
SibSp       False
Parch       False
Fare        False
Title       False
dtype: bool

'Test Nulls?'

Pclass    False
Sex       False
Age        True
SibSp     False
Parch     False
Fare       True
Title     False
dtype: bool

In [5]:
# Found nulls in train['Age'], test['Age'], and test['Fare']
display("Train Nulls")
display(train[train['Age'].isnull()].head())

display("Test Nulls")
display(test[test['Age'].isnull()].head())
display(test[test['Fare'].isnull()].head())

'Train Nulls'

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title
5,0,3,male,,0,0,8.4583,Mr.
17,1,2,male,,0,0,13.0,Mr.
19,1,3,female,,0,0,7.225,Mrs.
26,0,3,male,,0,0,7.225,Mr.
28,1,3,female,,0,0,7.8792,Miss.


'Test Nulls'

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title
10,3,male,,0,0,7.8958,Mr.
22,1,female,,0,0,31.6833,Mrs.
29,3,male,,2,0,21.6792,Mr.
33,3,female,,1,2,23.45,Mrs.
36,3,female,,0,0,8.05,Miss.


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title
152,3,male,60.5,0,0,,Mr.


### Discretize data

Only thing that needs to be discretized after dropping columns is 'Sex'

In [6]:
discretize = ['Sex', 'Title']

# Taken from module 3 notebook
def encode_onehot(_df, f):
    _df2 = pd.get_dummies(_df[f], prefix='', prefix_sep='').max(level=0, axis=1).add_prefix(f+' - ')
    df3 = pd.concat([_df, _df2], axis=1)
    df3 = df3.drop([f], axis=1)
    return df3

for dis in discretize:
    if dis in train:
        train = encode_onehot(train, dis)
        test = encode_onehot(test, dis)

In [7]:
# Test df now is missing columns

# Get the title columns
titles = list(train.columns) + list(test.columns)
titles = list(set(titles))
titles = [title for title in titles if title.startswith('Title')]
print(titles)

def add_titles(df, titles=titles):
    for title in titles:
        if title not in df.columns:
            df[title] = [0]*len(df)
            
    return df.reindex(sorted(df.columns), axis=1)

            
train = add_titles(train)
test = add_titles(test)
show_tables()

['Title - Capt.', 'Title - Major.', 'Title - Master.', 'Title - Col.', 'Title - Don.', 'Title - Mlle.', 'Title - Dr.', 'Title - Rev.', 'Title - Ms.', 'Title - Mrs.', 'Title - Sir.', 'Title - Mme.', 'Title - Dona.', 'Title - Jonkheer.', 'Title - Mr.', 'Title - Lady.', 'Title - Miss.', 'Title - Countess.']


Unnamed: 0,Age,Fare,Parch,Pclass,Sex - female,Sex - male,SibSp,Survived,Title - Capt.,Title - Col.,...,Title - Major.,Title - Master.,Title - Miss.,Title - Mlle.,Title - Mme.,Title - Mr.,Title - Mrs.,Title - Ms.,Title - Rev.,Title - Sir.
0,22.0,7.25,0,3,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,38.0,71.2833,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,26.0,7.925,0,3,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,35.0,53.1,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,35.0,8.05,0,3,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,,8.4583,0,3,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,54.0,51.8625,0,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,2.0,21.075,1,3,0,1,3,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,27.0,11.1333,2,3,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
9,14.0,30.0708,0,2,1,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0


Unnamed: 0,Age,Fare,Parch,Pclass,Sex - female,Sex - male,SibSp,Title - Capt.,Title - Col.,Title - Countess.,...,Title - Major.,Title - Master.,Title - Miss.,Title - Mlle.,Title - Mme.,Title - Mr.,Title - Mrs.,Title - Ms.,Title - Rev.,Title - Sir.
0,34.5,7.8292,0,3,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,47.0,7.0,0,3,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,62.0,9.6875,0,2,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,27.0,8.6625,0,3,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,22.0,12.2875,1,3,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,14.0,9.225,0,3,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,30.0,7.6292,0,3,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,26.0,29.0,1,2,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,18.0,7.2292,0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,21.0,24.15,0,3,0,1,2,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Impute missing values with k-Nearest Neighbors Imputer

In [8]:
# Impute nulls with KNN imputation
imputer = KNNImputer()

train_knn = imputer.fit_transform(train)
test_knn = imputer.fit_transform(test)

train_knn = pd.DataFrame(train_knn)
test_knn = pd.DataFrame(test_knn)

train_knn.columns = train.columns
test_knn.columns = test.columns

# check our work
display(train_knn.isnull().any())
display(test_knn.isnull().any())

# Show the current state of the table
display(train_knn.head(10))
display(test_knn.head(10))

Age                  False
Fare                 False
Parch                False
Pclass               False
Sex - female         False
Sex - male           False
SibSp                False
Survived             False
Title - Capt.        False
Title - Col.         False
Title - Countess.    False
Title - Don.         False
Title - Dona.        False
Title - Dr.          False
Title - Jonkheer.    False
Title - Lady.        False
Title - Major.       False
Title - Master.      False
Title - Miss.        False
Title - Mlle.        False
Title - Mme.         False
Title - Mr.          False
Title - Mrs.         False
Title - Ms.          False
Title - Rev.         False
Title - Sir.         False
dtype: bool

Age                  False
Fare                 False
Parch                False
Pclass               False
Sex - female         False
Sex - male           False
SibSp                False
Title - Capt.        False
Title - Col.         False
Title - Countess.    False
Title - Don.         False
Title - Dona.        False
Title - Dr.          False
Title - Jonkheer.    False
Title - Lady.        False
Title - Major.       False
Title - Master.      False
Title - Miss.        False
Title - Mlle.        False
Title - Mme.         False
Title - Mr.          False
Title - Mrs.         False
Title - Ms.          False
Title - Rev.         False
Title - Sir.         False
dtype: bool

Unnamed: 0,Age,Fare,Parch,Pclass,Sex - female,Sex - male,SibSp,Survived,Title - Capt.,Title - Col.,...,Title - Major.,Title - Master.,Title - Miss.,Title - Mlle.,Title - Mme.,Title - Mr.,Title - Mrs.,Title - Ms.,Title - Rev.,Title - Sir.
0,22.0,7.25,0.0,3.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,38.0,71.2833,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,26.0,7.925,0.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35.0,53.1,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,35.0,8.05,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,25.0,8.4583,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,54.0,51.8625,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,2.0,21.075,1.0,3.0,0.0,1.0,3.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,27.0,11.1333,2.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,14.0,30.0708,0.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,Age,Fare,Parch,Pclass,Sex - female,Sex - male,SibSp,Title - Capt.,Title - Col.,Title - Countess.,...,Title - Major.,Title - Master.,Title - Miss.,Title - Mlle.,Title - Mme.,Title - Mr.,Title - Mrs.,Title - Ms.,Title - Rev.,Title - Sir.
0,34.5,7.8292,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,47.0,7.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,62.0,9.6875,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,27.0,8.6625,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,22.0,12.2875,1.0,3.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,14.0,9.225,0.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,30.0,7.6292,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,26.0,29.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,18.0,7.2292,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,21.0,24.15,0.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Construct and Run Pipeline

In [15]:
# Prepare training set
x = train_knn.loc[:, train_knn.columns != 'Survived'].values
y = train_knn.loc[:, train_knn.columns == 'Survived'].values.ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.7, random_state=None)

# Model Declaration
models = [RandomForestClassifier(n_estimators=100), DecisionTreeClassifier(), 
          SVC(), GaussianNB()]
accuracies = []
preds = {}

# Construct Pipeline and Train/Test
for model in models:
    pipe_lr = make_pipeline(StandardScaler(), model)
    pipe_lr.fit(x_train, y_train)
    y_pred = pipe_lr.predict(x_test).astype(np.int)

    accuracies.append((type(model), accuracy_score(y_test, y_pred)))
    preds[type(model)] = pipe_lr
    
accuracies.sort(key=lambda x: x[1], reverse=True)
display(accuracies)

y_pred = preds[accuracies[0][0]].predict(test_knn)

[(sklearn.svm._classes.SVC, 0.8333333333333334),
 (sklearn.ensemble._forest.RandomForestClassifier, 0.8108974358974359),
 (sklearn.tree._classes.DecisionTreeClassifier, 0.7355769230769231),
 (sklearn.naive_bayes.GaussianNB, 0.41346153846153844)]

## Export for Kaggle

In [10]:
def save_preds(_fn, _y_pred, _df):
    import csv
    with open(_fn, 'w') as fout:
        writer = csv.writer(fout, delimiter=',', lineterminator='\n')
        writer.writerow(['PassengerId', 'Survived'])
        for y, passengerId in zip(_y_pred, _df['PassengerId']):
            writer.writerow([passengerId, y])

save_preds('predictions_mcelhenney.csv', y_pred, test_ids)