## 1. Preprocess Data

In [1]:
# Imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set(style='ticks', color_codes=True) 

from os.path import isfile

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

# Load the datasets
root_path = './datasets/titanic/'

train_path = root_path + 'train.csv'
test_path = root_path + 'test.csv'
target_path = root_path + 'gender_submission.csv'

paths = [train_path, test_path, target_path]

for path in paths:
    if not isfile(path):
        print(f"Dataset {path} not found. Please check that the dataset exists and the path is correct.")
        
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
test_tar = pd.read_csv(target_path)
    
def show_tables():
    display(train.head())
    display(test.head())
    display(test_tar.head())
    
# Show the tables
show_tables()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [2]:
# Drop columns from both training and testing sets

# Name is useless
train.drop(columns=['Name'], inplace=True)
test.drop(columns=['Name'], inplace=True)

# Ticket is useless?
train.drop(columns=['Ticket'], inplace=True)
test.drop(columns=['Ticket'], inplace=True)

# Cabin is sparse but if it was not it would probably be very useful
# Might try re-adding it later
train.drop(columns=['Cabin'], inplace=True)
test.drop(columns=['Cabin'], inplace=True)

# Embarked is probably not useful either
train.drop(columns=['Embarked'], inplace=True)
test.drop(columns=['Embarked'], inplace=True)

# Show the adjusted tables
show_tables()

# Show types
display(train.dtypes)
display(test.dtypes)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,0,3,male,22.0,1,0,7.25
1,2,1,1,female,38.0,1,0,71.2833
2,3,1,3,female,26.0,0,0,7.925
3,4,1,1,female,35.0,1,0,53.1
4,5,0,3,male,35.0,0,0,8.05


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
0,892,3,male,34.5,0,0,7.8292
1,893,3,female,47.0,1,0,7.0
2,894,2,male,62.0,0,0,9.6875
3,895,3,male,27.0,0,0,8.6625
4,896,3,female,22.0,1,1,12.2875


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


PassengerId      int64
Survived         int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
dtype: object

PassengerId      int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
dtype: object

In [3]:
# Adapted from module 3 notebook
def dup_check(df):
    df['duplicate'] = df.duplicated()

    if len(df[df['duplicate'] == True]) > 0:
        print(df[df['duplicate'] == True])

        dups = df[df['duplicate'] == True].index
        df.drop(dups, inplace=True)

    df.drop(columns='duplicate', inplace=True)

# Run checks
display("Train Nulls?", train.isnull().any())
display("Test Nulls?", test.isnull().any())
display("Target Nulls?", test_tar.isnull().any())

dup_check(train)
dup_check(test)
dup_check(test_tar)

'Train Nulls?'

PassengerId    False
Survived       False
Pclass         False
Sex            False
Age             True
SibSp          False
Parch          False
Fare           False
dtype: bool

'Test Nulls?'

PassengerId    False
Pclass         False
Sex            False
Age             True
SibSp          False
Parch          False
Fare            True
dtype: bool

'Target Nulls?'

PassengerId    False
Survived       False
dtype: bool

In [4]:
# Found nulls in train['Age'], test['Age'], and test['Fare']
display("Train Nulls")
display(train[train['Age'].isnull()])

display("Test Nulls")
display(test[test['Age'].isnull()])
display(test[test['Fare'].isnull()])

'Train Nulls'

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
5,6,0,3,male,,0,0,8.4583
17,18,1,2,male,,0,0,13.0000
19,20,1,3,female,,0,0,7.2250
26,27,0,3,male,,0,0,7.2250
28,29,1,3,female,,0,0,7.8792
...,...,...,...,...,...,...,...,...
859,860,0,3,male,,0,0,7.2292
863,864,0,3,female,,8,2,69.5500
868,869,0,3,male,,0,0,9.5000
878,879,0,3,male,,0,0,7.8958


'Test Nulls'

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
10,902,3,male,,0,0,7.8958
22,914,1,female,,0,0,31.6833
29,921,3,male,,2,0,21.6792
33,925,3,female,,1,2,23.4500
36,928,3,female,,0,0,8.0500
...,...,...,...,...,...,...,...
408,1300,3,female,,0,0,7.7208
410,1302,3,female,,0,0,7.7500
413,1305,3,male,,0,0,8.0500
416,1308,3,male,,0,0,8.0500


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
152,1044,3,male,60.5,0,0,


In [5]:
# Impute the null values to the mean
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

# Should display nothing
display(train[train['Age'].isnull()])
display(test[test['Age'].isnull()])
display(test[test['Fare'].isnull()])

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare


In [6]:
# Discritize Sex

# Taken from module 3 notebook
def encode_onehot(_df, f):
    _df2 = pd.get_dummies(_df[f], prefix='', prefix_sep='').max(level=0, axis=1).add_prefix(f+' - ')
    df3 = pd.concat([_df, _df2], axis=1)
    df3 = df3.drop([f], axis=1)
    return df3

train = encode_onehot(train, 'Sex')
test = encode_onehot(test, 'Sex')

# Show the current state of the table
display(train)
display(test)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex - female,Sex - male
0,1,0,3,22.000000,1,0,7.2500,0,1
1,2,1,1,38.000000,1,0,71.2833,1,0
2,3,1,3,26.000000,0,0,7.9250,1,0
3,4,1,1,35.000000,1,0,53.1000,1,0
4,5,0,3,35.000000,0,0,8.0500,0,1
...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,0,1
887,888,1,1,19.000000,0,0,30.0000,1,0
888,889,0,3,29.699118,1,2,23.4500,1,0
889,890,1,1,26.000000,0,0,30.0000,0,1


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex - female,Sex - male
0,892,3,34.50000,0,0,7.8292,0,1
1,893,3,47.00000,1,0,7.0000,1,0
2,894,2,62.00000,0,0,9.6875,0,1
3,895,3,27.00000,0,0,8.6625,0,1
4,896,3,22.00000,1,1,12.2875,1,0
...,...,...,...,...,...,...,...,...
413,1305,3,30.27259,0,0,8.0500,0,1
414,1306,1,39.00000,0,0,108.9000,1,0
415,1307,3,38.50000,0,0,7.2500,0,1
416,1308,3,30.27259,0,0,8.0500,0,1


In [7]:
# Let's peek the correlations
display(train.corr())
display(test.corr())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex - female,Sex - male
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658,-0.042939,0.042939
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307,0.543351,-0.543351
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495,-0.1319,0.1319
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566,-0.084153,0.084153
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651,0.114631,-0.114631
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225,0.245489,-0.245489
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0,0.182333,-0.182333
Sex - female,-0.042939,0.543351,-0.1319,-0.084153,0.114631,0.245489,0.182333,1.0,-1.0
Sex - male,0.042939,-0.543351,0.1319,0.084153,-0.114631,-0.245489,-0.182333,-1.0,1.0


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex - female,Sex - male
PassengerId,1.0,-0.026751,-0.030874,0.003818,0.04308,0.008209,-0.023245,0.023245
Pclass,-0.026751,1.0,-0.440782,0.001087,0.018721,-0.576619,-0.108615,0.108615
Age,-0.030874,-0.440782,1.0,-0.079535,-0.045073,0.3268,-1.1e-05,1.1e-05
SibSp,0.003818,0.001087,-0.079535,1.0,0.306895,0.171488,0.099943,-0.099943
Parch,0.04308,0.018721,-0.045073,0.306895,1.0,0.230001,0.15912,-0.15912
Fare,0.008209,-0.576619,0.3268,0.171488,0.230001,1.0,0.191382,-0.191382
Sex - female,-0.023245,-0.108615,-1.1e-05,0.099943,0.15912,0.191382,1.0,-1.0
Sex - male,0.023245,0.108615,1.1e-05,-0.099943,-0.15912,-0.191382,-1.0,1.0


In [8]:
# Prepare training set
x_train = train.loc[:, train.columns != 'Survived'].values
y_train = train.loc[:, train.columns == 'Survived'].values.ravel()

# Prepare test set
y_test = test_tar.loc[:, test_tar.columns == 'Survived'].values.ravel()

# Model Declaration
model = RandomForestClassifier(n_estimators=1000)

# Construct Pipeline and Train/Test
pipe_lr = make_pipeline(StandardScaler(), model)
pipe_lr.fit(x_train, y_train)
y_pred = pipe_lr.predict(test)

print(confusion_matrix(y_test, y_pred).ravel())
print(accuracy_score(y_test, y_pred))

[242  24  58  94]
0.8038277511961722
