In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import operator
import math
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing, tree, metrics
from sklearn.model_selection import KFold
from sklearn.base import clone
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import sklearn

In [2]:
# Access the data
test_data = pd.read_csv('data/test.csv')
train_data = pd.read_csv('data/train.csv')

## Data Cleaning 

In [3]:
train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


- Survived : Whether they survived
- Pclass : Ticket class
- Sex : Sex
- Age: Age in years
- SibSp: Number of siblings / spouses aboard the Titanic
- Parch: Number of parents / children aboard the Titanic
- Ticket: Ticket number
- Fare: Passenger fare
- Cabin: Cabin number
- Embarked: Part of Embarkation, C = Cherbourg, Q = Queenstown, S = Southampton



In [5]:
print("Training Data")
print("Shape: ", train_data.shape)
print("__________________________________________")
print("\nNumber of missing values: ")
train_data.isna().sum()

Training Data
Shape:  (891, 12)
__________________________________________

Number of missing values: 


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
print("Testing Data")
print("Shape: ", test_data.shape)
print("__________________________________________")
print("\nNumber of missing values: ")
test_data.isna().sum()

Testing Data
Shape:  (418, 11)
__________________________________________

Number of missing values: 


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

### Lets drop cabin since it has too many missing values for both datasets

In [7]:
train_data.drop('Cabin', axis=1, inplace=True)
test_data.drop('Cabin', axis=1, inplace=True)

In [8]:
train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


# Exploratory Data Analysis

### Predicting people's age

In [9]:
train_data[['Name','Age']].head(10)

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
4,"Allen, Mr. William Henry",35.0
5,"Moran, Mr. James",
6,"McCarthy, Mr. Timothy J",54.0
7,"Palsson, Master. Gosta Leonard",2.0
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0
9,"Nasser, Mrs. Nicholas (Adele Achem)",14.0


### Getting people's titles

In [10]:
# Use regex to get people's titles from their names
peoples_titles = train_data['Name'].str.extract(r' (\w+)\. ').to_numpy()
train_data['Title'] = peoples_titles

peoples_titles_test = test_data['Name'].str.extract(r' (\w+)\. ').to_numpy()
test_data['Title'] = peoples_titles_test


In [11]:
# Unique Titles
train_data[['Title']].value_counts()
# Unique Titles from people with missing ages
# train_data[(train_data['Age'].isna())][['Title']].value_counts()

Title   
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Col           2
Mlle          2
Mme           1
Ms            1
Capt          1
Lady          1
Jonkheer      1
Don           1
Countess      1
Sir           1
dtype: int64

In [12]:
# Unique Titles from test set
test_data[['Title']].value_counts()
# Unique Titles from people with missing ages in test set
# test_data[(test_data['Age'].isna())][['Title']].value_counts()



Title 
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dona        1
Dr          1
Ms          1
dtype: int64

In [13]:
# Seeing the Ages from different Titles
# train_data[train_data['Title']=='Master']['Age'].dropna().sort_values()

In [14]:
# Seeing the Ages from different Titles for test dataset
# test_data[test_data['Title']=='Master']['Age'].dropna().sort_values()

In [15]:
# The median Ages of people with titles
Master_Median = np.median(train_data[train_data['Title']=='Master']['Age'].dropna().sort_values())
Mr_Median = np.median(train_data[train_data['Title']=='Mr']['Age'].dropna().sort_values())
Miss_Median = np.median(train_data[train_data['Title']=='Miss']['Age'].dropna().sort_values())
Mrs_Median = np.median(train_data[train_data['Title']=='Mrs']['Age'].dropna().sort_values())
Dr_Median = np.median(train_data[train_data['Title']=='Dr']['Age'].dropna().sort_values())

In [16]:
# Setting the missing values in the training age data to the median ages found above
train_data.loc[((train_data['Title'] == 'Master')&(train_data['Age'].isna())), 'Age'] = Master_Median
train_data.loc[((train_data['Title'] == 'Mr')&(train_data['Age'].isna())), 'Age'] = Mr_Median
train_data.loc[((train_data['Title'] == 'Miss')&(train_data['Age'].isna())), 'Age'] = Miss_Median
train_data.loc[((train_data['Title'] == 'Mrs')&(train_data['Age'].isna())), 'Age'] = Mrs_Median
train_data.loc[((train_data['Title'] == 'Dr')&(train_data['Age'].isna())), 'Age'] = Dr_Median

In [17]:
# The median Ages of people with titles for test_data
Master_Median_test = np.median(test_data[test_data['Title']=='Master']['Age'].dropna().sort_values())
Mr_Median_test = np.median(test_data[test_data['Title']=='Mr']['Age'].dropna().sort_values())
Miss_Median_test = np.median(test_data[test_data['Title']=='Miss']['Age'].dropna().sort_values())
Mrs_Median_test = np.median(test_data[test_data['Title']=='Mrs']['Age'].dropna().sort_values())
# Only one value for Ms and age is missing so we will use Miss
# Ms_Median_test = np.median(test_data[test_data['Title']=='Ms']['Age'].dropna().sort_values())

In [18]:
# Setting the missing values in the test age data to the median ages found above
test_data.loc[((test_data['Title'] == 'Master')&(test_data['Age'].isna())), 'Age'] = Master_Median_test
test_data.loc[((test_data['Title'] == 'Mr')&(test_data['Age'].isna())), 'Age'] = Mr_Median_test
test_data.loc[((test_data['Title'] == 'Miss')&(test_data['Age'].isna())), 'Age'] = Miss_Median_test
test_data.loc[((test_data['Title'] == 'Mrs')&(test_data['Age'].isna())), 'Age'] = Mrs_Median_test
test_data.loc[((test_data['Title'] == 'Ms')&(test_data['Age'].isna())), 'Age'] = Miss_Median_test


In [19]:
train_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
Title          0
dtype: int64

In [20]:
test_data.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
Title          0
dtype: int64

### Now That we have predicted all the ages, lets make sure we get rid of all NA values
For both data sets

### Train Data

In [21]:
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [22]:
# S is the location where most people embarked so we will use this value (mode)
train_data.loc[(train_data['Embarked'].isna()), 'Embarked'] = 'S'

### Test Data

In [23]:
test_data['Fare'].value_counts()

7.7500      21
26.0000     19
8.0500      17
13.0000     17
7.8958      11
            ..
31.6833      1
16.0000      1
53.1000      1
146.5208     1
20.2500      1
Name: Fare, Length: 169, dtype: int64

In [24]:
# We will use the median Fare to fill the na value
test_data.loc[(test_data['Fare'].isna()), 'Fare'] = np.median(test_data['Fare'].dropna())

In [25]:
# Checking to see if we have any more missing values
sum(train_data.isna().sum()), sum(test_data.isna().sum())

(0, 0)

### Lets now prepare our data for modeling

In [26]:
# Dropping the name, passengerId, and ticket number column
train_data.drop(labels=['Name', 'PassengerId'], axis=1, inplace=True)  #edited!!!! took ticket away!!
test_data.drop(labels=['Name'], axis=1, inplace=True)

In [27]:
# Replacing the male/female with 1/0 respectively
train_data['Sex'].replace({'male':1, 'female':0}, inplace=True)
test_data['Sex'].replace({'male':1, 'female':0}, inplace=True)

In [28]:
# Now we have to turn 'Pclass', 'Embarked', and 'Title' into one hot encodings 
# because they are all categorical features
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,22.0,1,0,7.2500,S,Mr
1,1,1,0,38.0,1,0,71.2833,C,Mrs
2,1,3,0,26.0,0,0,7.9250,S,Miss
3,1,1,0,35.0,1,0,53.1000,S,Mrs
4,0,3,1,35.0,0,0,8.0500,S,Mr
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,S,Rev
887,1,1,0,19.0,0,0,30.0000,S,Miss
888,0,3,0,21.0,1,2,23.4500,S,Miss
889,1,1,1,26.0,0,0,30.0000,C,Mr


In [29]:
# Lets decrease the values in Title to not include the categories that only have a few people
train_data['Title'] = train_data['Title'].replace(['Mme','Rev','Major','Col','Mlle','Ms','Capt','Lady','Jonkheer','Don','Countess', 'Sir'], 'other')
test_data['Title'] = test_data['Title'].replace(['Dona','Mme','Rev','Major','Col','Mlle','Ms','Capt','Lady','Jonkheer','Don','Countess', 'Sir'], 'other')


In [30]:
# Creating our one hot encoder
ohe = OneHotEncoder()
ohe_test = OneHotEncoder()

In [31]:
# Passing our data into our ohe and getting the values
ohe_feature_array = ohe.fit_transform(train_data[['Embarked', 'Title']]).toarray()
ohe_feature_array_test = ohe_test.fit_transform(test_data[['Embarked', 'Title']]).toarray()

In [32]:
# ohe.categories_

In [33]:
# Getting the feature labels
ohe_feature_labels = ohe.categories_
ohe_feature_labels = np.concatenate((ohe_feature_labels))

ohe_feature_labels_test = ohe_test.categories_
ohe_feature_labels_test = np.concatenate((ohe_feature_labels_test))

print(ohe_feature_labels)
print(ohe_feature_labels_test)


['C' 'Q' 'S' 'Dr' 'Master' 'Miss' 'Mr' 'Mrs' 'other']
['C' 'Q' 'S' 'Dr' 'Master' 'Miss' 'Mr' 'Mrs' 'other']


In [34]:
# Creating our ohe values dataframe
ohe_features = pd.DataFrame(ohe_feature_array, columns=ohe_feature_labels).astype(int)
ohe_features_test = pd.DataFrame(ohe_feature_array_test, columns=ohe_feature_labels_test).astype(int)

In [35]:
# combining our training set and ohe values set
train_data = pd.concat([train_data, ohe_features], axis=1)
test_data = pd.concat([test_data, ohe_features_test], axis=1)

In [36]:
# droping the old categorical columns
train_data.drop(labels=['Embarked', 'Title'], axis=1, inplace=True)
test_data.drop(labels=['Embarked', 'Title'], axis=1, inplace=True)

In [37]:
train_data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Dr,Master,Miss,Mr,Mrs,other
0,0,3,1,22.0,1,0,7.25,0,0,1,0,0,0,1,0,0
1,1,1,0,38.0,1,0,71.2833,1,0,0,0,0,0,0,1,0
2,1,3,0,26.0,0,0,7.925,0,0,1,0,0,1,0,0,0


In [38]:
test_data.head(3)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Dr,Master,Miss,Mr,Mrs,other
0,892,3,1,34.5,0,0,7.8292,0,1,0,0,0,0,1,0,0
1,893,3,0,47.0,1,0,7.0,0,0,1,0,0,0,0,1,0
2,894,2,1,62.0,0,0,9.6875,0,1,0,0,0,0,1,0,0


In [39]:
# Lets add a new column that measures the amount of family a person has with them
train_data['Total_Fam'] = train_data['SibSp'] + train_data['Parch']
test_data['Total_Fam'] = test_data['SibSp'] + test_data['Parch']

# Lets add a new column that measures the product of Pclass and Fare
train_data['Pclass*Fare'] = train_data['Pclass'] + train_data['Fare']
test_data['Pclass*Fare'] = test_data['Pclass'] + test_data['Fare']

# Now time to normalize our numerical values such as Age, SibSp, Parch, and Fare, Pclass*Fare, Total_Fam
# Using z-score
def normalize(train_data, test_data):
    train_data['Age'] = (train_data['Age']-np.mean(train_data['Age']))/np.std(train_data['Age'])
    train_data['SibSp'] = (train_data['SibSp']-np.mean(train_data['SibSp']))/np.std(train_data['SibSp'])
    train_data['Parch'] = (train_data['Parch']-np.mean(train_data['Parch']))/np.std(train_data['Parch'])
    train_data['Fare'] = (train_data['Fare']-np.mean(train_data['Fare']))/np.std(train_data['Fare'])
    train_data['Pclass*Fare'] = (train_data['Pclass*Fare']-np.mean(train_data['Pclass*Fare']))/np.std(train_data['Pclass*Fare'])
    train_data['Total_Fam'] = (train_data['Total_Fam']-np.mean(train_data['Total_Fam']))/np.std(train_data['Total_Fam'])

    test_data['Age'] = (test_data['Age']-np.mean(test_data['Age']))/np.std(test_data['Age'])
    test_data['SibSp'] = (test_data['SibSp']-np.mean(test_data['SibSp']))/np.std(test_data['SibSp'])
    test_data['Parch'] = (test_data['Parch']-np.mean(test_data['Parch']))/np.std(test_data['Parch'])
    test_data['Fare'] = (test_data['Fare']-np.mean(test_data['Fare']))/np.std(test_data['Fare'])
    test_data['Pclass*Fare'] = (test_data['Pclass*Fare']-np.mean(test_data['Pclass*Fare']))/np.std(test_data['Pclass*Fare'])
    test_data['Total_Fam'] = (test_data['Total_Fam']-np.mean(test_data['Total_Fam']))/np.std(test_data['Total_Fam'])


In [40]:
# Our final clean and normalized training dataset
train_data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Dr,Master,Miss,Mr,Mrs,other,Total_Fam,Pclass*Fare
0,0,3,1,22.0,1,0,7.25,0,0,1,0,0,0,1,0,0,1,10.25
1,1,1,0,38.0,1,0,71.2833,1,0,0,0,0,0,0,1,0,1,72.2833
2,1,3,0,26.0,0,0,7.925,0,0,1,0,0,1,0,0,0,0,10.925


In [41]:
test_data.head(3)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S,Dr,Master,Miss,Mr,Mrs,other,Total_Fam,Pclass*Fare
0,892,3,1,34.5,0,0,7.8292,0,1,0,0,0,0,1,0,0,0,10.8292
1,893,3,0,47.0,1,0,7.0,0,0,1,0,0,0,0,1,0,1,10.0
2,894,2,1,62.0,0,0,9.6875,0,1,0,0,0,0,1,0,0,0,11.6875
