Decision Trees

In [1]:
#Initial Imports

import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Load the raw Titanic records into a DataFrame and preview the first rows.
# The encoding is passed explicitly — presumably the file contains characters
# that are not valid UTF-8 (TODO confirm against the data file).
file_path = Path('../Resources/data.csv')
df_titanic = pd.read_csv(
    file_path,
    encoding="ISO-8859-1",
)
df_titanic.head()

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,S,United States,5547.0,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,S,United States,2673.0,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,S,United States,2673.0,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,S,England,2673.0,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,S,Norway,348125.0,7.13,0.0,0.0,yes


In [13]:
# Collapse the four crew sub-categories into a single 'crew' class.
# Values of 'class' that are not listed here are left unchanged by replace().
mapping = dict.fromkeys(
    ['victualling crew', 'engineering crew', 'restaurant staff', 'deck crew'],
    'crew',
)
# The nested dict restricts the replacement to the 'class' column only.
df_t = df_titanic.replace({'class': mapping})
df_t.head()

Unnamed: 0,name,gender,age,class,embarked,country,ticketno,fare,sibsp,parch,survived
0,"Abbing, Mr. Anthony",male,42.0,3rd,S,United States,5547.0,7.11,0.0,0.0,no
1,"Abbott, Mr. Eugene Joseph",male,13.0,3rd,S,United States,2673.0,20.05,0.0,2.0,no
2,"Abbott, Mr. Rossmore Edward",male,16.0,3rd,S,United States,2673.0,20.05,1.0,1.0,no
3,"Abbott, Mrs. Rhoda Mary 'Rosa'",female,39.0,3rd,S,England,2673.0,20.05,1.0,1.0,yes
4,"Abelseth, Miss. Karen Marie",female,16.0,3rd,S,Norway,348125.0,7.13,0.0,0.0,yes


# Drop Columns that are not needed. 
- The passenger name is not needed, so we will drop the `name` column.
- The ticket number won't add value to our analysis, so we will drop the `ticketno` column as well.

In [14]:
# Remove the identifier-like columns that will not be used as model features.
df_t = df_t.drop(['name', 'ticketno'], axis='columns')
df_t.head()

Unnamed: 0,gender,age,class,embarked,country,fare,sibsp,parch,survived
0,male,42.0,3rd,S,United States,7.11,0.0,0.0,no
1,male,13.0,3rd,S,United States,20.05,0.0,2.0,no
2,male,16.0,3rd,S,United States,20.05,1.0,1.0,no
3,female,39.0,3rd,S,England,20.05,1.0,1.0,yes
4,female,16.0,3rd,S,Norway,7.13,0.0,0.0,yes


# Adjust gender column
## Keep the 'gender' column, but replace the strings 'female' and 'male' with the numbers 0 and 1 so that the machine learning model can use this column.

In [15]:
# Encode gender numerically: 'female' -> 0, 'male' -> 1.
gender = {'female': 0, 'male': 1}
# Rebind df_t instead of mutating it in place, so the cell is a plain
# "input -> output" transformation.
# The explicit cast matters: replace() on a string column returns an object
# column, and recent pandas releases deprecate the silent downcast to int64.
# Without the cast, 'gender' could stay dtype object and be one-hot encoded
# by the later pd.get_dummies call instead of being kept as a 0/1 column.
df_t = df_t.replace({'gender': gender}).astype({'gender': 'int64'})
df_t.head()

Unnamed: 0,gender,age,class,embarked,country,fare,sibsp,parch,survived
0,1,42.0,3rd,S,United States,7.11,0.0,0.0,no
1,1,13.0,3rd,S,United States,20.05,0.0,2.0,no
2,1,16.0,3rd,S,United States,20.05,1.0,1.0,no
3,0,39.0,3rd,S,England,20.05,1.0,1.0,yes
4,0,16.0,3rd,S,Norway,7.13,0.0,0.0,yes


# Adjust survived column
## Keep the 'survived' column but, as with 'gender', replace the strings 'no' and 'yes' with the numbers 0 and 1 so that the machine learning model can use this column as well.

In [16]:
# Encode the survival label numerically: 'no' -> 0, 'yes' -> 1.
survival = {'no': 0, 'yes': 1}
# Rebind df_t instead of mutating it in place, so the cell is a plain
# "input -> output" transformation.
# The explicit cast matters: replace() on a string column returns an object
# column, and recent pandas releases deprecate the silent downcast to int64.
# Without the cast, 'survived' could stay dtype object and be one-hot encoded
# by the later pd.get_dummies call instead of being kept as a 0/1 column.
df_t = df_t.replace({'survived': survival}).astype({'survived': 'int64'})
df_t.head()

Unnamed: 0,gender,age,class,embarked,country,fare,sibsp,parch,survived
0,1,42.0,3rd,S,United States,7.11,0.0,0.0,0
1,1,13.0,3rd,S,United States,20.05,0.0,2.0,0
2,1,16.0,3rd,S,United States,20.05,1.0,1.0,0
3,0,39.0,3rd,S,England,20.05,1.0,1.0,1
4,0,16.0,3rd,S,Norway,7.13,0.0,0.0,1


# Split Countries into groups based on geographical region
## We will keep each country with 50 or more passengers as its own category, and group all countries with fewer than 50 passengers into the following regions:
- Europe: 'EUR'
- Asia: 'ASA'
- Australia/Oceania: 'AUS'
- North America: 'NAM'
- South America: 'SAM'
- Africa: 'AFR'

## Note: the countries that remain as is (>= 50 passengers) are:
- England: 'ENG'
- United States: 'USA'
- Ireland: 'IRL'
- Sweden: 'SWE'
- Lebanon: 'LBN'
- Finland: 'FIN'

## This gives a total of 12 country codes (6 regions + 6 individual countries).

In [22]:
# Lookup table: raw country name -> three-letter code.
# It is written as code -> member countries and then inverted, so each group
# can be read at a glance. The first six codes each stand for one country;
# the remaining six are regional groups.
countrycode = {
    country: code
    for code, countries in {
        'ENG': ['England'],
        'USA': ['United States'],
        'IRL': ['Ireland'],
        'SWE': ['Sweden'],
        'LBN': ['Lebanon'],
        'FIN': ['Finland'],
        'EUR': [
            'Scotland', 'France', 'Norway', 'Belgium', 'Northern Ireland',
            'Wales', 'Bulgaria', 'Switzerland', 'Channel Islands',
            'Croatia (Modern)', 'Croatia', 'Italy', 'Spain', 'Hungary',
            'Denmark', 'Germany', 'Bosnia', 'Slovenia', 'Poland', 'Austria',
            'Greece', 'Netherlands', 'Russia', 'Slovakia (Modern day)',
            'Latvia', 'Yugoslavia',
        ],
        'NAM': ['Canada', 'Mexico', 'Cuba'],
        'ASA': ['India', 'Turkey', 'Siam', 'Syria', 'Japan', 'China/Hong Kong'],
        'SAM': ['Argentina', 'Uruguay', 'Peru', 'Guyana'],
        'AFR': ['South Africa', 'Egypt'],
        'AUS': ['Australia'],
    }.items()
    for country in countries
}
# Swap each raw country name for its code from `countrycode`.
# Rebinding (rather than inplace=True) keeps the cell a plain transformation;
# re-running it is harmless because the codes are not keys of the mapping.
df_t = df_t.replace({'country': countrycode})
df_t.head()

Unnamed: 0,gender,age,class,embarked,country,fare,sibsp,parch,survived
0,1,42.0,3rd,S,USA,7.11,0.0,0.0,0
1,1,13.0,3rd,S,USA,20.05,0.0,2.0,0
2,1,16.0,3rd,S,USA,20.05,1.0,1.0,0
3,0,39.0,3rd,S,ENG,20.05,1.0,1.0,1
4,0,16.0,3rd,S,EUR,7.13,0.0,0.0,1


In [23]:
# Inspect the column dtypes to see which columns are still non-numeric (object).
df_t.dtypes

gender        int64
age         float64
class        object
embarked     object
country      object
fare        float64
sibsp       float64
parch       float64
survived      int64
dtype: object

In [24]:
# One-hot encode the remaining text columns. With no `columns` argument,
# get_dummies converts the object/category-typed columns and passes the
# numeric columns through untouched.
df_t = pd.get_dummies(data=df_t)
df_t.head()

Unnamed: 0,gender,age,fare,sibsp,parch,survived,class_1st,class_2nd,class_3rd,class_crew,...,country_AUS,country_ENG,country_EUR,country_FIN,country_IRL,country_LBN,country_NAM,country_SAM,country_SWE,country_USA
0,1,42.0,7.11,0.0,0.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1,13.0,20.05,0.0,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,1,16.0,20.05,1.0,1.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,39.0,20.05,1.0,1.0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4,0,16.0,7.13,0.0,0.0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
#Define features set

In [None]:
#Define target vector

In [None]:
#Splitting into Train and Test sets