In [18]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from env import host, user, password

import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from graphviz import Graph

In [2]:
def get_titanic_data(host = host, user = user, password = password):
    db = 'titanic_db'
    return pd.read_sql('SELECT * FROM passengers', f'mysql+pymysql://{user}:{password}@{host}/{db}')

In [3]:
titanic = get_titanic_data()

In [4]:
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [5]:
# What is your baseline prediction?
# Based on the numbers baseline should be that passengers did not survive

titanic.groupby('survived').count()

Unnamed: 0_level_0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,549,549,549,424,549,549,549,549,549,67,549,549
1,342,342,342,290,342,342,342,340,342,136,340,342


In [6]:
titanic['baseline'] = 0

In [7]:
549 / (549 + 342)

0.6161616161616161

In [8]:
titanic['sex'] = (titanic['sex'] == 'male')

In [9]:
titanic.drop(columns = ['embark_town'], inplace = True)

In [10]:
# Create dummy variables of the species name.
titanic.dropna(how = 'any', inplace = True)
dummies_deck = pd.get_dummies(titanic[['deck']])
dummies_embarked = pd.get_dummies(titanic[['embarked']])
dummies_class = pd.get_dummies(titanic[['class']])
titanic.drop(columns = ['deck', 'embarked', 'class'], inplace = True)


In [11]:
titanic = pd.concat([titanic, dummies_deck, dummies_embarked, dummies_class], axis=1)

In [12]:
# What is your baseline accuracy?
# 61.62%
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,alone,baseline,...,deck_D,deck_E,deck_F,deck_G,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third
1,1,1,1,False,38.0,1,0,71.2833,0,0,...,0,0,0,0,1,0,0,1,0,0
3,3,1,1,False,35.0,1,0,53.1000,0,0,...,0,0,0,0,0,0,1,1,0,0
6,6,0,1,True,54.0,0,0,51.8625,1,0,...,0,1,0,0,0,0,1,1,0,0
10,10,1,3,False,4.0,1,1,16.7000,0,0,...,0,0,0,1,0,0,1,0,0,1
11,11,1,1,False,58.0,0,0,26.5500,1,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,871,1,1,False,47.0,1,1,52.5542,0,0,...,1,0,0,0,0,0,1,1,0,0
872,872,0,1,True,33.0,0,0,5.0000,1,0,...,0,0,0,0,0,0,1,1,0,0
879,879,1,1,False,56.0,0,1,83.1583,0,0,...,0,0,0,0,1,0,0,1,0,0
887,887,1,1,False,19.0,0,0,30.0000,1,0,...,0,0,0,0,0,0,1,1,0,0


In [13]:
# Fit the decision tree classifier to your training sample and transform 
# (i.e. make predictions on the training sample)
train, test = train_test_split(titanic, test_size=.2, random_state=123, stratify=titanic['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train['survived'])

In [14]:
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,alone,baseline,...,deck_D,deck_E,deck_F,deck_G,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third
429,429,1,3,True,32.0,0,0,8.05,1,0,...,0,1,0,0,0,0,1,0,0,1
679,679,1,1,True,36.0,0,1,512.3292,0,0,...,0,0,0,0,1,0,0,1,0,0
118,118,0,1,True,24.0,0,1,247.5208,0,0,...,0,0,0,0,1,0,0,1,0,0
435,435,1,1,False,14.0,1,2,120.0,0,0,...,0,0,0,0,0,0,1,1,0,0
782,782,0,1,True,29.0,0,0,30.0,1,0,...,1,0,0,0,0,0,1,1,0,0
209,209,1,1,True,40.0,0,0,31.0,1,0,...,0,0,0,0,1,0,0,1,0,0
137,137,0,1,True,37.0,1,0,53.1,0,0,...,0,0,0,0,0,0,1,1,0,0
337,337,1,1,False,41.0,0,0,134.5,1,0,...,0,1,0,0,1,0,0,1,0,0
577,577,1,1,False,39.0,1,0,55.9,0,0,...,0,1,0,0,0,0,1,1,0,0
641,641,1,1,False,24.0,0,0,69.3,1,0,...,0,0,0,0,1,0,0,1,0,0


In [15]:
x_train = train.drop(columns = ['survived'])
y_train = train.survived

x_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

x_test = test.drop(columns = ['survived'])
y_test = test.survived

In [16]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [17]:
clf = clf.fit(x_train, y_train)


In [19]:
dot_data = export_graphviz(clf, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('tips_decision_tree', view=True, format="pdf")

'tips_decision_tree.pdf'