In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from env import host, user, password

import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from graphviz import Graph

In [2]:
def get_titanic_data(host = host, user = user, password = password):
    db = 'titanic_db'
    return pd.read_sql('SELECT * FROM passengers', f'mysql+pymysql://{user}:{password}@{host}/{db}')

In [3]:
titanic = get_titanic_data()

In [4]:
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [5]:
# What is your baseline prediction?
# Based on the numbers baseline should be that passengers did not survive

titanic.groupby('survived').count()

Unnamed: 0_level_0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,549,549,549,424,549,549,549,549,549,67,549,549
1,342,342,342,290,342,342,342,340,342,136,340,342


In [6]:
titanic['baseline'] = 0

In [7]:
549 / (549 + 342)

0.6161616161616161

In [8]:
titanic.isna().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
baseline          0
dtype: int64

In [9]:
titanic['sex'] = (titanic['sex'] == 'female')

In [10]:
titanic.drop(columns = ['embark_town'], inplace = True)

In [11]:
# Create dummy variables of the species name.
titanic.drop(columns = ['deck'], inplace = True)
titanic.dropna(how = 'any', inplace = True)
dummies_embarked = pd.get_dummies(titanic[['embarked']])
dummies_class = pd.get_dummies(titanic[['class']])
titanic.drop(columns = ['embarked', 'passenger_id', 'class'], inplace = True)


In [12]:
titanic = pd.concat([titanic, dummies_embarked, dummies_class], axis=1)

In [13]:
# What is your baseline accuracy?
# 61.62%
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,alone,baseline,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third
0,0,3,False,22.0,1,0,7.2500,0,0,0,0,1,0,0,1
1,1,1,True,38.0,1,0,71.2833,0,0,1,0,0,1,0,0
2,1,3,True,26.0,0,0,7.9250,1,0,0,0,1,0,0,1
3,1,1,True,35.0,1,0,53.1000,0,0,0,0,1,1,0,0
4,0,3,False,35.0,0,0,8.0500,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,True,39.0,0,5,29.1250,0,0,0,1,0,0,0,1
886,0,2,False,27.0,0,0,13.0000,1,0,0,0,1,0,1,0
887,1,1,True,19.0,0,0,30.0000,1,0,0,0,1,1,0,0
889,1,1,False,26.0,0,0,30.0000,1,0,1,0,0,1,0,0


In [14]:
# Fit the decision tree classifier to your training sample and transform 
# (i.e. make predictions on the training sample)
train, test = train_test_split(titanic, test_size=.2, random_state=123, stratify=titanic['survived'])
train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train['survived'])

In [15]:
train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,alone,baseline,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third
450,0,2,False,36.0,1,2,27.75,0,0,0,0,1,0,1,0
543,1,2,False,32.0,1,0,26.0,0,0,0,0,1,0,1,0
157,0,3,False,30.0,0,0,8.05,1,0,0,0,1,0,0,1
462,0,1,False,47.0,0,0,38.5,1,0,0,0,1,1,0,0
397,0,2,False,46.0,0,0,26.0,1,0,0,0,1,0,1,0


In [16]:
x_train = train.drop(columns = ['survived'])
y_train = train.survived

x_validate = validate.drop(columns = ['survived'])
y_validate = validate.survived

x_test = test.drop(columns = ['survived'])
y_test = test.survived

In [17]:
clf3 = DecisionTreeClassifier(max_depth=3, random_state=123)
clf3 = clf3.fit(x_train, y_train)
y3_pred = clf3.predict(x_train)

In [18]:
dot_data = export_graphviz(clf3, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('tips_decision_tree', view=True, format="pdf")

'tips_decision_tree.pdf'

In [19]:
# Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, 
# precision, recall, f1-score, and support.


# True positive: survived predicted survived
# True negative: drown predicted drown
# False positive: drown predicted survived
# False negative: survived predicted drown

In [20]:
confusion_matrix(y_train, y3_pred)

array([[230,   7],
       [ 56, 105]])

In [21]:
print(classification_report(y_train, y3_pred))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88       237
           1       0.94      0.65      0.77       161

    accuracy                           0.84       398
   macro avg       0.87      0.81      0.82       398
weighted avg       0.86      0.84      0.83       398



In [22]:
clf4 = DecisionTreeClassifier(max_depth=4, random_state=123)
clf4 = clf4.fit(x_train, y_train)
y4_pred = clf4.predict(x_train)

In [23]:
confusion_matrix(y_train, y4_pred)

array([[231,   6],
       [ 55, 106]])

In [24]:
print(classification_report(y_train, y4_pred))

              precision    recall  f1-score   support

           0       0.81      0.97      0.88       237
           1       0.95      0.66      0.78       161

    accuracy                           0.85       398
   macro avg       0.88      0.82      0.83       398
weighted avg       0.86      0.85      0.84       398



In [25]:
y3_val = clf3.predict(x_validate)
print(classification_report(y_validate, y3_val))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84       102
           1       0.88      0.52      0.65        69

    accuracy                           0.78       171
   macro avg       0.81      0.74      0.75       171
weighted avg       0.80      0.78      0.76       171



In [26]:
y4_val = clf4.predict(x_validate)
print(classification_report(y_validate, y4_val))

              precision    recall  f1-score   support

           0       0.74      0.94      0.83       102
           1       0.86      0.52      0.65        69

    accuracy                           0.77       171
   macro avg       0.80      0.73      0.74       171
weighted avg       0.79      0.77      0.76       171



In [27]:
clf5 = DecisionTreeClassifier(max_depth=5, random_state=123)
clf5 = clf5.fit(x_train, y_train)
y5_pred = clf5.predict(x_train)

In [28]:
print(classification_report(y_train, y5_pred))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       237
           1       0.89      0.74      0.81       161

    accuracy                           0.86       398
   macro avg       0.87      0.84      0.85       398
weighted avg       0.86      0.86      0.86       398



In [29]:
y5_val = clf5.predict(x_validate)
print(classification_report(y_validate, y5_val))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       102
           1       0.80      0.62      0.70        69

    accuracy                           0.78       171
   macro avg       0.79      0.76      0.77       171
weighted avg       0.79      0.78      0.78       171



In [30]:
dot_data = export_graphviz(clf4, feature_names= x_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('tips_decision_tree', view=True, format="pdf")

'tips_decision_tree.pdf'

In [None]:
# Which model performs better on your in-sample data?
# for in sample data adjusting the max depth is always going to lead to a higher in sample accuracy

In [None]:
# Which model performs best on your out-of-sample data, the validate set?
# A max depth of 5 resulted in the most accurate model when used with the validate data set.