In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import os
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import acquire1
import env

import prepare

import graphviz
from graphviz import Graph

In [2]:
train, validate, test = prepare.prep_titanic_data(acquire1.get_titanic_data())

In [3]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,sibsp,parch,fare,embark_town,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
455,455,1,3,male,0,0,7.8958,Cherbourg,1,0,1,1,0,0
380,380,1,1,female,0,0,227.525,Cherbourg,1,1,0,1,0,0
492,492,0,1,male,0,0,30.5,Southampton,1,0,1,0,0,1
55,55,1,1,male,0,0,35.5,Southampton,1,0,1,0,0,1
243,243,0,3,male,0,0,7.125,Southampton,1,0,1,0,0,1


In [4]:
train.drop(columns=['sex', 'embark_town'], inplace=True)
validate.drop(columns=['sex', 'embark_town'], inplace=True)
test.drop(columns=['sex', 'embark_town'], inplace=True)

In [5]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
455,455,1,3,0,0,7.8958,1,0,1,1,0,0
380,380,1,1,0,0,227.525,1,1,0,1,0,0
492,492,0,1,0,0,30.5,1,0,1,0,0,1
55,55,1,1,0,0,35.5,1,0,1,0,0,1
243,243,0,3,0,0,7.125,1,0,1,0,0,1


What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [6]:
#create baseline
train['baseline']= 0

In [7]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,baseline
455,455,1,3,0,0,7.8958,1,0,1,1,0,0,0
380,380,1,1,0,0,227.525,1,1,0,1,0,0,0
492,492,0,1,0,0,30.5,1,0,1,0,0,1,0
55,55,1,1,0,0,35.5,1,0,1,0,0,1,0
243,243,0,3,0,0,7.125,1,0,1,0,0,1,0


In [8]:
#check accuracy of baseline
accuracy = (train.survived == train.baseline).mean()
accuracy

0.6161048689138576

2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [9]:
#create the object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [10]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [11]:
# model.fit(X, y)

clf = clf.fit(X_train, y_train)

In [12]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, class_names=('survived', 'died'), rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [13]:
#finding model score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.81


In [14]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 1, 0, 0, 0])

In [15]:
#create a confusion matrix
confusion_matrix(y_train, y_pred)

array([[293,  36],
       [ 63, 142]])

In [16]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,293,36
1,63,142


In [17]:
#print classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.89      0.86       329
           1       0.80      0.69      0.74       205

    accuracy                           0.81       534
   macro avg       0.81      0.79      0.80       534
weighted avg       0.81      0.81      0.81       534



4. Compute: Accuracy, true positive rate(recall), false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [18]:
#create a confusion matrix
confusion_matrix(y_train, y_pred)

array([[293,  36],
       [ 63, 142]])

Run through steps 2-4 using a different max_depth value.

2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [19]:
#create the object
clf = DecisionTreeClassifier(max_depth=4, random_state=123)

In [20]:
# model.fit(X, y)

clf = clf.fit(X_train, y_train)

In [21]:
#create decision tree .pdf
dot_data = export_graphviz(clf, feature_names= X_train.columns, class_names=('survived', 'died'), rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
#finding model score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

In [22]:
#clf.predict(X_train) uses the model to create a prediction for each row in the X_train dataframe and each prediction is
# based on the specific features and their values in that given row...one prediction for each row in that dataframe
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 1, 0, 0, 0])

In [23]:
#create a confusion matrix
confusion_matrix(y_train, y_pred)

array([[296,  33],
       [ 60, 145]])

In [24]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,296,33
1,60,145


In [25]:
#print classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       329
           1       0.81      0.71      0.76       205

    accuracy                           0.83       534
   macro avg       0.82      0.80      0.81       534
weighted avg       0.82      0.83      0.82       534



Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.(See above)

6. Which model performs better on your in-sample data?
The second model with depth of 4 performs better.

Which model performs best on your out-of-sample data, the validate set?

In [26]:
#create the object
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [27]:
dot_data = export_graphviz(clf, feature_names= X_validate.columns, class_names=('survived', 'died'), rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [28]:
#finding model score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_validate, y_validate)))

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [29]:
#clf.predict(X_train) uses the model to create a prediction for each row in the X_train dataframe and each prediction is
# based on the specific features and their values in that given row...one prediction for each row in that dataframe
y_pred = clf.predict(X_validate)
y_pred[0:5]

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.