### Decision Tree Exercises

In [59]:
# imports
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

In [60]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

In [61]:
# getting the data
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [62]:
# cleaning up my column issues
df["is_female"] = df.sex == "Female"

df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town'])

# We'll want to encode the day variable, since there are 3 possibilities
embarked_dummy = pd.get_dummies(df[["embarked"]], drop_first=True)
class_dummy = pd.get_dummies(df[["class"]], drop_first=True)

In [63]:
# putting the three dataframes together
df = pd.concat([df, embarked_dummy, class_dummy], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked', 'class', 'pclass'])
df.isna().sum()

passenger_id      0
survived          0
age             177
sibsp             0
parch             0
fare              0
alone             0
is_female         0
embarked_Q        0
embarked_S        0
class_Second      0
class_Third       0
dtype: int64

In [64]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = split(df, stratify_by="survived")
print(train.survived.value_counts())
train.head()

0    307
1    191
Name: survived, dtype: int64


Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,embarked_Q,embarked_S,class_Second,class_Third
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,False,0,1,1,0
306,306,1,,0,0,110.8833,1,False,0,0,0,0


In [65]:
#Calculate our fill value using train dataset only.
avg_age = train.age.mean()

# Fill null values in all of our datasets using our hardcoded value.
train.age = train.age.fillna(avg_age)

validate.age = validate.age.fillna(avg_age)

test.age = test.age.fillna(avg_age)

In [66]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

#### 1. What is your baseline prediction? What is your baseline accuracy?

In [67]:
# needs help
train["most_frequent"] = "0"
baseline_accuracy = (train.survived == train.most_frequent).mean()
baseline_accuracy

0.0

#### 2. Fit the decision tree classifier to your training sample and transform

In [68]:
# generating a blank, new Decision Tree model
clf = DecisionTreeClassifier(max_depth=4, random_state=123)

# training the model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [30]:
# Visualize the model so iut can explain itself!
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

#### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [69]:
# making a set of predictions using this trained model
y_pred = clf.predict(X_train)
y_pred[0:3]

array([1, 0, 0])

In [70]:
# evaluating the model with .score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.77


In [72]:
# confusion matrix for model --------------------- totally confused
y_true = y_train
confusion_matrix(y_true, y_pred,
                 labels = [0, 1])

array([[265,  42],
       [ 73, 118]])

In [73]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.86      0.82       307
           1       0.74      0.62      0.67       191

    accuracy                           0.77       498
   macro avg       0.76      0.74      0.75       498
weighted avg       0.77      0.77      0.76       498



In [74]:
# clf was trained on X_train, y_train
# To evaluate the model trained on new data, the arguments coming into .score()
clf.score(X_train, y_train)

0.7690763052208835

In [36]:
# Use the classification model trained on train data to make predictions on validate data
y_pred = clf.predict(X_train)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.86      0.82       307
           1       0.74      0.62      0.67       191

    accuracy                           0.77       498
   macro avg       0.76      0.74      0.75       498
weighted avg       0.77      0.77      0.76       498



#### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

In [75]:
# did not survive = positive class
# survived = negative class

# False positive: We predicted did not survive, but it is actually survived
# False negative: We predicted survived, but it is actually did not survive

tp = 118
tn = 265
fp = 42
fn = 73

print("Classifier (where 'did not survive' is the positive prediction)")
print("True Positives", tp)
print("False Positives", fp)
print("False Negatives", fn)
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", round(recall,2))
print("Precision is", round(precision,2))

Classifier (where 'did not survive' is the positive prediction)
True Positives 118
False Positives 42
False Negatives 73
True Negatives 265
-------------
Accuracy is 0.7690763052208835
Recall is 0.62
Precision is 0.74


#### 5. Run through steps 2-4 using a different max_depth value.

In [78]:
# generating a blank, new Decision Tree model
clf = DecisionTreeClassifier(max_depth=5, random_state=123)

# training the model on the training data
# fitting == training the model
clf = clf.fit(X_train, y_train)
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [47]:
# Visualize the model so iut can explain itself!
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [79]:
# making a set of predictions using this trained model
y_pred = clf.predict(X_train)
y_pred[0:3]

array([1, 0, 0])

In [80]:
# evaluating the model with .score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.79


In [81]:
# clf was trained on X_train, y_train
# To evaluate the model trained on new data, the arguments coming into .score()
clf.score(X_validate, y_validate)

0.6915887850467289

In [82]:
# Let's evaluate this model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.3f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.692


In [83]:
# Use the classification model trained on train data to make predictions on validate data
y_pred = clf.predict(X_validate)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.82      0.77       132
           1       0.62      0.49      0.55        82

    accuracy                           0.69       214
   macro avg       0.67      0.65      0.66       214
weighted avg       0.68      0.69      0.68       214



#### 6. Which model performs better on your in-sample data?

#### 7. Which model performs best on your out-of-sample data, the validate set?

### Random Forest Exercises

#### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [None]:
# additional import for random forest
from sklearn.ensemble import RandomForestClassifier



#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

#### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

#### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

#### After making a few models, which one has the best performance (or closest metrics) on both train and validate?