### Decision Tree Exercises

In [123]:
# imports
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import graphviz
from graphviz import Graph

In [124]:
def split(df, stratify_by=None):
    """
    Crude train, validate, test split
    To stratify, send in a column name for the stratify_by argument
    """

    if stratify_by == None:
        train, test = train_test_split(df, test_size=.2, random_state=123)
        train, validate = train_test_split(train, test_size=.3, random_state=123)
    else:
        train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[stratify_by])
        train, validate = train_test_split(train, test_size=.3, random_state=123, stratify=train[stratify_by])

    return train, validate, test

In [125]:
# getting the data
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [126]:
# cleaning up my column issues
df["is_female"] = df.sex == "Female"

df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town'])

# We'll want to encode the day variable, since there are 3 possibilities
embarked_dummy = pd.get_dummies(df[["embarked"]], drop_first=True)
class_dummy = pd.get_dummies(df[["class"]], drop_first=True)

In [127]:
# putting the three dataframes together
df = pd.concat([df, embarked_dummy, class_dummy], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked', 'class', 'pclass'])
df.isna().sum()

passenger_id      0
survived          0
age             177
sibsp             0
parch             0
fare              0
alone             0
is_female         0
embarked_Q        0
embarked_S        0
class_Second      0
class_Third       0
dtype: int64

In [128]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = split(df, stratify_by="survived")
print(train.survived.value_counts())

train

0    307
1    191
Name: survived, dtype: int64


Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,embarked_Q,embarked_S,class_Second,class_Third
583,583,0,36.0,0,0,40.1250,1,False,0,0,0,0
165,165,1,9.0,0,2,20.5250,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0000,0,False,0,1,1,0
306,306,1,,0,0,110.8833,1,False,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
313,313,0,28.0,0,0,7.8958,1,False,0,1,0,1
636,636,0,32.0,0,0,7.9250,1,False,0,1,0,1
222,222,0,51.0,0,0,8.0500,1,False,0,1,0,1
485,485,0,,3,1,25.4667,0,False,0,1,0,1


In [129]:
# Calculate our fill value using train dataset only.
avg_age = train.age.mean()

# Fill null values in age column using our hardcoded value.
train.age = train.age.fillna(avg_age)

validate.age = validate.age.fillna(avg_age)

test.age = test.age.fillna(avg_age)

In [130]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

#### 1. What is your baseline prediction? What is your baseline accuracy?

In [131]:
# needs help
train["most_frequent"] = 0
baseline_accuracy = (train.survived == train.most_frequent).mean()
baseline_accuracy

0.6164658634538153

#### 2. Fit the decision tree classifier to your training sample and transform

In [148]:
# generating a blank, new Decision Tree model
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

# training the first model on the training data
# fitting == training the model
clf1 = clf.fit(X_train, y_train)
clf1

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [149]:
# visualizing the model
dot_data = export_graphviz(clf1, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

#### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [150]:
# making a set of predictions using this trained model
y_pred = clf1.predict(X_train)
y_pred[0:3]

array([1, 0, 0])

In [151]:
# evaluating the model with .score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf1.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.74


In [152]:
# confusion matrix for model
y_true = y_train
confusion_matrix(y_true, y_pred,
                 labels = [0, 1])

array([[262,  45],
       [ 86, 105]])

In [153]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       307
           1       0.70      0.55      0.62       191

    accuracy                           0.74       498
   macro avg       0.73      0.70      0.71       498
weighted avg       0.73      0.74      0.73       498



In [154]:
# clf was trained on X_train, y_train
# To evaluate the model trained on new data, the arguments coming into .score()
clf1.score(X_train, y_train)

0.7369477911646586

#### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support

In [155]:
# did not survive = positive class
# survived = negative class

# manually adding the scores and printing 
tp = 118
tn = 265
fp = 42
fn = 73

print("Survived classifier (where 'did not survive' is the positive prediction)")
print("True Positives", tp)
print("False Positives", fp) # False positive: We predicted did not survive, but it is actually survived
print("False Negatives", fn) # False negative: We predicted survived, but it is actually did not survive
print("True Negatives", tn)

print("-------------")

accuracy = (tp + tn) / (tp + tn + fp + fn)
recall = tp / (tp + fn)
precision = tp / (tp + fp)

print("Accuracy is", accuracy)
print("Recall is", round(recall,2))
print("Precision is", round(precision,2))

Survived classifier (where 'did not survive' is the positive prediction)
True Positives 118
False Positives 42
False Negatives 73
True Negatives 265
-------------
Accuracy is 0.7690763052208835
Recall is 0.62
Precision is 0.74


#### 5. Comparison: Run through steps 2-4 using a different max_depth value.

In [156]:
# generating a blank, new Decision Tree model
clf = DecisionTreeClassifier(max_depth=5, random_state=123)

# training the model on the training data
# fitting == training the model
clf2 = clf.fit(X_train, y_train)
clf2

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=123, splitter='best')

In [157]:
# visualizing the second model
dot_data = export_graphviz(clf2, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True, format="pdf")

'titanic_decision_tree.pdf'

In [163]:
# making a set of predictions using this second trained model
y_pred = clf2.predict(X_train)
y_pred[0:3]

array([1, 0, 0])

In [164]:
# evaluating the second model with .score
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf2.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.79


In [166]:
# confusion matrix for second model
y_true = y_train
confusion_matrix(y_true, y_pred,
                 labels = [0, 1])

array([[271,  36],
       [ 69, 122]])

In [167]:
# classification report for second model
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.88      0.84       307
           1       0.77      0.64      0.70       191

    accuracy                           0.79       498
   macro avg       0.78      0.76      0.77       498
weighted avg       0.79      0.79      0.78       498



In [160]:
# clf was trained on X_train, y_train
# To evaluate the second model trained on new data, validate the arguments coming into .score()
clf2.score(X_validate, y_validate)

0.6915887850467289

In [161]:
# Let's evaluate the second model on out-of-sample data
print('Accuracy of Decision Tree classifier on validate set: {:.3f}'
     .format(clf2.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.692


#### 6. Which model performs better on your in-sample data?

#### 7. Which model performs best on your out-of-sample data, the validate set?

In [162]:
# first classification model

# Use the classification model trained on train data to make predictions on validate data
y_pred = clf1.predict(X_validate)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.85      0.78       132
           1       0.66      0.46      0.54        82

    accuracy                           0.70       214
   macro avg       0.69      0.66      0.66       214
weighted avg       0.69      0.70      0.69       214



In [168]:
#second classification model

# Use the classification model trained on train data to make predictions on validate data
y_pred = clf2.predict(X_validate)

# Compare actual y values from validate to predictions based on X_validate
print(classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.82      0.77       132
           1       0.62      0.49      0.55        82

    accuracy                           0.69       214
   macro avg       0.67      0.65      0.66       214
weighted avg       0.68      0.69      0.68       214



** Takeaways **
The second decision tree model in which the max_depth was increased by 2 fared slightly worse in most categories.

### Random Forest Exercises

#### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.

In [26]:
# additional import for random forest
from sklearn.ensemble import RandomForestClassifier

# getting the data
df = pd.read_csv('titanic_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [27]:
# cleaning up my column issues
df["is_female"] = df.sex == "Female"

df = df.drop(columns=['Unnamed: 0', 'deck', 'embark_town'])

# We'll want to encode the day variable, since there are 3 possibilities
embarked_dummy = pd.get_dummies(df[["embarked"]], drop_first=True)
class_dummy = pd.get_dummies(df[["class"]], drop_first=True)

In [28]:
# putting the three dataframes together
df = pd.concat([df, embarked_dummy, class_dummy], axis=1)

# drop the old columns
df = df.drop(columns=["sex", 'embarked', 'class', 'pclass'])
df.isna().sum()

passenger_id      0
survived          0
age             177
sibsp             0
parch             0
fare              0
alone             0
is_female         0
embarked_Q        0
embarked_S        0
class_Second      0
class_Third       0
dtype: int64

In [29]:
# Split the data
# stratifying means we're making representative datasets between train, validate, test
train, validate, test = split(df, stratify_by="survived")
print(train.survived.value_counts())
train.head()

0    307
1    191
Name: survived, dtype: int64


Unnamed: 0,passenger_id,survived,age,sibsp,parch,fare,alone,is_female,embarked_Q,embarked_S,class_Second,class_Third
583,583,0,36.0,0,0,40.125,1,False,0,0,0,0
165,165,1,9.0,0,2,20.525,0,False,0,1,0,1
50,50,0,7.0,4,1,39.6875,0,False,0,1,0,1
259,259,1,50.0,0,1,26.0,0,False,0,1,1,0
306,306,1,,0,0,110.8833,1,False,0,0,0,0


In [30]:
# Calculate our fill value using train dataset only.
avg_age = train.age.mean()

# Fill null values in all of our datasets using our hardcoded value.
train.age = train.age.fillna(avg_age)

validate.age = validate.age.fillna(avg_age)

test.age = test.age.fillna(avg_age)

In [31]:
# Setup our X inputs and y target variable for each split
X_train = train.drop(columns=['survived'])
y_train = train.survived # labeled data == supervise algorithm

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [32]:
# running random forest
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [33]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [36]:
print(rf.feature_importances_)

[0.24605563 0.21915032 0.04980628 0.04156951 0.27117085 0.02411287
 0.         0.02102327 0.03148504 0.02188514 0.07374109]


In [37]:
y_pred = rf.predict(X_train)

In [41]:
y_pred_proba = rf.predict_proba(X_train)

#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [42]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [43]:
print(confusion_matrix(y_train, y_pred,
                      labels = [0, 1]))

[[307   0]
 [ 17 174]]


In [44]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       307
           1       1.00      0.91      0.95       191

    accuracy                           0.97       498
   macro avg       0.97      0.96      0.96       498
weighted avg       0.97      0.97      0.97       498



#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [58]:
# create a function to calculate these metrics
def get_metrics_binary(rf):
    '''
    get_metrics_binary takes in a confusion matrix (cnf) for a binary classifier and prints out metrics based on
    values in variables named X_train, y_train, and y_pred.
    
    return: a classification report as a transposed DataFrame
    '''
    accuracy = rf.score(X_train, y_train)
    class_report = pd.DataFrame(classification_report(y_train, y_pred, output_dict=True)).T
    conf = confusion_matrix(y_train, y_pred)
    tpr = conf[1][1] / conf[1].sum()
    fpr = conf[0][1] / conf[0].sum()
    tnr = conf[0][0] / conf[0].sum()
    fnr = conf[1][0] / conf[1].sum()
    print(f'''
    The accuracy for our model is {accuracy:.4}
    The True Positive Rate is {tpr:.3}, The False Positive Rate is {fpr:.3},
    The True Negative Rate is {tnr:.3}, and the False Negative Rate is {fnr:.3}
    ''')
    return class_report

In [59]:
# call our function
report_df = get_metrics_binary(rf)


    The accuracy for our model is 0.9659
    The True Positive Rate is 0.691, The False Positive Rate is 0.0684,
    The True Negative Rate is 0.932, and the False Negative Rate is 0.309
    


#### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.

In [50]:
# rerunning random forest
rf1 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=7, 
                            random_state=123)

In [51]:
rf1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [52]:
y_pred = rf1.predict(X_train)

In [53]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf1.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.84


In [54]:
print(confusion_matrix(y_train, y_pred,
                      labels = [0, 1]))

[[286  21]
 [ 59 132]]


In [55]:
# classification report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       307
           1       0.86      0.69      0.77       191

    accuracy                           0.84       498
   macro avg       0.85      0.81      0.82       498
weighted avg       0.84      0.84      0.84       498



#### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

In [57]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))
print('Accuracy of random forest classifier1 on training set: {:.2f}'
     .format(rf1.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97
Accuracy of random forest classifier1 on training set: 0.84


#### After making a few models, which one has the best performance (or closest metrics) on both train and validate?