In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from prepare import prep_titanic, prep_telco, train_val_test

In [2]:
accuracy = '(TP + TN) /	(TP + TN + FP + FN)'
recall = 'TP / (TP + FN)'
true_positive_rate = 'TP / (TP + FN)'
false_positive_rate = 'FP / (FP + TN)'
true_negative_rate = 'TN / (TN + FP)'
false_negative_rate = 'FN / (FN + TP)'
precision = 'TP / (TP + FP)'
f1_score = '2 * (precision * recall) / (precision + recall)'
support = 'TP + FN'

# Create a pandas DataFrame for the metrics
data = {
    'Metric': ['Accuracy', 'Recall', 'True positive Rate' ,'False Positive Rate', 'True Negative Rate', 'False Negative Rate', 'Precision', 'F1-Score', 'Support'],
    'Value': [accuracy, recall, true_positive_rate ,false_positive_rate, true_negative_rate, false_negative_rate, precision, f1_score, support]
}

metrics = pd.DataFrame(data)

In [3]:
data = {
    '': ['Actual Positive', 'Actual Negative'] ,
    'Predicted Positive': ['True Positive (TP)', 'False Positive (FP)'],
    'Predicted Negative': ['False Negative (FN)', 'True Negative (TN)'], 
}

rubric = pd.DataFrame(data)
rubric.set_index('', inplace=True)

# Decision Tree

1) What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

### Titanic

In [4]:
#loading the titanic dataframe

df = prep_titanic()
df.head()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,Southampton,0
1,1,female,38.0,1,0,71.2833,First,Cherbourg,0
2,1,female,26.0,0,0,7.925,Third,Southampton,1
3,1,female,35.0,1,0,53.1,First,Southampton,0
4,0,male,35.0,0,0,8.05,Third,Southampton,1


In [5]:
# splitting the titanic dataframe into training, validation, and test sets.

train, val, test = train_val_test(df, strat = 'survived')

In [6]:
train.head()

Unnamed: 0,survived,sex,age,sibsp,parch,fare,class,embark_town,alone
748,0,male,19.0,1,0,53.1,First,Southampton,0
45,0,male,28.0,0,0,8.05,Third,Southampton,1
28,1,female,28.0,0,0,7.8792,Third,Queenstown,1
633,0,male,28.0,0,0,0.0,First,Southampton,1
403,0,male,28.0,1,0,15.85,Third,Southampton,0


In [7]:
#encoding the sex, class and embark categorical columns in the train set.

train = pd.get_dummies(train, columns = ['sex', 'class', 'embark_town'])
train = train.drop(columns = 'sex_male')
train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_female,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,0,19.0,1,0,53.1,0,False,True,False,False,False,False,True
45,0,28.0,0,0,8.05,1,False,False,False,True,False,False,True
28,1,28.0,0,0,7.8792,1,True,False,False,True,False,True,False
633,0,28.0,0,0,0.0,1,False,True,False,False,False,False,True
403,0,28.0,1,0,15.85,0,False,False,False,True,False,False,True


In [8]:
#encoding the sex, class and embark categorical columns in the val set

val = pd.get_dummies(val, columns = ['sex', 'class', 'embark_town'])
val = val.drop(columns = 'sex_male')
val.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_female,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
377,0,27.0,0,2,211.5,0,False,True,False,False,True,False,False
244,0,30.0,0,0,7.225,1,False,False,False,True,True,False,False
72,0,21.0,0,0,73.5,1,False,False,True,False,False,False,True
815,0,28.0,0,0,0.0,1,False,True,False,False,False,False,True
841,0,16.0,0,0,10.5,1,False,False,True,False,False,False,True


In [9]:
#making sure train and val all have the same number of columns

train.shape, val.shape 

((623, 13), (134, 13))

In [10]:
# creating an X & y version of train and val, where y is a series with just the target variable and X are all the features.#

X_train = train.drop(columns = ['survived'])
y_train = train.survived

X_val = val.drop(columns = ['survived'])
y_val = val.survived

In [11]:
# creating a baseline of the most occuring number in the survived column

y_train.mode()

0    0
Name: survived, dtype: int64

In [12]:
base_acc = (y_train == 0).mean()
base_acc

0.6163723916532905

baseline accuracy is 61%.

2) Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [13]:
#verifying that x and y train have the same amount of rows

X_train.shape, y_train.shape

((623, 12), (623,))

In [14]:
#creating a model and fitting it to our train set

clf = DecisionTreeClassifier(max_depth = 4, random_state = 42)
clf.fit(X_train, y_train)

In [15]:
clf.feature_importances_

array([0.13486812, 0.01215573, 0.00091563, 0.08339922, 0.        ,
       0.596436  , 0.05193642, 0.        , 0.12028888, 0.        ,
       0.        , 0.        ])

In [16]:
#making predictions on X_train with the .predict() method.

t_pred = clf.predict(X_train)
t_pred[:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0])

In [17]:
#making a probability estimation on X_train with the .predict_proba() method

t_prob = clf.predict_proba(X_train)
t_prob[:10]

array([[0.59722222, 0.40277778],
       [0.89438944, 0.10561056],
       [0.16129032, 0.83870968],
       [1.        , 0.        ],
       [0.89438944, 0.10561056],
       [0.        , 1.        ],
       [0.59722222, 0.40277778],
       [0.89438944, 0.10561056],
       [0.59722222, 0.40277778],
       [0.89438944, 0.10561056]])

In [18]:
#evaluating the accuracy of the train model

clf.score(X_train, y_train)

0.8459069020866774

The accuracy on the train model is 98%

3) Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [19]:
#using the .score() method to predict the y_train

clf.score(X_train, y_train)

0.8459069020866774

In [20]:
#creating a crosstab for the val confusion matrix

v_pred = clf.predict(X_val)
val_ct = pd.crosstab(y_val, v_pred)
val_ct

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,79,3
1,19,33


In [21]:
#creating a crosstab for the train confusion matrix

train_ct = pd.crosstab(y_train, t_pred)
train_ct

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,371,13
1,83,156


In [22]:
371 / (371 + 13)

0.9661458333333334

#confusion matrix

accuracy = (371 + 156) / (371 + 156 + 83 + 13)  
ans=> 85

precision = 371 / (371 + 83)  
ans=> 82

recall = 371 / (371 + 13)  
ans=> 97

In [23]:
#classification report

print(classification_report(y_train, t_pred))

              precision    recall  f1-score   support

           0       0.82      0.97      0.89       384
           1       0.92      0.65      0.76       239

    accuracy                           0.85       623
   macro avg       0.87      0.81      0.83       623
weighted avg       0.86      0.85      0.84       623



4) Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [24]:
#creating a crosstab for the metrics below

train_ct

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,371,13
1,83,156


#### train

In [25]:
metrics

Unnamed: 0,Metric,Value
0,Accuracy,(TP + TN) /\t(TP + TN + FP + FN)
1,Recall,TP / (TP + FN)
2,True positive Rate,TP / (TP + FN)
3,False Positive Rate,FP / (FP + TN)
4,True Negative Rate,TN / (TN + FP)
5,False Negative Rate,FN / (FN + TP)
6,Precision,TP / (TP + FP)
7,F1-Score,2 * (precision * recall) / (precision + recall)
8,Support,TP + FN


In [26]:
rubric

Unnamed: 0,Predicted Positive,Predicted Negative
,,
Actual Positive,True Positive (TP),False Negative (FN)
Actual Negative,False Positive (FP),True Negative (TN)


In [27]:
# accuracy

t_acc = (371 + 156) / (371 + 156 + 83 + 13)
t_acc

0.8459069020866774

In [28]:
#true positive rate

t_tpr = 371 / (371 + 13)
t_tpr

0.9661458333333334

In [29]:
#false positive rate

fpr = 83 / (83 + 156)
fpr

0.3472803347280335

In [30]:
#true negative rate

tnr = 156 / (156 + 83)
tnr

0.6527196652719666

In [31]:
#false negative rate

fnr = 13 / (13 + 371)
fnr

0.033854166666666664

In [32]:
#precision

pre = 371 / (371 + 83)
pre

0.8171806167400881

In [33]:
#recall

rec = 371 / (371 + 83)
rec

0.8171806167400881

In [34]:
#F1 score

F1 = 2 * (pre * rec) / (pre + rec)
F1

0.817180616740088

In [35]:
#support

371 + 13

384

5) Run through steps 2-4 using a different max_depth value.

In [36]:
seed = 42
train_acc = []
val_acc = []
depth = []

for i in range(2, 20):

    clf = DecisionTreeClassifier(max_depth = i, random_state = seed)

    clf.fit(X_train, y_train)

    train_acc.append(clf.score(X_train, y_train))

    val_acc.append(clf.score(X_val, y_val))

    depth.append(i)

In [37]:
pd.DataFrame({'max_depth' : depth, 'train_acc' : train_acc, 'val_acc' : val_acc})

Unnamed: 0,max_depth,train_acc,val_acc
0,2,0.807384,0.798507
1,3,0.829856,0.835821
2,4,0.845907,0.835821
3,5,0.863563,0.820896
4,6,0.879615,0.820896
5,7,0.894061,0.798507
6,8,0.913323,0.798507
7,9,0.922953,0.813433
8,10,0.934189,0.798507
9,11,0.948636,0.80597


In [38]:
seed = 42
train_acc = []
val_acc = []
depth = []

for i in range(2, 10):

    clf = DecisionTreeClassifier(max_depth = i, random_state = seed)

    clf.fit(X_train, y_train)

    train_acc.append(clf.score(X_train, y_train))

    val_acc.append(clf.score(X_val, y_val))

    depth.append(i)

In [39]:
pd.DataFrame({'max_depth' : depth, 'train_acc' : train_acc, 'val_acc' : val_acc})

Unnamed: 0,max_depth,train_acc,val_acc
0,2,0.807384,0.798507
1,3,0.829856,0.835821
2,4,0.845907,0.835821
3,5,0.863563,0.820896
4,6,0.879615,0.820896
5,7,0.894061,0.798507
6,8,0.913323,0.798507
7,9,0.922953,0.813433


6) Which model performs better on your in-sample data?

Recall model works best for the in-sample data!

7) Which model performs best on your out-of-sample data, the validate set?

In [40]:
val_ct

col_0,0,1
survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,79,3
1,19,33


#### validate

In [41]:
rubric

Unnamed: 0,Predicted Positive,Predicted Negative
,,
Actual Positive,True Positive (TP),False Negative (FN)
Actual Negative,False Positive (FP),True Negative (TN)


In [42]:
metrics

Unnamed: 0,Metric,Value
0,Accuracy,(TP + TN) /\t(TP + TN + FP + FN)
1,Recall,TP / (TP + FN)
2,True positive Rate,TP / (TP + FN)
3,False Positive Rate,FP / (FP + TN)
4,True Negative Rate,TN / (TN + FP)
5,False Negative Rate,FN / (FN + TP)
6,Precision,TP / (TP + FP)
7,F1-Score,2 * (precision * recall) / (precision + recall)
8,Support,TP + FN


In [43]:
#accuracy

v_acc = (687 + 158) / (687 + 158 + 122 + 89)
v_acc

0.8001893939393939

In [44]:
#precision

v_pre = 687 / (687 + 122)
v_pre

0.8491965389369592

In [45]:
#recall

v_rec = 687 / (687 + 89)
v_rec

0.8853092783505154

Recall works best for the out of sample data!

# Telco

1) What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [46]:
telco_df = prep_telco()
telco_df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,Male,0,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,Female,1,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [47]:
train, val, test = train_val_test(telco_df, strat = 'churn')

In [48]:
#encoding the categorical columns in the train set.

train = pd.get_dummies(train, columns = ['gender', 'contract_type', 'internet_service_type', 'payment_type',
                                        'partner', 'dependents', 'phone_service', 'multiple_lines',
                                         'online_security', 'online_backup', 'device_protection', 'churn',
                                         'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing'],
                       drop_first = True)

train.columns = train.columns.str.replace(' ', '_')
train = train.rename(columns = {'churn_Yes' : 'churn'} )
train.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,contract_type_One_year,contract_type_Two_year,internet_service_type_Fiber_optic,payment_type_Credit_card_(automatic),payment_type_Electronic_check,...,device_protection_No_internet_service,device_protection_Yes,churn,tech_support_No_internet_service,tech_support_Yes,streaming_tv_No_internet_service,streaming_tv_Yes,streaming_movies_No_internet_service,streaming_movies_Yes,paperless_billing_Yes
5609,0,14,76.45,1117.55,True,False,False,True,False,True,...,False,True,False,False,False,False,False,False,False,False
2209,0,5,70.0,347.4,True,True,False,False,False,False,...,False,True,True,False,False,False,True,False,True,True
6919,0,35,75.2,2576.2,True,False,False,True,False,True,...,False,False,True,False,False,False,False,False,False,True
2284,0,58,86.1,4890.5,True,False,True,False,False,True,...,False,True,False,False,False,False,True,False,True,True
845,0,2,49.6,114.7,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True


In [49]:
#encoding the categorical columns in the val set.

val = pd.get_dummies(val, columns = ['gender', 'contract_type', 'internet_service_type', 'payment_type',
                                        'partner', 'dependents', 'phone_service', 'multiple_lines',
                                         'online_security', 'online_backup', 'device_protection', 'churn',
                                         'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing'], 
                     drop_first = True)

val.columns = val.columns.str.replace(' ', '_')
val = val.rename(columns = {'churn_Yes' : 'churn'} )
val.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,contract_type_One_year,contract_type_Two_year,internet_service_type_Fiber_optic,payment_type_Credit_card_(automatic),payment_type_Electronic_check,...,device_protection_No_internet_service,device_protection_Yes,churn,tech_support_No_internet_service,tech_support_Yes,streaming_tv_No_internet_service,streaming_tv_Yes,streaming_movies_No_internet_service,streaming_movies_Yes,paperless_billing_Yes
6910,0,46,20.2,845.6,True,True,False,False,False,False,...,True,False,False,True,False,True,False,True,False,True
6044,0,40,106.0,4178.65,True,True,False,True,False,False,...,False,True,False,False,False,False,True,False,True,True
2153,0,53,25.55,1336.1,True,False,True,False,True,False,...,True,False,False,True,False,True,False,True,False,False
2089,0,70,40.05,2799.75,True,False,True,False,False,False,...,False,False,False,False,True,False,False,False,True,True
6393,0,3,96.6,291.9,False,False,False,True,False,True,...,False,False,True,False,False,False,True,False,True,True


In [50]:
# Convert boolean columns to numeric
bool_columns = ['gender_Male', 'contract_type_One_year', 'contract_type_Two_year', 'internet_service_type_Fiber_optic',
                'payment_type_Credit_card_(automatic)', 'payment_type_Electronic_check',
                'device_protection_No_internet_service', 'device_protection_Yes',
                'tech_support_No_internet_service', 'tech_support_Yes',
                'streaming_tv_No_internet_service', 'streaming_tv_Yes',
                'streaming_movies_No_internet_service', 'streaming_movies_Yes',
                'paperless_billing_Yes']

train[bool_columns] = train[bool_columns].astype(int)

# Now you can proceed with creating and fitting the model
train.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,contract_type_One_year,contract_type_Two_year,internet_service_type_Fiber_optic,payment_type_Credit_card_(automatic),payment_type_Electronic_check,...,device_protection_No_internet_service,device_protection_Yes,churn,tech_support_No_internet_service,tech_support_Yes,streaming_tv_No_internet_service,streaming_tv_Yes,streaming_movies_No_internet_service,streaming_movies_Yes,paperless_billing_Yes
5609,0,14,76.45,1117.55,1,0,0,1,0,1,...,0,1,False,0,0,0,0,0,0,0
2209,0,5,70.0,347.4,1,1,0,0,0,0,...,0,1,True,0,0,0,1,0,1,1
6919,0,35,75.2,2576.2,1,0,0,1,0,1,...,0,0,True,0,0,0,0,0,0,1
2284,0,58,86.1,4890.5,1,0,1,0,0,1,...,0,1,False,0,0,0,1,0,1,1
845,0,2,49.6,114.7,0,0,0,0,0,0,...,0,0,True,0,0,0,0,0,0,1


In [51]:
# Convert boolean columns to numeric
bool_columns = ['gender_Male', 'contract_type_One_year', 'contract_type_Two_year', 'internet_service_type_Fiber_optic',
                'payment_type_Credit_card_(automatic)', 'payment_type_Electronic_check',
                'device_protection_No_internet_service', 'device_protection_Yes',
                'tech_support_No_internet_service', 'tech_support_Yes',
                'streaming_tv_No_internet_service', 'streaming_tv_Yes',
                'streaming_movies_No_internet_service', 'streaming_movies_Yes',
                'paperless_billing_Yes']

val[bool_columns] = val[bool_columns].astype(int)

# Now you can proceed with creating and fitting the model
val.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,gender_Male,contract_type_One_year,contract_type_Two_year,internet_service_type_Fiber_optic,payment_type_Credit_card_(automatic),payment_type_Electronic_check,...,device_protection_No_internet_service,device_protection_Yes,churn,tech_support_No_internet_service,tech_support_Yes,streaming_tv_No_internet_service,streaming_tv_Yes,streaming_movies_No_internet_service,streaming_movies_Yes,paperless_billing_Yes
6910,0,46,20.2,845.6,1,1,0,0,0,0,...,1,0,False,1,0,1,0,1,0,1
6044,0,40,106.0,4178.65,1,1,0,1,0,0,...,0,1,False,0,0,0,1,0,1,1
2153,0,53,25.55,1336.1,1,0,1,0,1,0,...,1,0,False,1,0,1,0,1,0,0
2089,0,70,40.05,2799.75,1,0,1,0,0,0,...,0,0,False,0,1,0,0,0,1,1
6393,0,3,96.6,291.9,0,0,0,1,0,1,...,0,0,True,0,0,0,1,0,1,1


In [52]:
train['total_charges'] = np.where(train['total_charges'].str.strip() == '', 0, train['total_charges'])

In [53]:
val['total_charges'] = np.where(val['total_charges'].str.strip() == '', 0, val['total_charges'])

In [54]:
#making sure train and val all have the same number of columns

train.shape, val.shape 

((4930, 30), (1056, 30))

In [55]:
# creating an X & y version of train and val, where y is a series with just the target variable and X are all the features.

X_train = train.drop(columns = ['churn'])
y_train = train.churn

X_val = val.drop(columns = ['churn'])
y_val = val.churn

In [56]:
# creating a baseline of the most occuring number in the survived column

y_train.mode()

0    False
Name: churn, dtype: bool

In [57]:
base_acc = (y_train == 0).mean()
base_acc

0.734685598377282

baseline accuracy is 73%.

2) Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [58]:
#verifying that x and y train have the same amount of rows

X_train.shape, y_train.shape

((4930, 29), (4930,))

In [59]:
clf = DecisionTreeClassifier(max_depth  = 4, random_state = 42)
clf.fit(X_train, y_train)

In [60]:
clf.feature_importances_

array([0.0056141 , 0.4988502 , 0.01293116, 0.02996284, 0.        ,
       0.02930254, 0.00877762, 0.35879748, 0.        , 0.01212779,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.00639083, 0.        ,
       0.        , 0.03724543, 0.        , 0.        ])

In [61]:
#making predictions on X_train with the .predict() method.

t_pred = clf.predict(X_train)
t_pred[:10]

array([ True,  True, False, False,  True, False, False, False, False,
       False])

In [62]:
#making a probability estimation on X_train with the .predict_proba() method

t_prob = clf.predict_proba(X_train)
t_prob[:10]

array([[0.46616541, 0.53383459],
       [0.48275862, 0.51724138],
       [0.57404326, 0.42595674],
       [0.92857143, 0.07142857],
       [0.48275862, 0.51724138],
       [0.81724846, 0.18275154],
       [0.57404326, 0.42595674],
       [0.87403599, 0.12596401],
       [0.98803191, 0.01196809],
       [0.98803191, 0.01196809]])

In [63]:
#evaluating the accuracy of the train model

clf.score(X_train, y_train)

0.7939148073022312

In [64]:
clf.score(X_val, y_val)

0.8001893939393939

3) Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [65]:
#creating a crosstab for the val confusion matrix

v_pred = clf.predict(X_val)
val_ct = pd.crosstab(y_val, v_pred)
val_ct

col_0,False,True
churn,Unnamed: 1_level_1,Unnamed: 2_level_1
False,687,89
True,122,158


In [66]:
#creating a crosstab for the train confusion matrix

train_ct = pd.crosstab(y_train, t_pred)
train_ct

col_0,False,True
churn,Unnamed: 1_level_1,Unnamed: 2_level_1
False,3218,404
True,612,696


#confusion matrix

accuracy = (3621 + 1298) / (3621 + 1298 + 10 + 1)  
ans=> 99.8%

precision = 3621 / (3621 + 10)  
ans=> 99.7%

recall = 3621 / (3621 + 1)  
ans=> 99.9%

In [67]:
3621 / (3621 + 1)

0.999723909442297

In [68]:
#classification report

print(classification_report(y_train, t_pred))

              precision    recall  f1-score   support

       False       0.84      0.89      0.86      3622
        True       0.63      0.53      0.58      1308

    accuracy                           0.79      4930
   macro avg       0.74      0.71      0.72      4930
weighted avg       0.79      0.79      0.79      4930



5) Run through steps 2-4 using a different max_depth value.

In [69]:
seed = 42
train_acc = []
val_acc = []
depth = []

for i in range(2, 8):

    clf = DecisionTreeClassifier(max_depth = i, random_state = seed)

    clf.fit(X_train, y_train)

    train_acc.append(clf.score(X_train, y_train))

    val_acc.append(clf.score(X_val, y_val))

    depth.append(i)

In [70]:
pd.DataFrame({'max_depth' : depth, 'train_acc' : train_acc, 'val_acc' : val_acc})

Unnamed: 0,max_depth,train_acc,val_acc
0,2,0.791278,0.797348
1,3,0.791278,0.797348
2,4,0.793915,0.800189
3,5,0.801014,0.803977
4,6,0.815416,0.801136
5,7,0.829412,0.786932


In [71]:
seed = 42
train_acc = []
val_acc = []
depth = []

for i in range(2, 15):

    clf = DecisionTreeClassifier(max_depth = i, random_state = seed)

    clf.fit(X_train, y_train)

    train_acc.append(clf.score(X_train, y_train))

    val_acc.append(clf.score(X_val, y_val))

    depth.append(i)

In [72]:
pd.DataFrame({'max_depth' : depth, 'train_acc' : train_acc, 'val_acc' : val_acc})

Unnamed: 0,max_depth,train_acc,val_acc
0,2,0.791278,0.797348
1,3,0.791278,0.797348
2,4,0.793915,0.800189
3,5,0.801014,0.803977
4,6,0.815416,0.801136
5,7,0.829412,0.786932
6,8,0.839757,0.784091
7,9,0.859432,0.772727
8,10,0.882759,0.761364
9,11,0.902434,0.760417
