# Model Exercises

## Curiculum Model - Logistic Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt

import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic

df = get_titanic_data()
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [2]:
# Handle missing values in the `age` column.
df.dropna(inplace=True)

In [3]:
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 127 entries, 123 to 540
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  127 non-null    int64  
 1   age     127 non-null    float64
 2   fare    127 non-null    float64
 3   sibsp   127 non-null    int64  
 4   parch   127 non-null    int64  
dtypes: float64(2), int64(3)
memory usage: 6.0 KB


In [4]:
# from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

In [5]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
# Print the coefficients and intercept of the model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[1.30411374e-02 8.72240193e-05 1.53779647e-02 5.48610411e-03
  1.65371660e-03]]
Intercept: 
 [0.00655794]


In [7]:
# Estimate whether or not a passenger would survive, using the training data
y_pred = logit.predict(X_train)

In [8]:
# Estimate the probability of a passenger surviving, using the training data
y_pred_proba = logit.predict_proba(X_train)

In [9]:
# Compute the accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.64


In [10]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

[[ 0 46]
 [ 0 81]]


In [11]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        46
           1       0.64      1.00      0.78        81

    accuracy                           0.64       127
   macro avg       0.32      0.50      0.39       127
weighted avg       0.41      0.64      0.50       127



Curiculum model = 64% accuracy

### My Baseline calculation

In [12]:
# split df
tdf = get_titanic_data()
tdf.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [13]:
train, validate, test = prep_titanic(tdf)

In [14]:
print(train.shape, validate.shape, test.shape)

(497, 10) (214, 10) (178, 10)


In [15]:
train.survived.mean()

0.3822937625754527

In [16]:
train.survived.value_counts()

0    307
1    190
Name: survived, dtype: int64

In [17]:
# died is the majority response - requires human intervention, but gives same result as Ryan's
# positive case = died
my_baseline_accuracy = 307/(307+190)
my_baseline_accuracy

0.6177062374245473

In [18]:
# Ryan's method - can be automated to function
train['baseline_prediction'] = 0
pd.crosstab(train.baseline_prediction, train.survived)

survived,0,1
baseline_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
0,307,190


In [19]:
baseline_accuracy = (train.baseline_prediction == train.survived).mean()
baseline_accuracy

0.6177062374245473

Baseline accuracy = 62%

#### 1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [20]:
# understand the question to mean: create a model that has age, fare, and pclass as only features
logit = LogisticRegression()

In [21]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,baseline_prediction
583,0,1,36.0,0,0,40.125,1,1,0,0,0
337,1,1,41.0,0,0,134.5,1,0,0,0,0
50,0,3,7.0,4,1,39.6875,0,1,0,1,0
218,1,1,32.0,0,0,76.2917,1,0,0,0,0
31,1,1,28.0,1,0,146.5208,0,0,0,0,0


In [22]:
# X_train = train.drop(columns=['low_tip_target'])
# y_train = train.low_tip_target

# X_validate = validate.drop(columns=['low_tip_target'])
# y_validate = validate.low_tip_target

# X_test = test.drop(columns=['low_tip_target'])
# y_test = test.low_tip_target

X_train_afp = train.drop(columns=['baseline_prediction', 'survived', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_afp = train.survived

X_validate_afp = validate.drop(columns=['survived', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_afp = validate.survived

X_test_afp = test.drop(columns=['survived', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_afp = test.survived

KeyError: "['embark_town'] not found in axis"

In [None]:
X_train_afp.head()

In [None]:
y_train_afp.head()

In [None]:
# Now fit to X_train, y_train for the attributes age, fare, pclass only
logit_afp = logit.fit(X_train_afp, y_train_afp)

In [None]:
print(logit_afp.coef_)


print(logit_afp.intercept_)

In [None]:
X_train_afp.columns

In [None]:
# Predict values on X_train.
y_pred_afp = logit_afp.predict(X_train_afp)
y_pred_proba_afp = logit_afp.predict_proba(X_train_afp)

In [None]:
# model age, fare, pclass accuracy
logit_afp.score(X_train_afp, y_train_afp)

In [None]:
# confusion matrix
print(confusion_matrix(y_train_afp, y_pred_afp))

In [None]:
# classification report for Model afp
print(classification_report(y_train_afp, y_pred_afp))

This model using age, fare, and pclass only has a 71% accuracy rating. 
Age in this model was filled using imputed values.  

Accuracy:   
So this model performs better than the 61% baseline

#### 2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.


In [None]:
# understand the question to mean: create a model that has sex, age, fare, and pclass as features
logit = LogisticRegression()

In [None]:
train.head()

In [None]:
# X_train = train.drop(columns=['low_tip_target'])
# y_train = train.low_tip_target

# X_validate = validate.drop(columns=['low_tip_target'])
# y_validate = validate.low_tip_target

# X_test = test.drop(columns=['low_tip_target'])
# y_test = test.low_tip_target

X_train_safp = train.drop(columns=['baseline_prediction', 'survived', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_safp = train.survived

X_validate_safp = validate.drop(columns=['survived', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_safp = validate.survived

X_test_safp = test.drop(columns=['survived', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_safp = test.survived

In [None]:
X_train_safp.head()

In [None]:
# Now fit to X_train, y_train for the attributes age, fare, pclass only
logit_safp = logit.fit(X_train_safp, y_train_safp)

In [None]:
print(logit_safp.coef_)


print(logit_safp.intercept_)

In [None]:
X_train_safp.columns

In [None]:
# Predict values on X_train.
y_pred_safp = logit_safp.predict(X_train_safp)
y_pred_proba_safp = logit_safp.predict_proba(X_train_safp)

In [None]:
# model sex, age, fare, pclass accuracy
logit_safp.score(X_train_safp, y_train_safp)

This model using sex, age, fare, and pclass only has a 79% accuracy rating.  
Age in this model was filled using imputed values.  

Accuracy:   
So this model performs better than the 61% baseline and better than the model without sex which was 71%

#### 3. Try out other combinations of features and models.

In [None]:
logit = LogisticRegression()

In [None]:
train.head()

In [None]:
# Model pclass as only attribute
X_train_p = train.drop(columns=['baseline_prediction', 'survived', 'age', 'fare', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_p = train.survived

X_validate_p = validate.drop(columns=['survived', 'age', 'fare', 'sex_male',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_p = validate.survived

X_test_p = test.drop(columns=['survived', 'age', 'fare', 'sex_male',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_p = test.survived

In [None]:
# verify pclass is only attribute
X_train_p.head()

In [None]:
# Now fit to X_train, y_train for the attribute pclass only
logit_p = logit.fit(X_train_p, y_train_p)

In [None]:
print(logit_p.coef_)
print(logit_p.intercept_)

In [None]:
# Predict values on X_train.
y_pred_p = logit_p.predict(X_train_p)
y_pred_proba_p = logit_p.predict_proba(X_train_p)

In [None]:
# model sex, age, fare, pclass accuracy
logit_p.score(X_train_p, y_train_p)

This model using pclass only has a 68% accuracy rating.  
Accuracy:   
Baseline = 61%  
Age, Fare, pclass = 71%  
Sex, Age, Fare, pclass = 79%  
pclass = 68%  

In [None]:
# Model age as only attribute
X_train_a = train.drop(columns=['baseline_prediction', 'survived', 'pclass', 'fare', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_a = train.survived

X_validate_a = validate.drop(columns=['survived', 'pclass', 'fare', 'sex_male',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_a = validate.survived

X_test_a = test.drop(columns=['survived', 'pclass', 'fare', 'sex_male',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_a = test.survived

In [None]:
# verify age is only attribute
X_train_a.head()

In [None]:
# Now fit to X_train, y_train for the attribute age only
logit_a = logit.fit(X_train_a, y_train_a)

In [None]:
print(logit_a.coef_)
print(logit_a.intercept_)

In [None]:
# model age accuracy
logit_a.score(X_train_a, y_train_a)

This model using age only has a 61% accuracy rating. Which matches the baseline.  
Age in this model was filled using imputed values.  

Accuracy:   
Baseline = 61%  
Age, Fare, pclass = 71%  
Sex, Age, Fare, pclass = 79%  
pclass = 68%    
Age = 61%  

In [None]:
# Model sex as only attribute
X_train_s = train.drop(columns=['baseline_prediction', 'survived', 'pclass', 'fare', 'age', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_s = train.survived

X_validate_s = validate.drop(columns=['survived', 'pclass', 'fare', 'age',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_s = validate.survived

X_test_s = test.drop(columns=['survived', 'pclass', 'fare', 'age',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_s = test.survived

In [None]:
# verify sex_male is only attribute
X_train_s.head()

In [None]:
# Now fit to X_train, y_train for the attribute sex_male only
logit_s = logit.fit(X_train_s, y_train_s)

In [None]:
print(logit_s.coef_)
print(logit_s.intercept_)

In [None]:
# model sex_male accuracy
logit_s.score(X_train_s, y_train_s)

This model using sex_male only has a 78% accuracy rating.  
Accuracy:   
Baseline = 61%  
Age, Fare, pclass = 71%  
Sex, Age, Fare, pclass = 79%  
pclass = 68%    
Age = 61%  
sex_male = 78%  

In [None]:
# Model alone as only attribute
X_train_al = train.drop(columns=['baseline_prediction', 'survived', 'pclass', 'fare', 'age', 'sex_male', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_al = train.survived

X_validate_al = validate.drop(columns=['survived', 'pclass', 'fare', 'age',  'sex_male', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_al = validate.survived

X_test_al = test.drop(columns=['survived', 'pclass', 'fare', 'age',  'sex_male', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_al = test.survived

In [None]:
# verify alone is only attribute
X_train_al.head()

In [None]:
# Now fit to X_train, y_train for the attribute alone only
logit_al = logit.fit(X_train_al, y_train_al)

In [None]:
print(logit_al.coef_)
print(logit_al.intercept_)

In [None]:
# model alone accuracy
logit_al.score(X_train_al, y_train_al)

This model using alone only has a 64% accuracy rating.  
Accuracy:   
Baseline = 61%  
Age, Fare, pclass = 71%  
Sex, Age, Fare, pclass = 79%  
pclass = 68%    
Age = 61%  
sex_male = 78%  
alone = 64%  

#### 4. Choose you best model and evaluate it on the test dataset. Is it overfit?

In [None]:
# editing this question to add validate step. Validate on 2 best models = sex_male only and sex, age, fare, pclass

In [None]:
# model sex, age, fare, pclass validate data
print("model_safp\n", logit_safp.score(X_validate_safp, y_validate_safp))

In [None]:
# model sex_male validate accuracy
logit_s.score(X_validate_s, y_validate_s)

Base on perfomance on the validate data, conclude model with sex, age, fare, and pclass performs the best.  
Run that on the test data

In [None]:
# model sex, age, fare, pclass validate data
print("model_safp\n", logit_safp.score(X_test_safp, y_test_safp))

The accuracy for this model is 80% on the test data.

#### 5. Bonus How do different strategies for handling the missing values in the age column affect model performance?

#### 6. Bonus: How do different strategies for encoding sex affect model performance?

#### 7. Bonus: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

C
=
.01
,
.1
,
1
,
10
,
100
,
1000


#### Bonus Bonus: how does scaling the data interact with your choice of C?

## Decission Tree model exercises

In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic


In [24]:
# split df
tdf = get_titanic_data()
tdf.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [25]:
train, validate, test = prep_titanic(tdf)
print(train.shape, validate.shape, test.shape)

(497, 10) (214, 10) (178, 10)


In [26]:
# Baseline accuracy determination would be the same as logistic regression baseline -- correct?
train.survived.value_counts(normalize=True)

0    0.617706
1    0.382294
Name: survived, dtype: float64

In [27]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton
583,0,1,36.0,0,0,40.125,1,1,0,0
337,1,1,41.0,0,0,134.5,1,0,0,0
50,0,3,7.0,4,1,39.6875,0,1,0,1
218,1,1,32.0,0,0,76.2917,1,0,0,0
31,1,1,28.0,1,0,146.5208,0,0,0,0


In [28]:
# split X and y
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

#### 1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [29]:
# create the decission tree object
# per lesson reccomended to use max_depth=3 for 1st model
clf1 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [30]:
# fit the model
clf1.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

#### 2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [31]:
# get predicted y values and probabilities
y_pred1 = clf1.predict(X_train)
y_pred_proba1 = clf1.predict_proba(X_train)

In [32]:
# get accuracy score
clf1.score(X_train, y_train)

0.8189134808853119

In [33]:
# get confusion matrix
confusion_matrix(y_train, y_pred1)

array([[279,  28],
       [ 62, 128]])

In [34]:
# better visual of confusion matrix as dataframe
# 0: died, 1: survived
labels = sorted(y_train.unique())

matrix1 = pd.DataFrame(confusion_matrix(y_train, y_pred1), index=labels, columns=labels)
matrix1

Unnamed: 0,0,1
0,279,28
1,62,128


In [35]:
print(classification_report(y_train, y_pred1))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497



#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [36]:
print("Accuracy=", clf1.score(X_train, y_train))

Accuracy= 0.8189134808853119


In [37]:
print("Confusion Matrix Model 1\nPostive=Died")
matrix1 = matrix1.rename(columns={0: 'Died', 1: 'Survived'})
matrix1 = matrix1.rename(index={0: 'Died', 1: 'Survived'})
matrix1

Confusion Matrix Model 1
Postive=Died


Unnamed: 0,Died,Survived
Died,279,28
Survived,62,128


In [38]:
print("True Positive=", matrix1.Died[0])
print("True Negative=", matrix1.Survived[1])
print("False Positive=", matrix1.Died[1])
print("False Negative=", matrix1.Survived[1])

True Positive= 279
True Negative= 128
False Positive= 62
False Negative= 128


In [39]:
print("Classification Report Model 1")
print(classification_report(y_train, y_pred1))

Classification Report Model 1
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       307
           1       0.82      0.67      0.74       190

    accuracy                           0.82       497
   macro avg       0.82      0.79      0.80       497
weighted avg       0.82      0.82      0.81       497



#### 4. Run through steps 2-4 using a different max_depth value.

In [40]:
# for 2nd model will use max_depth=10
# create the decission tree object
clf2 = DecisionTreeClassifier(max_depth=10, random_state=123)

In [41]:
# fit the model
clf2.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [42]:
# get predicted y values and probabilities
y_pred2 = clf2.predict(X_train)
y_pred_proba2 = clf2.predict_proba(X_train)

In [43]:
# get confusion matrix
model2cm = confusion_matrix(y_train, y_pred2)
model2cm

array([[297,  10],
       [ 22, 168]])

In [44]:
# better visual of confusion matrix as dataframe
# 0: died, 1: survived
labels = sorted(y_train.unique())

matrix2 = pd.DataFrame(confusion_matrix(y_train, y_pred2), index=labels, columns=labels)
print("Confusion Matrix Model 2\nPostive=Died")
matrix2 = matrix2.rename(columns={0: 'Died', 1: 'Survived'})
matrix2 = matrix2.rename(index={0: 'Died', 1: 'Survived'})
matrix2

Confusion Matrix Model 2
Postive=Died


Unnamed: 0,Died,Survived
Died,297,10
Survived,22,168


In [45]:
print("True Positive=", matrix2.Died[0])
print("True Negative=", matrix2.Survived[1])
print("False Positive=", matrix2.Died[1])
print("False Negative=", matrix2.Survived[1])

True Positive= 297
True Negative= 168
False Positive= 22
False Negative= 168


In [46]:
# get accuracy score
print("Model 2\nAccuracy=", clf2.score(X_train, y_train))

Model 2
Accuracy= 0.9356136820925554


In [47]:
print("Classification Report Model 2")
print(classification_report(y_train, y_pred2))

Classification Report Model 2
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       307
           1       0.94      0.88      0.91       190

    accuracy                           0.94       497
   macro avg       0.94      0.93      0.93       497
weighted avg       0.94      0.94      0.94       497



In [48]:
# for 3rd model will use max_depth=1
# create the decission tree object
clf3 = DecisionTreeClassifier(max_depth=1, random_state=123)

In [49]:
# fit the model
clf3.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [50]:
# get predicted y values and probabilities
y_pred3 = clf3.predict(X_train)
y_pred_proba3 = clf3.predict_proba(X_train)

In [51]:
# get confusion matrix
model3cm = confusion_matrix(y_train, y_pred3)
model3cm

array([[265,  42],
       [ 65, 125]])

In [52]:
# better visual of confusion matrix as dataframe
# 0: died, 1: survived
labels = sorted(y_train.unique())

matrix3 = pd.DataFrame(confusion_matrix(y_train, y_pred3), index=labels, columns=labels)
print("Confusion Matrix Model 3\nPostive=Died")
matrix3 = matrix3.rename(columns={0: 'Died', 1: 'Survived'})
matrix3 = matrix3.rename(index={0: 'Died', 1: 'Survived'})
matrix3

Confusion Matrix Model 3
Postive=Died


Unnamed: 0,Died,Survived
Died,265,42
Survived,65,125


In [53]:
# get accuracy score
print("Model 3\nAccuracy=", clf3.score(X_train, y_train))

Model 3
Accuracy= 0.7847082494969819


In [54]:
print("Classification Report Model 3")
print(classification_report(y_train, y_pred3))

Classification Report Model 3
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       307
           1       0.75      0.66      0.70       190

    accuracy                           0.78       497
   macro avg       0.78      0.76      0.77       497
weighted avg       0.78      0.78      0.78       497



#### 5. Which performs better on your in-sample data?

Model 2 with a max_depth=10 has the highest accuracy, however, it is probably overfit.  
Will test Model 1 and 2 both on validate

In [55]:
# Model 1 fit validate
# fit the model
clf1 = clf1.fit(X_validate, y_validate)
# get accuracy score
print("Model 1\nAccuracy=", clf1.score(X_validate, y_validate))

Model 1
Accuracy= 0.8317757009345794


In [58]:
# Model 2 fit validate
# fit the model
clf2v = clf2.fit(X_validate, y_validate)
# get accuracy score
print("Model 2\nAccuracy=", clf2v.score(X_validate, y_validate))

Model 2
Accuracy= 0.9813084112149533


In [59]:
# get predicted y values and probabilities
y_pred2v = clf2v.predict(X_validate)
y_pred_proba2v = clf2v.predict_proba(X_validate)

In [60]:
# get confusion matrix
model2cmv = confusion_matrix(y_validate, y_pred2v)
model2cmv

array([[130,   2],
       [  2,  80]])

In [61]:
# Model 2 fit test
# fit the model
clf2t = clf2.fit(X_test, y_test)
# get accuracy score
print("Model 2\nAccuracy=", clf2t.score(X_test, y_test))

Model 2
Accuracy= 0.9887640449438202


In [62]:
# get predicted y values and probabilities
y_pred2t = clf2t.predict(X_test)
y_pred_proba2t = clf2t.predict_proba(X_test)

In [63]:
# get confusion matrix
model2cmt = confusion_matrix(y_test, y_pred2t)
model2cmt

array([[110,   0],
       [  2,  66]])

Results indicate best performing model is with a max_depth=10 and that this model is not over fit.