# Model Exercises

## Curiculum Model - Logistic Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt

import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic

df = get_titanic_data()
df.head()

In [None]:
# Handle missing values in the `age` column.
df.dropna(inplace=True)

In [None]:
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.info()

In [None]:
# from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

In [None]:
logit.fit(X_train, y_train)

In [None]:
# Print the coefficients and intercept of the model
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

In [None]:
# Estimate whether or not a passenger would survive, using the training data
y_pred = logit.predict(X_train)

In [None]:
# Estimate the probability of a passenger surviving, using the training data
y_pred_proba = logit.predict_proba(X_train)

In [None]:
# Compute the accuracy
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

In [None]:
# Create a confusion matrix
print(confusion_matrix(y_train, y_pred))

In [None]:
# Compute Precision, Recall, F1-score, and Support
print(classification_report(y_train, y_pred))

Curiculum model = 64% accuracy

### My Baseline calculation

In [None]:
# split df
tdf = get_titanic_data()
tdf.head()

In [None]:
train, validate, test = prep_titanic(tdf)

In [None]:
print(train.shape, validate.shape, test.shape)

In [None]:
train.survived.mean()

In [None]:
train.survived.value_counts()

In [None]:
# died is the majority response - requires human intervention, but gives same result as Ryan's
# positive case = died
my_baseline_accuracy = 307/(307+190)
my_baseline_accuracy

In [None]:
# Ryan's method - can be automated to function
train['baseline_prediction'] = 0
pd.crosstab(train.baseline_prediction, train.survived)

In [None]:
baseline_accuracy = (train.baseline_prediction == train.survived).mean()
baseline_accuracy

Baseline accuracy = 62%

#### 1. Create another model that includes age in addition to fare and pclass. Does this model perform better than your previous one?

In [None]:
# understand the question to mean: create a model that has age, fare, and pclass as only features
logit = LogisticRegression()

In [None]:
train.head()

In [None]:
# X_train = train.drop(columns=['low_tip_target'])
# y_train = train.low_tip_target

# X_validate = validate.drop(columns=['low_tip_target'])
# y_validate = validate.low_tip_target

# X_test = test.drop(columns=['low_tip_target'])
# y_test = test.low_tip_target

X_train_afp = train.drop(columns=['baseline_prediction', 'survived', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_afp = train.survived

X_validate_afp = validate.drop(columns=['survived', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_afp = validate.survived

X_test_afp = test.drop(columns=['survived', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_afp = test.survived

In [None]:
X_train_afp.head()

In [None]:
y_train_afp.head()

In [None]:
# Now fit to X_train, y_train for the attributes age, fare, pclass only
logit_afp = logit.fit(X_train_afp, y_train_afp)

In [None]:
print(logit_afp.coef_)


print(logit_afp.intercept_)

In [None]:
X_train_afp.columns

In [None]:
# Predict values on X_train.
y_pred_afp = logit_afp.predict(X_train_afp)
y_pred_proba_afp = logit_afp.predict_proba(X_train_afp)

In [None]:
# model age, fare, pclass accuracy
logit_afp.score(X_train_afp, y_train_afp)

In [None]:
# confusion matrix
print(confusion_matrix(y_train_afp, y_pred_afp))

In [None]:
# classification report for Model afp
print(classification_report(y_train_afp, y_pred_afp))

This model using age, fare, and pclass only has a 71% accuracy rating. 
Age in this model was filled using imputed values.  

Accuracy:   
So this model performs better than the 61% baseline

#### 2. Include sex in your model as well. Note that you'll need to encode this feature before including it in a model.


In [None]:
# understand the question to mean: create a model that has sex, age, fare, and pclass as features
logit = LogisticRegression()

In [None]:
train.head()

In [None]:
# X_train = train.drop(columns=['low_tip_target'])
# y_train = train.low_tip_target

# X_validate = validate.drop(columns=['low_tip_target'])
# y_validate = validate.low_tip_target

# X_test = test.drop(columns=['low_tip_target'])
# y_test = test.low_tip_target

X_train_safp = train.drop(columns=['baseline_prediction', 'survived', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_safp = train.survived

X_validate_safp = validate.drop(columns=['survived', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_safp = validate.survived

X_test_safp = test.drop(columns=['survived', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_safp = test.survived

In [None]:
X_train_safp.head()

In [None]:
# Now fit to X_train, y_train for the attributes age, fare, pclass only
logit_safp = logit.fit(X_train_safp, y_train_safp)

In [None]:
print(logit_safp.coef_)


print(logit_safp.intercept_)

In [None]:
X_train_safp.columns

In [None]:
# Predict values on X_train.
y_pred_safp = logit_safp.predict(X_train_safp)
y_pred_proba_safp = logit_safp.predict_proba(X_train_safp)

In [None]:
# model sex, age, fare, pclass accuracy
logit_safp.score(X_train_safp, y_train_safp)

This model using sex, age, fare, and pclass only has a 79% accuracy rating.  
Age in this model was filled using imputed values.  

Accuracy:   
So this model performs better than the 61% baseline and better than the model without sex which was 71%

#### 3. Try out other combinations of features and models.

In [None]:
logit = LogisticRegression()

In [None]:
train.head()

In [None]:
# Model pclass as only attribute
X_train_p = train.drop(columns=['baseline_prediction', 'survived', 'age', 'fare', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_p = train.survived

X_validate_p = validate.drop(columns=['survived', 'age', 'fare', 'sex_male',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_p = validate.survived

X_test_p = test.drop(columns=['survived', 'age', 'fare', 'sex_male',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_p = test.survived

In [None]:
# verify pclass is only attribute
X_train_p.head()

In [None]:
# Now fit to X_train, y_train for the attribute pclass only
logit_p = logit.fit(X_train_p, y_train_p)

In [None]:
print(logit_p.coef_)
print(logit_p.intercept_)

In [None]:
# Predict values on X_train.
y_pred_p = logit_p.predict(X_train_p)
y_pred_proba_p = logit_p.predict_proba(X_train_p)

In [None]:
# model sex, age, fare, pclass accuracy
logit_p.score(X_train_p, y_train_p)

This model using pclass only has a 68% accuracy rating.  
Accuracy:   
Baseline = 61%  
Age, Fare, pclass = 71%  
Sex, Age, Fare, pclass = 79%  
pclass = 68%  

In [None]:
# Model age as only attribute
X_train_a = train.drop(columns=['baseline_prediction', 'survived', 'pclass', 'fare', 'sex_male', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_a = train.survived

X_validate_a = validate.drop(columns=['survived', 'pclass', 'fare', 'sex_male',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_a = validate.survived

X_test_a = test.drop(columns=['survived', 'pclass', 'fare', 'sex_male',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_a = test.survived

In [None]:
# verify age is only attribute
X_train_a.head()

In [None]:
# Now fit to X_train, y_train for the attribute age only
logit_a = logit.fit(X_train_a, y_train_a)

In [None]:
print(logit_a.coef_)
print(logit_a.intercept_)

In [None]:
# model age accuracy
logit_a.score(X_train_a, y_train_a)

This model using age only has a 61% accuracy rating. Which matches the baseline.  
Age in this model was filled using imputed values.  

Accuracy:   
Baseline = 61%  
Age, Fare, pclass = 71%  
Sex, Age, Fare, pclass = 79%  
pclass = 68%    
Age = 61%  

In [None]:
# Model sex as only attribute
X_train_s = train.drop(columns=['baseline_prediction', 'survived', 'pclass', 'fare', 'age', 'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_s = train.survived

X_validate_s = validate.drop(columns=['survived', 'pclass', 'fare', 'age',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_s = validate.survived

X_test_s = test.drop(columns=['survived', 'pclass', 'fare', 'age',  'alone', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_s = test.survived

In [None]:
# verify sex_male is only attribute
X_train_s.head()

In [None]:
# Now fit to X_train, y_train for the attribute sex_male only
logit_s = logit.fit(X_train_s, y_train_s)

In [None]:
print(logit_s.coef_)
print(logit_s.intercept_)

In [None]:
# model sex_male accuracy
logit_s.score(X_train_s, y_train_s)

This model using sex_male only has a 78% accuracy rating.  
Accuracy:   
Baseline = 61%  
Age, Fare, pclass = 71%  
Sex, Age, Fare, pclass = 79%  
pclass = 68%    
Age = 61%  
sex_male = 78%  

In [None]:
# Model alone as only attribute
X_train_al = train.drop(columns=['baseline_prediction', 'survived', 'pclass', 'fare', 'age', 'sex_male', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train_al = train.survived

X_validate_al = validate.drop(columns=['survived', 'pclass', 'fare', 'age',  'sex_male', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_validate_al = validate.survived

X_test_al = test.drop(columns=['survived', 'pclass', 'fare', 'age',  'sex_male', 'sibsp', 'parch', 'embark_town', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test_al = test.survived

In [None]:
# verify alone is only attribute
X_train_al.head()

In [None]:
# Now fit to X_train, y_train for the attribute alone only
logit_al = logit.fit(X_train_al, y_train_al)

In [None]:
print(logit_al.coef_)
print(logit_al.intercept_)

In [None]:
# model alone accuracy
logit_al.score(X_train_al, y_train_al)

This model using alone only has a 64% accuracy rating.  
Accuracy:   
Baseline = 61%  
Age, Fare, pclass = 71%  
Sex, Age, Fare, pclass = 79%  
pclass = 68%    
Age = 61%  
sex_male = 78%  
alone = 64%  

#### 4. Choose you best model and evaluate it on the test dataset. Is it overfit?

In [None]:
# editing this question to add validate step. Validate on 2 best models = sex_male only and sex, age, fare, pclass

In [None]:
# model sex, age, fare, pclass validate data
print("model_safp\n", logit_safp.score(X_validate_safp, y_validate_safp))

In [None]:
# model sex_male validate accuracy
logit_s.score(X_validate_s, y_validate_s)

Base on perfomance on the validate data, conclude model with sex, age, fare, and pclass performs the best.  
Run that on the test data

In [None]:
# model sex, age, fare, pclass validate data
print("model_safp\n", logit_safp.score(X_test_safp, y_test_safp))

The accuracy for this model is 80% on the test data.

#### 5. Bonus How do different strategies for handling the missing values in the age column affect model performance?

#### 6. Bonus: How do different strategies for encoding sex affect model performance?

#### 7. Bonus: scikit-learn's LogisticRegression classifier is actually applying a regularization penalty to the coefficients by default. This penalty causes the magnitude of the coefficients in the resulting model to be smaller than they otherwise would be. This value can be modified with the C hyper parameter. Small values of C correspond to a larger penalty, and large values of C correspond to a smaller penalty.

Try out the following values for C and note how the coefficients and the model's performance on both the dataset it was trained on and on the validate split are affected.

C
=
.01
,
.1
,
1
,
10
,
100
,
1000


#### Bonus Bonus: how does scaling the data interact with your choice of C?

## Decission Tree model exercises

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic


In [None]:
# split df
tdf = get_titanic_data()
tdf.head()

In [None]:
train, validate, test = prep_titanic(tdf)
print(train.shape, validate.shape, test.shape)

In [None]:
# Baseline accuracy determination would be the same as logistic regression baseline -- correct?
train.survived.value_counts(normalize=True)

In [None]:
train.head()

In [None]:
# split X and y
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

#### 1. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# create the decission tree object
# per lesson reccomended to use max_depth=3 for 1st model
clf1 = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
# fit the model
clf1.fit(X_train, y_train)

#### 2. Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [None]:
# get predicted y values and probabilities
y_pred1 = clf1.predict(X_train)
y_pred_proba1 = clf1.predict_proba(X_train)

In [None]:
# get accuracy score
clf1.score(X_train, y_train)

In [None]:
# get confusion matrix
confusion_matrix(y_train, y_pred1)

In [None]:
# better visual of confusion matrix as dataframe
# 0: died, 1: survived
labels = sorted(y_train.unique())

matrix1 = pd.DataFrame(confusion_matrix(y_train, y_pred1), index=labels, columns=labels)
matrix1

In [None]:
print(classification_report(y_train, y_pred1))

#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
print("Accuracy=", clf1.score(X_train, y_train))

In [None]:
print("Confusion Matrix Model 1\nPostive=Died")
matrix1 = matrix1.rename(columns={0: 'Died', 1: 'Survived'})
matrix1 = matrix1.rename(index={0: 'Died', 1: 'Survived'})
matrix1

In [None]:
print("True Positive=", matrix1.Died[0])
print("True Negative=", matrix1.Survived[1])
print("False Positive=", matrix1.Died[1])
print("False Negative=", matrix1.Survived[1])

In [None]:
print("Classification Report Model 1")
print(classification_report(y_train, y_pred1))

#### 4. Run through steps 2-4 using a different max_depth value.

In [None]:
# for 2nd model will use max_depth=10
# create the decission tree object
clf2 = DecisionTreeClassifier(max_depth=10, random_state=123)

In [None]:
# fit the model
clf2.fit(X_train, y_train)

In [None]:
# get predicted y values and probabilities
y_pred2 = clf2.predict(X_train)
y_pred_proba2 = clf2.predict_proba(X_train)

In [None]:
# get confusion matrix
model2cm = confusion_matrix(y_train, y_pred2)
model2cm

In [None]:
# better visual of confusion matrix as dataframe
# 0: died, 1: survived
labels = sorted(y_train.unique())

matrix2 = pd.DataFrame(confusion_matrix(y_train, y_pred2), index=labels, columns=labels)
print("Confusion Matrix Model 2\nPostive=Died")
matrix2 = matrix2.rename(columns={0: 'Died', 1: 'Survived'})
matrix2 = matrix2.rename(index={0: 'Died', 1: 'Survived'})
matrix2

In [None]:
print("True Positive=", matrix2.Died[0])
print("True Negative=", matrix2.Survived[1])
print("False Positive=", matrix2.Died[1])
print("False Negative=", matrix2.Survived[1])

In [None]:
# get accuracy score
print("Model 2\nAccuracy=", clf2.score(X_train, y_train))

In [None]:
print("Classification Report Model 2")
print(classification_report(y_train, y_pred2))

In [None]:
# for 3rd model will use max_depth=1
# create the decission tree object
clf3 = DecisionTreeClassifier(max_depth=1, random_state=123)

In [None]:
# fit the model
clf3.fit(X_train, y_train)

In [None]:
# get predicted y values and probabilities
y_pred3 = clf3.predict(X_train)
y_pred_proba3 = clf3.predict_proba(X_train)

In [None]:
# get confusion matrix
model3cm = confusion_matrix(y_train, y_pred3)
model3cm

In [None]:
# better visual of confusion matrix as dataframe
# 0: died, 1: survived
labels = sorted(y_train.unique())

matrix3 = pd.DataFrame(confusion_matrix(y_train, y_pred3), index=labels, columns=labels)
print("Confusion Matrix Model 3\nPostive=Died")
matrix3 = matrix3.rename(columns={0: 'Died', 1: 'Survived'})
matrix3 = matrix3.rename(index={0: 'Died', 1: 'Survived'})
matrix3

In [None]:
# get accuracy score
print("Model 3\nAccuracy=", clf3.score(X_train, y_train))

In [None]:
print("Classification Report Model 3")
print(classification_report(y_train, y_pred3))

#### 5. Which performs better on your in-sample data?

Model 2 with a max_depth=10 has the highest accuracy, however, it is probably overfit.  
Will test Model 1 and 2 both on validate

In [None]:
# Model 1 fit validate
# fit the model
clf1 = clf1.fit(X_validate, y_validate)
# get accuracy score
print("Model 1\nAccuracy=", clf1.score(X_validate, y_validate))

In [None]:
# Model 2 fit validate
# fit the model
clf2v = clf2.fit(X_validate, y_validate)
# get accuracy score
print("Model 2\nAccuracy=", clf2v.score(X_validate, y_validate))

In [None]:
# get predicted y values and probabilities
y_pred2v = clf2v.predict(X_validate)
y_pred_proba2v = clf2v.predict_proba(X_validate)

In [None]:
# get confusion matrix
model2cmv = confusion_matrix(y_validate, y_pred2v)
model2cmv

In [None]:
# Model 2 fit test
# fit the model
clf2t = clf2.fit(X_test, y_test)
# get accuracy score
print("Model 2\nAccuracy=", clf2t.score(X_test, y_test))

In [None]:
# get predicted y values and probabilities
y_pred2t = clf2t.predict(X_test)
y_pred_proba2t = clf2t.predict_proba(X_test)

In [None]:
# get confusion matrix
model2cmt = confusion_matrix(y_test, y_pred2t)
model2cmt

Results indicate best performing model is with a max_depth=10 and that this model is not over fit.

In [None]:
import graphviz

from graphviz import Graph

clf2 = DecisionTreeClassifier(max_depth=10, random_state=123)
clf2.fit(X_train, y_train)
dot_data = export_graphviz(clf2, out_file=None) 
graph = graphviz.Source(dot_data) 

graph.render('titanicm2_decision_tree', view=True)


In [None]:
# to get labels on graphviz
# dot_data = export_graphviz(clf2, feature_names= X.columns, class_names= {0:'not survived', 1:'survived'}, rounded=True, filled=True, out_file=None)

## Random Forest Exercises

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic



In [None]:
# split df
tdf = get_titanic_data()
tdf.head()

In [None]:
train, validate, test = prep_titanic(tdf)
print(train.shape, validate.shape, test.shape)

In [None]:
# Baseline accuracy determination would be the same as logistic regression baseline -- correct?
train.survived.value_counts(normalize=True)

In [None]:
train.head()

In [None]:
# split X and y
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [None]:
rf20 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=20, 
                            random_state=123)

In [None]:
rf20 = rf20.fit(X_train, y_train)

In [None]:
y_predrf20 = rf20.predict(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
print("RandomForest20 Accuracy=", rf20.score(X_train, y_train))
print("Confusion Matrix rf20\n", confusion_matrix(y_train, y_predrf20))
print("Classification Report rf20\n", classification_report(y_train, y_predrf20))

3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

# TAKEAWAY: if "0" is positive case use row "0" for recall and precission.   
# if "1" is positive case use row "1" for recall and procission

In [None]:
#tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()
#tn, fp, fn, tp = confusion_matrix(y_train, y_pred).ravel()

In [None]:
# save confusion matrix and slice into four pieces
confusionrf20 = confusion_matrix(y_train, y_predrf20)
print("Confusion Matrix rf20\n",confusionrf20, "\n")
#[row, column]
TP = confusionrf20[1, 1]
TN = confusionrf20[0, 0]
FP = confusionrf20[0, 1]
FN = confusionrf20[1, 0]
print("True Positive (count where predicted 1 and actually 1):", TP)
print("True Negative (count where predicted 0 and actually 0):", TN)
print("False Positive (count where predicted 1(survived) and actually 0(died)):", FP)
print("False Negative (count where predicted 0(died) and actually 1(survived)):", FN, "\n")
print("Calculated Precision (TP/TP+FP):", round(TP/(TP+FP), 2))
print("Calculated Recall (TP/TP+FN):", round(TP/(TP+FN), 2), "\n")
print("Classification Report rf20\n", classification_report(y_train, y_predrf20))

#### Summary Random Forest TRAIN resluts for max_depth=20, min_samples_leaf=1, n_estimators=100)   
Accuracy = 98.6%   
see classifiation report

4. Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.


In [None]:
rf3 = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)

rf3 = rf3.fit(X_train, y_train)

y_predrf3 = rf3.predict(X_train)

In [None]:
print("RandomForest3 Accuracy=", rf3.score(X_train, y_train), "\n")
print("Confusion Matrix rf3\n", confusion_matrix(y_train, y_predrf3), "\n")
print("Classification Report rf3\n", classification_report(y_train, y_predrf3), "\n")

5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

Max Depth = 20 gives much better accuracy on in sample data, but is likely to be overfit   
A max depth of 3 gives less accuaracy but is more likely to not be over fit

## KNN Exercises

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import prep_titanic



# split df
tdf = get_titanic_data()
tdf.head()

train, validate, test = prep_titanic(tdf)
print(train.shape, validate.shape, test.shape)

# Baseline accuracy determination would be the same as logistic regression baseline -- correct?
train.survived.value_counts(normalize=True)

train.head()

# split X and y
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

(497, 10) (214, 10) (178, 10)


1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [2]:
# Create KNN Object

# weights = ['uniform', 'density']
knn5 = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [3]:
# Fit the model to the training data

knn5.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [4]:
y_predknn5 = knn5.predict(X_train)

2. Evaluate your results using the model score, confusion matrix, and classification report.

In [5]:
print("knn5 Accuracy=", knn5.score(X_train, y_train), "\n")
print("Confusion Matrix knn5\n", confusion_matrix(y_train, y_predknn5), "\n")
print("Classification Report knn5\n", classification_report(y_train, y_predknn5), "\n")

knn5 Accuracy= 0.7766599597585513 

Confusion Matrix knn5
 [[255  52]
 [ 59 131]] 

Classification Report knn5
               precision    recall  f1-score   support

           0       0.81      0.83      0.82       307
           1       0.72      0.69      0.70       190

    accuracy                           0.78       497
   macro avg       0.76      0.76      0.76       497
weighted avg       0.78      0.78      0.78       497
 



3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

#### see above reports, use the classication report for precission and recall

4. Run through steps 2-4 setting k to 10


In [6]:
# Create KNN Object

# weights = ['uniform', 'density']
knn10 = KNeighborsClassifier(n_neighbors=10, weights='uniform')
# Fit the model to the training data
knn10.fit(X_train, y_train)

y_predknn10 = knn10.predict(X_train)

In [7]:
print("knn10 Accuracy=", knn10.score(X_train, y_train), "\n")
print("Confusion Matrix knn10\n", confusion_matrix(y_train, y_predknn10), "\n")
print("Classification Report knn10\n", classification_report(y_train, y_predknn10), "\n")

knn10 Accuracy= 0.7645875251509054 

Confusion Matrix knn10
 [[283  24]
 [ 93  97]] 

Classification Report knn10
               precision    recall  f1-score   support

           0       0.75      0.92      0.83       307
           1       0.80      0.51      0.62       190

    accuracy                           0.76       497
   macro avg       0.78      0.72      0.73       497
weighted avg       0.77      0.76      0.75       497
 



5. Run through setps 2-4 setting k to 20

In [8]:
# Create KNN Object

# weights = ['uniform', 'density']
knn20 = KNeighborsClassifier(n_neighbors=20, weights='uniform')
# Fit the model to the training data
knn20.fit(X_train, y_train)

y_predknn20 = knn20.predict(X_train)

In [9]:
print("knn20 Accuracy=", knn10.score(X_train, y_train), "\n")
print("Confusion Matrix knn20\n", confusion_matrix(y_train, y_predknn20), "\n")
print("Classification Report knn20\n", classification_report(y_train, y_predknn20), "\n")

knn20 Accuracy= 0.7645875251509054 

Confusion Matrix knn20
 [[278  29]
 [111  79]] 

Classification Report knn20
               precision    recall  f1-score   support

           0       0.71      0.91      0.80       307
           1       0.73      0.42      0.53       190

    accuracy                           0.72       497
   macro avg       0.72      0.66      0.66       497
weighted avg       0.72      0.72      0.70       497
 



6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

KNN5 performs best on training data, only slightly higher accuracy than knn10 or knn20, but better recall and precission than the other 2 models

Test

For both the iris and the titanic data,

Determine which model (with hyperparameters) performs the best (try reducing the number of features to the top 4 features in terms of information gained for each feature individually).
Create a new dataframe with top 4 features.
Use the top performing algorithm with the metaparameters used in that model. Create the object, fit, transform on in-sample data, and evaluate the results with the training data. Compare your evaluation metrics with those from the original model (with all the features).
Run your final model on your out-of-sample dataframe (test_df). Evaluate the results.

### Irirs Data

### Titanic Data

Feature Engineering

Titanic Data
Create a feature named who, this should be either man, woman, or child. How does including this feature affect your model's performance?
Create a feature named adult_male that is either a 1 or a 0. How does this affect your model's predictions?
Iris Data
Create features named petal_area and sepal_area.