# LOGISTIC REGRESSION

### In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [2]:
# importing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from acquire import new_get_titanic_data
from titanic_model_setup import prep_titanic
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# ignore warnings
import warnings
warnings.filterwarnings("ignore")


# using functions from prep and aqcuire to create titanic dataframe
# PLEASE NOTE: a new prep file was created for this exercise in order to avoid altering the
# contents of the original file (as their requirements differ slighltly)
# new file is named 'titanic_model_setup'
df = prep_titanic(new_get_titanic_data())

# display first 5 rows
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,categorical_sex
0,0,3,male,22.0,1,0,7.25,S,2,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0
2,1,3,female,26.0,0,0,7.925,S,2,1,0
3,1,1,female,35.0,1,0,53.1,S,2,0,0
4,0,3,male,35.0,0,0,8.05,S,2,1,1


In [105]:
# declaring which columns will be included in X and y
X = df[['pclass','fare']]
y = df[['survived']]

# creating test, validate and train DFs using curriculum specifications
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

# printing results
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 2) , X_validate:  (214, 2) , X_test:  (179, 2)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [106]:
# going to handle empty values in age by dropping rows

df.dropna(inplace = True)

### Start by defining your baseline model

In [107]:
# checking with class to use for baseline
# 0 = did not survive | 1 = survived
# since there are less survivors than deceased, not survived will be baseline
y_train.survived.value_counts()

0    302
1    196
Name: survived, dtype: int64

In [108]:
# creating dataframe using y_train (survived) data
models = pd.DataFrame(y_train)
# adding column that will hold baseline values (0)
models['baseline'] = 0
# renaming first column
models.columns = ['actual','baseline']
# displaying results
models.head()

Unnamed: 0,actual,baseline
689,1,0
84,1,0
738,0,0
441,0,0
643,1,0


In [109]:
# calculating baseline accuracy via creating boolean mask that holds true for rows where the 
# baseline matches the actual value
# mean tells use the % of rows where this occured, ie. % of rows where baseline model was correct
baseline_accuracy = (models.baseline == models.actual).mean()

# printing results, baseline had accuracy of 61%
print(f'The baseline model is {round(baseline_accuracy,2) * 100}% accurate')


The baseline model is 61.0% accurate


#### Create another model that includes age in addition to fare and pclass. 

In [147]:
# importing data
df = prep_titanic(new_get_titanic_data())

In [148]:
# declaring which columns will be included in X and y
# adding age this time
X = df[['pclass','fare', 'age']]
y = df[['survived']]

# creating test, validate and train DFs using curriculum specifications
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

# printing results
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 3) , X_validate:  (214, 3) , X_test:  (179, 3)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [149]:
# going to handle empty values in age by imputing with the most common age

# setting imputer startegy to imput most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# transforming all 3 DFs with the fit from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [150]:
# creating logistic regression object
logit_1 = LogisticRegression()

In [151]:
# fitting model to training data
logit_1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [152]:
# printing model coefficients and intercept 
print('Coefficient: \n', logit_1.coef_)
print('Intercept: \n', logit_1.intercept_)

Coefficient: 
 [[-0.94438149  0.00314466 -0.02819317]]
Intercept: 
 [2.41877072]


In [153]:
# estimate whether or not a person would survive based on the training data
y_pred = logit_1.predict(X_train)

In [154]:
# estimate p of person surviving
y_pred_proba = logit_1.predict_proba(X_train)

In [155]:
# calculate accuracy of model
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit_1.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.71


In [156]:
# create confusion matrix
print(confusion_matrix(y_train, y_pred))

[[259  43]
 [101  95]]


In [157]:
# show classification report to see details of model performance
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.86      0.78       302
           1       0.69      0.48      0.57       196

    accuracy                           0.71       498
   macro avg       0.70      0.67      0.68       498
weighted avg       0.71      0.71      0.70       498



#### Does this model perform better than your previous one?

In [158]:
# Answer: Yes, this model has an accuracy of 71%, this is 10% higher than the baseline model

#### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [159]:
# importing data
df = prep_titanic(new_get_titanic_data())

In [160]:
# declaring which columns will be included in X and y
# adding age and sex this time
X = df[['pclass','fare', 'age', 'categorical_sex']]
y = df[['survived']]

# creating test, validate and train DFs using curriculum specifications
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

# printing results
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 4) , X_validate:  (214, 4) , X_test:  (179, 4)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [161]:
# going to handle empty values in age by imputing with the most common age

# importing imputer
from sklearn.impute import SimpleImputer

# setting imputer startegy to imput most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# transforming all 3 DFs with the fit from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [162]:
# creating logistic regression object
logit_2 = LogisticRegression()

In [163]:
# fitting model to training data
logit_2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [164]:
# printing model coefficients and intercept 
print('Coefficient: \n', logit_2.coef_)
print('Intercept: \n', logit_2.intercept_)

Coefficient: 
 [[-1.12616011e+00  1.32701409e-04 -2.12849959e-02 -2.39973517e+00]]
Intercept: 
 [4.18183443]


In [165]:
# estimate whether or not a person would survive based on the training data
y_pred = logit_2.predict(X_train)

In [166]:
# estimate p of person surviving
y_pred_proba = logit_2.predict_proba(X_train)

In [167]:
# calculate accuracy of model
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit_2.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.78


In [168]:
# create confusion matrix
print(confusion_matrix(y_train, y_pred))

[[246  56]
 [ 53 143]]


In [169]:
# show classification report to see details of model performance
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.81      0.82       302
           1       0.72      0.73      0.72       196

    accuracy                           0.78       498
   macro avg       0.77      0.77      0.77       498
weighted avg       0.78      0.78      0.78       498



#### Try out other combinations of features and models.

In [188]:
# importing data
df = prep_titanic(new_get_titanic_data())

In [189]:
# declaring which columns will be included in X and y
# adding age, sex and whether the passenger was alone this time
X = df[['pclass','fare', 'age', 'categorical_sex', 'alone']]
y = df[['survived']]

# creating test, validate and train DFs using curriculum specifications
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

# printing results
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 5) , X_validate:  (214, 5) , X_test:  (179, 5)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [190]:
# going to handle empty values in age by imputing with the most common age

# importing imputer
from sklearn.impute import SimpleImputer

# setting imputer startegy to imput most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# transforming all 3 DFs with the fit from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [191]:
# creating 3 different logistic regression objects with different solver arguments
# Note: models 4 and 5 will be used for upcoming question # 5
logit_3 = LogisticRegression(solver = 'lbfgs')
logit_4 = LogisticRegression(solver = 'liblinear')
logit_5 = LogisticRegression(solver = 'newton-cg')

In [192]:
# fitting models to training data 
# Note: models 4 and 5 will be used for upcoming question # 5
logit_3.fit(X_train, y_train)
logit_4.fit(X_train, y_train)
logit_5.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [193]:
# printing model coefficients and intercept 
print('Coefficient: \n', logit_3.coef_)
print('Intercept: \n', logit_3.intercept_)

Coefficient: 
 [[-1.12726009e+00 -3.21796518e-04 -2.03922298e-02 -2.35677691e+00
  -2.08399581e-01]]
Intercept: 
 [4.26875038]


In [200]:
# estimate whether or not a person would survive based on the training data
y_pred = logit_3.predict(X_train)

In [201]:
# estimate p of person surviving
y_pred_proba = logit_3.predict_proba(X_train)

In [202]:
# calculate accuracy of model
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit_3.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.79


In [203]:
# create confusion matrix
print(confusion_matrix(y_train, y_pred))

[[251  51]
 [ 56 140]]


In [198]:
# show classification report to see details of model performance
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       302
           1       0.73      0.71      0.72       196

    accuracy                           0.79       498
   macro avg       0.78      0.77      0.77       498
weighted avg       0.78      0.79      0.78       498



#### Use you best 3 models to predict and evaluate on your validate sample.

In [199]:
# evaluating models on sample data
y_pred_3 = logit_3.predict(X_validate)
y_pred_4 = logit_4.predict(X_validate)
y_pred_5 = logit_5.predict(X_validate)

# printing accuracy, confusion matrix, and classification report for model 3
print("Model 3:")
print('Accuracy: {:.2f}'.format(logit_3.score(X_validate, y_validate)))
print(confusion_matrix(y_validate, y_pred_3))
print(classification_report(y_validate, y_pred_3))

# printing accuracy, confusion matrix, and classification report for model 4
print("Model 4:")
print('Accuracy: {:.2f}'.format(logit_4.score(X_validate, y_validate)))
print(confusion_matrix(y_validate, y_pred_4))
print(classification_report(y_validate, y_pred_4))

# printing accuracy, confusion matrix, and classification report for model 5
print("Model 5:")
print('Accuracy: {:.2f}'.format(logit_5.score(X_validate, y_validate)))
print(confusion_matrix(y_validate, y_pred_5))
print(classification_report(y_validate, y_pred_5))

Model 3:
Accuracy: 0.80
[[114  19]
 [ 23  58]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       133
           1       0.75      0.72      0.73        81

    accuracy                           0.80       214
   macro avg       0.79      0.79      0.79       214
weighted avg       0.80      0.80      0.80       214

Model 4:
Accuracy: 0.81
[[115  18]
 [ 23  58]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       133
           1       0.76      0.72      0.74        81

    accuracy                           0.81       214
   macro avg       0.80      0.79      0.79       214
weighted avg       0.81      0.81      0.81       214

Model 5:
Accuracy: 0.80
[[114  19]
 [ 23  58]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       133
           1       0.75      0.72      0.73        81

    accuracy                           0.

In [None]:
# Answer: Model 4 is ~1% more accurate than models 3 and 5

#### Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [204]:
# evaluating model 4 on train, validate and test data
y_pred_4_train = logit_4.predict(X_train)
y_pred_4_val = logit_4.predict(X_validate)
y_pred_4_test = logit_4.predict(X_test)

In [210]:
# printing classification report for model 4 vs train data 
print('Classification report of Model 4 applied to train data\n')
print(classification_report(y_train, y_pred_4_train))

Classification report of Model 4 applied to train data

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       302
           1       0.72      0.68      0.70       196

    accuracy                           0.77       498
   macro avg       0.76      0.75      0.76       498
weighted avg       0.77      0.77      0.77       498



In [209]:
# printing classification report for model 4 vs validate data 
print('Classification report of Model 4 applied to validate data\n')
print(classification_report(y_validate, y_pred_4_val))

Classification report of Model 4 applied to validate data

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       133
           1       0.76      0.72      0.74        81

    accuracy                           0.81       214
   macro avg       0.80      0.79      0.79       214
weighted avg       0.81      0.81      0.81       214



In [211]:
# printing classification report for model 4 vs test data 
print('Classification report of Model 4 applied to test data\n')
print(classification_report(y_test, y_pred_4_test))

Classification report of Model 4 applied to test data

              precision    recall  f1-score   support

           0       0.84      0.85      0.84       114
           1       0.73      0.71      0.72        65

    accuracy                           0.80       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



In [1]:
# Answer: Model 4 showcased higher accuracy when applied to the validate and test data than it did when
# it was applied to the train data. Overall it had the highest accuracy on the validate data set. 

# DECISION TREE

#### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [106]:
# importing data
df = prep_titanic(new_get_titanic_data())

# declaring which columns will be included in X and y
# adding age, sex and whether the passenger was alone this time
X = df[['fare','categorical_sex', 'pclass']]
y = df[['survived']]

# creating test, validate and train DFs
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

# printing results
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 3) , X_validate:  (214, 3) , X_test:  (179, 3)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [123]:
# importing decision tree function
from sklearn.tree import DecisionTreeClassifier

# setting depth and state for function
clf = DecisionTreeClassifier(max_depth = 1, random_state = 123)

In [124]:
# fitting model to data
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [125]:
# making predictions on whether passengers survived
y_pred_1 = clf.predict(X_train)
y_pred_1[0:5]

array([1, 1, 0, 0, 0])

In [126]:
# probabilities of survival
y_pred_1_proba = clf.predict_proba(X_train)
y_pred_1_proba[0:5]

array([[0.26404494, 0.73595506],
       [0.26404494, 0.73595506],
       [0.796875  , 0.203125  ],
       [0.796875  , 0.203125  ],
       [0.796875  , 0.203125  ]])

#### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [127]:
# printing accuracy of model 1
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.78


In [128]:
# creating confusion matrix
confusion_matrix(y_train, y_pred_1)

array([[255,  47],
       [ 65, 131]])

In [129]:
# printing classification report
print(classification_report(y_train, y_pred_1))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       302
           1       0.74      0.67      0.70       196

    accuracy                           0.78       498
   macro avg       0.77      0.76      0.76       498
weighted avg       0.77      0.78      0.77       498



#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [149]:
# setting variables for true pos, true negative, false pos, false neg
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_1).ravel()

In [176]:
# calculating accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

# displaying results
print('Accuracy of model 1 is',accuracy) # overall accuracy, positive and negative predictions

Accuracy of model 1 is 0.7751004016064257


In [175]:
# calculating recall
recall = tp / (tp + fn)

# displaying results
print('Recall aka true positive rate of model 1 is',recall) # aka true positive rate aka sensitivity

Recall aka true positive rate of model 1 is 0.6683673469387755


In [178]:
# calculating specificity
specificity = tn / (tn + fp) 

# displaying results
print('True negative rate of model 1 is',specificity) # aka true negative rate

True negative rate of model 1 is 0.8443708609271523


In [179]:
from sklearn.metrics import f1_score

# calculating f score
f1 = f1_score(y_train, y_pred_1)

# displaying results
print('F1 score of model 1 is',f1) # the mean of precision and recall

F1 score of model 1 is 0.7005347593582888


In [180]:
# calculating precision
precision = tp / (tp + fp)

# displaying results
print('precision of model 1 is',precision)

precision of model 1 is 0.7359550561797753


In [181]:
# calculating false positive rate
fallout = fp / (fp + tn)

# displaying results
print('False positive rate of model 1 is',fallout) # false positive rate 

False positive rate of model 1 is 0.15562913907284767


In [182]:
# calculating false negative rate
miss_rate = fn / (fn + tp)

# displaying results
print('False negative rate of model 1 is',miss_rate) # false negative rate

False negative rate of model 1 is 0.33163265306122447


In [184]:
from sklearn.metrics import precision_recall_fscore_support

# calculating support in addition to other values
p,r,f,support = precision_recall_fscore_support(y_train, y_pred_1)

# displaying support
print('support values of model 1 are',support)

support values of model 1 are [302 196]


#### Run through steps 2-4 using a different max_depth value.

In [130]:
# setting depth and state for function (increasing depth to 3)
clf = DecisionTreeClassifier(max_depth = 3, random_state = 123)

In [131]:
# fitting model to data
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [132]:
# predicting whether each passenger survives or not
y_pred_2 = clf.predict(X_train)
y_pred_2[0:5]

array([1, 1, 0, 0, 0])

In [133]:
# probability of survival for each passenger
y_pred_2_proba = clf.predict_proba(X_train)
y_pred_2_proba[0:5]

array([[0.01587302, 0.98412698],
       [0.1       , 0.9       ],
       [0.93684211, 0.06315789],
       [0.81699346, 0.18300654],
       [0.81699346, 0.18300654]])

In [134]:
# printing accuracy of model 2
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.79


In [135]:
# creating confusion matrix
confusion_matrix(y_train, y_pred_2)

array([[264,  38],
       [ 67, 129]])

In [136]:
# printing classification report
print(classification_report(y_train, y_pred_2))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       302
           1       0.77      0.66      0.71       196

    accuracy                           0.79       498
   macro avg       0.79      0.77      0.77       498
weighted avg       0.79      0.79      0.79       498



In [195]:
# setting variables for true pos, true negative, false pos, false neg
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_2).ravel()

In [196]:
# calculating accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

# displaying results
print('Accuracy of model 2 is',accuracy) # overall accuracy, positive and negative predictions

Accuracy of model 2 is 0.7891566265060241


In [197]:
# calculating recall
recall = tp / (tp + fn)

# displaying results
print('Recall aka true positive rate of model 2 is',recall) # aka true positive rate aka sensitivity

Recall aka true positive rate of model 2 is 0.6581632653061225


In [198]:
# calculating specificity
specificity = tn / (tn + fp) 

# displaying results
print('True negative rate of model 2 is',specificity) # aka true negative rate

True negative rate of model 2 is 0.8741721854304636


In [199]:
# calculating f score
f1 = f1_score(y_train, y_pred_2)

# displaying results
print('F1 score of model 2 is',f1) # the mean of precision and recall

F1 score of model 2 is 0.7107438016528926


In [200]:
# calculating precision
precision = tp / (tp + fp)

# displaying results
print('precision of model 2 is',precision)

precision of model 2 is 0.7724550898203593


In [201]:
# calculating false positive rate
fallout = fp / (fp + tn)

# displaying results
print('False positive rate of model 2 is',fallout) # false positive rate 

False positive rate of model 2 is 0.12582781456953643


In [202]:
# calculating false negative rate
miss_rate = fn / (fn + tp)

# displaying results
print('False negative rate of model 2 is',miss_rate) # false negative rate

False negative rate of model 2 is 0.34183673469387754


In [203]:
from sklearn.metrics import precision_recall_fscore_support

# calculating support in addition to other values
p,r,f,support = precision_recall_fscore_support(y_train, y_pred_2)

# displaying support
print('support values of model 2 are',support)

support values of model 2 are [302 196]


#### Which performs better on your in-sample data?

In [204]:
# Answer: Model 2's accuracy was 1% higher so it performed better

print('Answer: Model 2s accuracy was 1% higher so it performed better')

Answer: Model 2s accuracy was 1% higher so it performed better


# RANDOM FOREST

#### Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 20.

In [166]:
from sklearn.ensemble import RandomForestClassifier

# importing data
df = prep_titanic(new_get_titanic_data())

# setting X and Y features
X = df[['pclass','age','fare','sibsp','parch']]
y = df[["survived"]]

# splitting DF into test and train/validate
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123, stratify=y.survived)

# splitting train/validate into separate train and validate DFs
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size=0.30, random_state = 123, stratify=y_train_validate)

# examining a few rows of train df
X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
583,1,36.0,40.125,0,0
165,3,9.0,20.525,0,2
50,3,7.0,39.6875,4,1
259,2,50.0,26.0,0,1
306,1,,110.8833,0,0


In [167]:
# going to handle empty values in age by imputing with the most common age

# setting imputer startegy to imput most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# transforming all 3 DFs with the fit from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [168]:
# creating random forest object with specified depth and min samples leaf (model 1)
rf = RandomForestClassifier(bootstrap = True, 
                            class_weight = None, 
                            criterion = 'gini',
                            min_samples_leaf = 1,
                            n_estimators = 100,
                            max_depth = 20, 
                            random_state = 123)

In [169]:
# fitting model to training data
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [170]:
# printing feature importances
print(rf.feature_importances_)

[0.08782238 0.36894488 0.42682963 0.06658478 0.04981834]


In [171]:
# predicting wether each passenger would survive
y_pred_1 = rf.predict(X_train)

In [172]:
# estimating p of each passenger surviving
y_pred_proba_1 = rf.predict_proba(X_train)

#### Evaluate your results using the model score, confusion matrix, and classification report.

In [173]:
# printing model score
print('Accuracy of model 1 on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of model 1 on training set: 0.98


In [174]:
# creating confusion matrix
print(confusion_matrix(y_train, y_pred_1))

[[303   4]
 [  8 183]]


In [175]:
# creating classification report
print(classification_report(y_train, y_pred_1))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       307
           1       0.98      0.96      0.97       191

    accuracy                           0.98       498
   macro avg       0.98      0.97      0.97       498
weighted avg       0.98      0.98      0.98       498



#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [176]:
# setting variables for true pos, true negative, false pos, false neg
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_1).ravel()

# calculating accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

# displaying results
print('Accuracy of model 1 is', accuracy,'\n') # overall accuracy, positive and negative predictions

# calculating recall
recall = tp / (tp + fn)

# displaying results
print('Recall aka true positive rate of model 1 is',recall,'\n') # aka true positive rate aka sensitivity

# calculating specificity
specificity = tn / (tn + fp) 

# displaying results
print('True negative rate of model 1 is',specificity,'\n') # aka true negative rate

# calculating f score
f1 = f1_score(y_train, y_pred_1)

# displaying results
print('F1 score of model 1 is',f1,'\n') # the mean of precision and recall

# calculating precision
precision = tp / (tp + fp)

# displaying results
print('precision of model 1 is',precision,'\n')

# calculating false positive rate
fallout = fp / (fp + tn)

# displaying results
print('False positive rate of model 1 is',fallout,'\n') # false positive rate

# calculating false negative rate
miss_rate = fn / (fn + tp)

# displaying results
print('False negative rate of model 1 is',miss_rate,'\n') # false negative rate

# calculating support in addition to other values
p,r,f,support = precision_recall_fscore_support(y_train, y_pred_1)

# displaying support
print('support values of model 1 are',support)

Accuracy of model 1 is 0.9759036144578314 

Recall aka true positive rate of model 1 is 0.9581151832460733 

True negative rate of model 1 is 0.9869706840390879 

F1 score of model 1 is 0.9682539682539684 

precision of model 1 is 0.9786096256684492 

False positive rate of model 1 is 0.013029315960912053 

False negative rate of model 1 is 0.041884816753926704 

support values of model 1 are [307 191]


#### Run through steps increasing your min_samples_leaf to 5 and decreasing your max_depth to 3.

In [177]:
# creating random forest object with depth 3 and min samples leaf of 5 (model 2)
rf2 = RandomForestClassifier(bootstrap = True, 
                            class_weight = None, 
                            criterion = 'gini',
                            min_samples_leaf = 5,
                            n_estimators = 100,
                            max_depth = 3, 
                            random_state = 123)

In [178]:
# fitting model to training data
rf2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=3, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [179]:
# printing feature importances
print(rf2.feature_importances_)

[0.28927847 0.16919982 0.37805494 0.11157186 0.05189491]


In [180]:
# predicting wether each passenger would survive
y_pred_2 = rf2.predict(X_train)

In [181]:
# estimating p of each passenger surviving
y_pred_proba_2 = rf2.predict_proba(X_train)

In [182]:
# printing model score
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.73


In [183]:
# creating confusion matrix
print(confusion_matrix(y_train, y_pred_1))

[[303   4]
 [  8 183]]


In [184]:
# creating classification report
print(classification_report(y_train, y_pred_1))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       307
           1       0.98      0.96      0.97       191

    accuracy                           0.98       498
   macro avg       0.98      0.97      0.97       498
weighted avg       0.98      0.98      0.98       498



In [185]:
# setting variables for true pos, true negative, false pos, false neg
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_2).ravel()

# calculating accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)

# displaying results
print('Accuracy of model 2 is', accuracy,'\n') # overall accuracy, positive and negative predictions

# calculating recall
recall = tp / (tp + fn)

# displaying results
print('Recall aka true positive rate of model 2 is',recall,'\n') # aka true positive rate aka sensitivity

# calculating specificity
specificity = tn / (tn + fp) 

# displaying results
print('True negative rate of model 2 is',specificity,'\n') # aka true negative rate

# calculating f score
f1 = f1_score(y_train, y_pred_2)

# displaying results
print('F1 score of model 2 is',f1,'\n') # the mean of precision and recall

# calculating precision
precision = tp / (tp + fp)

# displaying results
print('precision of model 2 is',precision,'\n')

# calculating false positive rate
fallout = fp / (fp + tn)

# displaying results
print('False positive rate of model 2 is',fallout,'\n') # false positive rate

# calculating false negative rate
miss_rate = fn / (fn + tp)

# displaying results
print('False negative rate of model 2 is',miss_rate,'\n') # false negative rate

# calculating support in addition to other values
p,r,f,support = precision_recall_fscore_support(y_train, y_pred_2)

# displaying support
print('support values of model 2 are',support)

Accuracy of model 2 is 0.7349397590361446 

Recall aka true positive rate of model 2 is 0.5602094240837696 

True negative rate of model 2 is 0.8436482084690554 

F1 score of model 2 is 0.6184971098265897 

precision of model 2 is 0.6903225806451613 

False positive rate of model 2 is 0.1563517915309446 

False negative rate of model 2 is 0.4397905759162304 

support values of model 2 are [307 191]


#### What are the differences in the evaluation metrics? 

In [186]:
# model 1 has accuracy ~20% higher, recall ~50% higher, precision ~20% higher
print('model 1 has accuracy ~20% higher, recall ~50% higher, precision ~20% higher')

model 1 has accuracy ~20% higher, recall ~50% higher, precision ~20% higher


#### Which performs better on your in-sample data?

In [187]:
# model 1 performs better

#### Why?

In [188]:
# model 1 has a very high max depth of 20. This means that its probably overfitted for the data and although
# it performs well on train (the sample its overfitted on), it will perform poorly on an out of sample dataset.

#### After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [189]:
# creating random forest object with depth 1 and min samples leaf of 4 (model 3)
rf3 = RandomForestClassifier(bootstrap = True, 
                            class_weight = None, 
                            criterion = 'gini',
                            min_samples_leaf = 5,
                            n_estimators = 100,
                            max_depth = 10, 
                            random_state = 123)

# fitting model 3 to training data
rf3.fit(X_train, y_train)

# predicting wether each passenger would survive
y_pred_3 = rf3.predict(X_train)

# printing model score
print('Accuracy of model 3 on training set: {:.2f}'
     .format(rf3.score(X_train, y_train)))

Accuracy of model 3 on training set: 0.80


In [190]:
# creating random forest object with depth 1 and min samples leaf of 4 (model 4)
rf4 = RandomForestClassifier(bootstrap = True, 
                            class_weight = None, 
                            criterion = 'gini',
                            min_samples_leaf = 10,
                            n_estimators = 100,
                            max_depth = 5, 
                            random_state = 123)

# fitting model 3 to training data
rf4.fit(X_train, y_train)

# predicting wether each passenger would survive
y_pred_4 = rf4.predict(X_train)

# printing model score
print('Accuracy of model 4 on training set: {:.2f}'
     .format(rf4.score(X_train, y_train)))

Accuracy of model 4 on training set: 0.75


In [191]:
print('Training sample - accuracy summary')

# printing accuracy of model 1 vs training sample
print('Accuracy of model 1 on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

# printing accuracy of model 2 vs training sample
print('Accuracy of model 2 on training set: {:.2f}'
     .format(rf2.score(X_train, y_train)))

# printing accuracy of model 3 vs training sample
print('Accuracy of model 3 on training set: {:.2f}'
     .format(rf3.score(X_train, y_train)))

# printing accuracy of model 4 vs training sample
print('Accuracy of model 4 on training set: {:.2f}'
     .format(rf4.score(X_train, y_train)))

Training sample - accuracy summary
Accuracy of model 1 on training set: 0.98
Accuracy of model 2 on training set: 0.73
Accuracy of model 3 on training set: 0.80
Accuracy of model 4 on training set: 0.75


In [192]:
print('Validate sample - accuracy summary')

# printing accuracy of model 1 vs validation 
print('Accuracy of model 1 on validation set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

# printing accuracy of model 2 vs validation 
print('Accuracy of model 2 on validation set: {:.2f}'
     .format(rf2.score(X_validate, y_validate)))

# printing accuracy of model 3 vs validation 
print('Accuracy of model 3 on validation set: {:.2f}'
     .format(rf3.score(X_validate, y_validate)))

# printing accuracy of model 4 vs validation 
print('Accuracy of model 4 on validation set: {:.2f}'
     .format(rf4.score(X_validate, y_validate)))

Validate sample - accuracy summary
Accuracy of model 1 on validateing set: 0.72
Accuracy of model 2 on validateing set: 0.75
Accuracy of model 3 on validateing set: 0.77
Accuracy of model 4 on validateing set: 0.74


In [215]:
# Answer: Model 2 had the smallest difference between its training and validate accuracy (1%).

print('Answer: Model 4 had the smallest difference between its training and validate accuracy, 1%.') 

Answer: Model 4 had the smallest difference between its training and validate accuracy, 1%.


# KNN

#### Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [199]:
# importing data
df = prep_titanic(new_get_titanic_data())

# setting X and Y features
X = df[['pclass','age','fare','sibsp','parch']]
y = df[["survived"]]

# splitting DF into test and train/validate
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123, stratify=y.survived)

# splitting train/validate into separate train and validate DFs
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size=0.30, random_state = 123, stratify=y_train_validate)

# examining a few rows of train df
X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
583,1,36.0,40.125,0,0
165,3,9.0,20.525,0,2
50,3,7.0,39.6875,4,1
259,2,50.0,26.0,0,1
306,1,,110.8833,0,0


In [203]:
# going to handle empty values in age by imputing with the most common age

# setting imputer startegy to imput most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# transforming all 3 DFs with the fit from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [208]:
# from sklearn.neighbors import KNeighborsClassifier
knn1 = KNeighborsClassifier(n_neighbors=5, weights='uniform')

# fitting model to train sample
knn1.fit(X_train, y_train)

y_pred1 = knn1.predict(X_train)

y_pred_proba = knn1.predict_proba(X_train)

#### Evaluate your results using the model score, confusion matrix, and classification report.

In [213]:
# printing accuracy of model 1
print('Accuracy of KNN classifier Model 1 on training set: {:.2f}'
     .format(knn1.score(X_train, y_train)))

# printing confusion matrix of model 1
print('\nConfusion Matrix of KNN Model 1')
print(confusion_matrix(y_train, y_pred1))

# printing classification report of model 1
print('\nClassification report of KNN Model 1')
print(classification_report(y_train, y_pred1))

Accuracy of KNN classifier Model 1 on training set: 0.76

Confusion Matrix of KNN Model 1
[[251  56]
 [ 64 127]]

Classification report of KNN Model 1
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       307
           1       0.69      0.66      0.68       191

    accuracy                           0.76       498
   macro avg       0.75      0.74      0.74       498
weighted avg       0.76      0.76      0.76       498



#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [214]:
# setting variables for true pos, true negative, false pos, false neg
tn, fp, fn, tp = confusion_matrix(y_train, y_pred1).ravel()

# calculating accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
# displaying results
print('Accuracy of model 1 is', accuracy,'\n') # overall accuracy, positive and negative predictions

# calculating recall
recall = tp / (tp + fn)
# displaying results
print('Recall aka true positive rate of model 1 is',recall,'\n') # aka true positive rate aka sensitivity

# calculating specificity
specificity = tn / (tn + fp) 
# displaying results
print('True negative rate of model 1 is',specificity,'\n') # aka true negative rate

# calculating f score
f1 = f1_score(y_train, y_pred_1)
# displaying results
print('F1 score of model 1 is',f1,'\n') # the mean of precision and recall

# calculating precision
precision = tp / (tp + fp)
# displaying results
print('Precision of model 1 is',precision,'\n')

# calculating false positive rate
fallout = fp / (fp + tn)
# displaying results
print('False positive rate of model 1 is',fallout,'\n') # false positive rate

# calculating false negative rate
miss_rate = fn / (fn + tp)
# displaying results
print('False negative rate of model 1 is',miss_rate,'\n') # false negative rate

# calculating support in addition to other values
p,r,f,support = precision_recall_fscore_support(y_train, y_pred_1)
# displaying support
print('Support values of model 1 are',support)

Accuracy of model 1 is 0.7590361445783133 

Recall aka true positive rate of model 1 is 0.6649214659685864 

True negative rate of model 1 is 0.8175895765472313 

F1 score of model 1 is 0.9682539682539684 

Precision of model 1 is 0.6939890710382514 

False positive rate of model 1 is 0.18241042345276873 

False negative rate of model 1 is 0.33507853403141363 

Support values of model 1 are [307 191]


#### Run through steps 2-4 setting k to 10

In [217]:
# from sklearn.neighbors import KNeighborsClassifier
knn2 = KNeighborsClassifier(n_neighbors=10, weights='uniform')

# fitting model to train sample
knn2.fit(X_train, y_train)

# predicting whether each passenger survives
y_pred2 = knn2.predict(X_train)

# p of each passenger surviving 
y_pred_proba = knn2.predict_proba(X_train)

# printing accuracy of model 2
print('Accuracy of KNN classifier model 2 on training set: {:.2f}'
     .format(knn2.score(X_train, y_train)))

# printing confusion matrix of model 2
print('\nConfusion Matrix of KNN model 2')
print(confusion_matrix(y_train, y_pred2))

# printing classification report of model 2
print('\nClassification report of KNN model 2')
print(classification_report(y_train, y_pred2))

# setting variables for true pos, true negative, false pos, false neg
tn, fp, fn, tp = confusion_matrix(y_train, y_pred2).ravel()

# calculating accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
# displaying results
print('Accuracy of model 2 is', accuracy,'\n') # overall accuracy, positive and negative predictions

# calculating recall
recall = tp / (tp + fn)
# displaying results
print('Recall aka true positive rate of model 2 is',recall,'\n') # aka true positive rate aka sensitivity

# calculating specificity
specificity = tn / (tn + fp) 
# displaying results
print('True negative rate of model 2 is',specificity,'\n') # aka true negative rate

# calculating f score
f1 = f1_score(y_train, y_pred_1)
# displaying results
print('F1 score of model 2 is',f1,'\n') # the mean of precision and recall

# calculating precision
precision = tp / (tp + fp)
# displaying results
print('Precision of model 2 is',precision,'\n')

# calculating false positive rate
fallout = fp / (fp + tn)
# displaying results
print('False positive rate of model 2 is',fallout,'\n') # false positive rate

# calculating false negative rate
miss_rate = fn / (fn + tp)
# displaying results
print('False negative rate of model 2 is',miss_rate,'\n') # false negative rate

# calculating support in addition to other values
p,r,f,support = precision_recall_fscore_support(y_train, y_pred_1)
# displaying support
print('Support values of model 2 are',support)

Accuracy of KNN classifier model 2 on training set: 0.72

Confusion Matrix of KNN model 2
[[277  30]
 [108  83]]

Classification report of KNN model 2
              precision    recall  f1-score   support

           0       0.72      0.90      0.80       307
           1       0.73      0.43      0.55       191

    accuracy                           0.72       498
   macro avg       0.73      0.67      0.67       498
weighted avg       0.73      0.72      0.70       498

Accuracy of model 2 is 0.7228915662650602 

Recall aka true positive rate of model 2 is 0.43455497382198954 

True negative rate of model 2 is 0.9022801302931596 

F1 score of model 2 is 0.9682539682539684 

Precision of model 2 is 0.7345132743362832 

False positive rate of model 2 is 0.09771986970684039 

False negative rate of model 2 is 0.5654450261780105 

Support values of model 2 are [307 191]


#### Run through setps 2-4 setting k to 20

In [219]:
# from sklearn.neighbors import KNeighborsClassifier
knn3 = KNeighborsClassifier(n_neighbors=20, weights='uniform')

# fitting model to train sample
knn3.fit(X_train, y_train)

# predicting whether each passenger survives
y_pred3 = knn3.predict(X_train)

# p of each passenger surviving 
y_pred_proba = knn3.predict_proba(X_train)

# printing accuracy of model 3
print('Accuracy of KNN classifier model 3 on training set: {:.2f}'
     .format(knn3.score(X_train, y_train)))

# printing confusion matrix of model 3
print('\nConfusion Matrix of KNN model 3')
print(confusion_matrix(y_train, y_pred3))

# printing classification report of model 3
print('\nClassification report of KNN model 3')
print(classification_report(y_train, y_pred3))

# setting variables for true pos, true negative, false pos, false neg
tn, fp, fn, tp = confusion_matrix(y_train, y_pred3).ravel()

# calculating accuracy
accuracy = (tp + tn) / (tp + tn + fp + fn)
# displaying results
print('Accuracy of model 3 is', accuracy,'\n') # overall accuracy, positive and negative predictions

# calculating recall
recall = tp / (tp + fn)
# displaying results
print('Recall aka true positive rate of model 3 is',recall,'\n') # aka true positive rate aka sensitivity

# calculating specificity
specificity = tn / (tn + fp) 
# displaying results
print('True negative rate of model 3 is',specificity,'\n') # aka true negative rate

# calculating f score
f1 = f1_score(y_train, y_pred_1)
# displaying results
print('F1 score of model 3 is',f1,'\n') # the mean of precision and recall

# calculating precision
precision = tp / (tp + fp)
# displaying results
print('Precision of model 3 is',precision,'\n')

# calculating false positive rate
fallout = fp / (fp + tn)
# displaying results
print('False positive rate of model 3 is',fallout,'\n') # false positive rate

# calculating false negative rate
miss_rate = fn / (fn + tp)
# displaying results
print('False negative rate of model 3 is',miss_rate,'\n') # false negative rate

# calculating support in addition to other values
p,r,f,support = precision_recall_fscore_support(y_train, y_pred_1)
# displaying support
print('Support values of model 3 are',support)

Accuracy of KNN classifier model 3 on training set: 0.71

Confusion Matrix of KNN model 3
[[277  30]
 [115  76]]

Classification report of KNN model 3
              precision    recall  f1-score   support

           0       0.71      0.90      0.79       307
           1       0.72      0.40      0.51       191

    accuracy                           0.71       498
   macro avg       0.71      0.65      0.65       498
weighted avg       0.71      0.71      0.68       498

Accuracy of model 3 is 0.7088353413654619 

Recall aka true positive rate of model 3 is 0.39790575916230364 

True negative rate of model 3 is 0.9022801302931596 

F1 score of model 3 is 0.9682539682539684 

Precision of model 3 is 0.7169811320754716 

False positive rate of model 3 is 0.09771986970684039 

False negative rate of model 3 is 0.6020942408376964 

Support values of model 3 are [307 191]


#### What are the differences in the evaluation metrics? 

In [222]:
print('All 3 models have similar accuracy (70-75%)')
print('With regard to recall, models 2 and 3 are around 40% while model 1 has 66%')
print('All 3 have similar F1 scores (~96%)')
print('All 3 have similar precision (70-73%')

All 3 models have similar accuracy (70-75%)
With regard to recall, models 2 and 3 are around 40% while model 1 has 66%
All 3 have similar F1 scores (~96%)
All 3 have similar precision (70-73%


#### Which performs better on your in-sample data? 

In [223]:
print('Model 1 performs the best because it has similar metrics to the other 2 models while having a recall rate that is ~22% higher')

Model 1 performs the best because it has similar metrics to the other 2 models while having a recall rate that is ~22% higher


#### Why?

In [225]:
print('Model 1 has the highest recall because it had the lowest rate of false positives. We can reason that this was due to its neighbor count being lower than the others.')

Model 1 has the highest recall because it had the lowest rate of false positives. We can reason that this was due to its neighbor count being lower than the others.


# KNN TEST

#### Determine which model (with hyperparameters) performs the best (try reducing the number of features to the top 4 features in terms of information gained for each feature individually).

In [None]:
# importing data
df = prep_titanic(new_get_titanic_data())

# declaring which columns will be included in X and y
X = df[['pclass','fare', 'age', 'categorical_sex']]
y = df[['survived']]

# creating test, validate and train DFs
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

In [7]:
# going to handle empty values in age by imputing with the most common age

# setting imputer startegy to imput most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# transforming all 3 DFs with the fit from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [24]:
# Creating / fitting model 1

# from sklearn.neighbors import KNeighborsClassifier
knn1 = KNeighborsClassifier(n_neighbors=5, weights='uniform')

# fitting model to train sample
knn1.fit(X_train, y_train)

# using model to make predictions of survival using X_train data
y_pred1 = knn1.predict(X_train)

model1_acc = round(knn1.score(X_train, y_train),2)

# printing accuracy of model 1
print('Accuracy of KNN classifier model 1 on training set:', model1_acc)

Accuracy of KNN classifier model 1 on training set: 0.77


In [13]:
# Creating / fitting model 2

# from sklearn.neighbors import KNeighborsClassifier
knn2 = KNeighborsClassifier(n_neighbors=10, algorithm = 'ball_tree', weights='uniform')

# fitting model to train sample
knn2.fit(X_train, y_train)

# using model to make predictions of survival using X_train data
y_pred2 = knn2.predict(X_train)

# printing accuracy of model 2
print('Accuracy of KNN classifier model 2 on training set: {:.2f}'
     .format(knn2.score(X_train, y_train)))

Accuracy of KNN classifier model 2 on training set: 0.71


In [15]:
# Creating / fitting model 3

# from sklearn.neighbors import KNeighborsClassifier
knn3 = KNeighborsClassifier(n_neighbors=10, algorithm = 'ball_tree', leaf_size = 5, weights='uniform')

# fitting model to train sample
knn3.fit(X_train, y_train)

# using model to make predictions of survival using X_train data
y_pred3 = knn3.predict(X_train)


# printing accuracy of model 2
print('Accuracy of KNN classifier model 3 on training set: {:.2f}'
     .format(knn3.score(X_train, y_train)))

Accuracy of KNN classifier model 3 on training set: 0.71


In [25]:
print('Model 1 performs the best with an accuracy of', model1_acc)

Model 1 performs the best with an accuracy of 0.77


#### Create a new dataframe with top 4 features.

In [27]:
# importing data
four_feat_df = df[['pclass','fare', 'age', 'categorical_sex']]

four_feat_df

Unnamed: 0,pclass,fare,age,categorical_sex
0,3,7.2500,22.0,1
1,1,71.2833,38.0,0
2,3,7.9250,26.0,0
3,1,53.1000,35.0,0
4,3,8.0500,35.0,1
...,...,...,...,...
886,2,13.0000,27.0,1
887,1,30.0000,19.0,0
888,3,23.4500,,0
889,1,30.0000,26.0,1


#### Use the top performing algorithm with the metaparameters used in that model. Create the object, fit, transform on in-sample data, and evaluate the results with the training data. Compare your evaluation metrics with those from the original model (with all the features).

In [44]:
# importing data
df = prep_titanic(new_get_titanic_data())

# declaring which columns will be included in X and y
X = df.drop(columns = ['survived','sex','embarked'])

y = df[['survived']]

# creating test, validate and train DFs 
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

In [46]:
# going to handle empty values in age by imputing with the most common age

# setting imputer startegy to imput most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# transforming all 3 DFs with the fit from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [53]:
# Creating / fitting model 1 to new data frame 

# from sklearn.neighbors import KNeighborsClassifier
knn1_new = KNeighborsClassifier(n_neighbors=5, weights='uniform')

# fitting model to train sample
knn1_new.fit(X_train, y_train)

# using model to make predictions of survival using X_train data
y_pred1 = knn1_new.predict(X_train)

# saving new model's accuracy
knn1_new_acc = round(knn1_new.score(X_train, y_train),2)

# printing accuracy of new model 1
print('Accuracy of KNN classifier model 1 on training set:', knn1_new_acc)

Accuracy of KNN classifier model 1 on training set: 0.76


In [59]:
# printing accuracy of model with top 4 features
print('accuracy of model 1 using top 4 features:', model1_acc)

# printing accuracy of model with all features and hyperparameters of previous model
print('accuracy of model using model 1\'s hyperparameters and all features:', knn1_new_acc)

accuracy of model 1 using top 4 features: 0.77
accuracy of model using model 1's hyperparameters and all features: 0.76


#### Run your final model on your out-of-sample dataframe (test_df). Evaluate the results.

In [65]:
# calculating accuracy of final model on test data
final_model_acc = round(knn1_new.score(X_test, y_test),2)

# displaying results
print("Final model's accuracy on test data", final_model_acc)

Final model's accuracy on test data 0.66


# Feature Engineering

### Titanic Data

#### Create a feature named who, this should be either man, woman, or child. How does including this feature affect your model's performance?

In [174]:
# importing data
df = prep_titanic(new_get_titanic_data())
X = df.drop(columns = ['survived', 'embarked'])
y = df[['survived']]

# creating test, validate and train DFs 
X_train_validate, X_test, y_train_validate, y_test = train_test_split(X, y, test_size = .20, random_state = 123)

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate, test_size = .30, random_state = 123)

In [175]:
# going to handle empty values in age by imputing with the most common age

# setting imputer startegy to imput most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# transforming all train DF with the fit from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])

In [176]:
# creating new column 'who' that holds a number value based on men, women, and children
X_train['who'] = X_train['age']
X_train.loc[((X_train.age > 17) & (X_train.sex == 'male')),'who'] = 1
X_train.loc[((X_train.age > 17) & (X_train.sex == 'female')),'who'] = 2
X_train.loc[(X_train.age <= 17),'who'] = 3

In [177]:
# kept losing my who column if i dropped sex column so i'm creating a new dataframe with specified columns
# as work around
new_X_train = X_train[['pclass','age','sibsp','parch','fare','embark_town','alone','categorical_sex','who']]

In [178]:
# Creating model from new data frame 

# from sklearn.neighbors import KNeighborsClassifier
knn_who = KNeighborsClassifier(n_neighbors=5, weights='uniform')

# fitting model to train sample
knn_who.fit(new_X_train, y_train)

# using model to make predictions of survival using train data
y_pred_who = knn_who.predict(new_X_train)

# saving new model's accuracy
knn_who_acc = round(knn_who.score(new_X_train, y_train),2)

# printing accuracy of new model 1
print('Accuracy of KNN classifier with who column on training set:', knn_who_acc)

Accuracy of KNN classifier with who column on training set: 0.76


#### Create a feature named adult_male that is either a 1 or a 0. How does this affect your model's predictions?

In [179]:
# creating new column 'adult_male' that holds a 1 if person is adult male, 0 otherwise
new_X_train['adult_male'] = 0
new_X_train.loc[(new_X_train.who == 1,'adult_male')] = 1

In [180]:
# Creating model from new data frame 

# from sklearn.neighbors import KNeighborsClassifier
knn_AM = KNeighborsClassifier(n_neighbors=5, weights='uniform')

# fitting model to train sample
knn_AM.fit(new_X_train, y_train)

# using model to make predictions of survival using train data
y_pred_who = knn_AM.predict(new_X_train)

# saving new model's accuracy
knn_AM_acc = round(knn_AM.score(new_X_train, y_train),2)

# printing accuracy of new model 1
print('Accuracy of KNN classifier with adult_male column on training set:', knn_AM_acc)

Accuracy of KNN classifier with adult_male column on training set: 0.77


### Iris Data

#### Create features named petal_area and sepal_area.