# LOGISTIC REGRESSION

### In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [5]:
# importing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from acquire import new_get_titanic_data
from titanic_model_setup import prep_titanic
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

# ignore warnings
# NOTE(review): this silences every warning for the rest of the notebook, including any
# pandas / scikit-learn warnings raised by later cells - consider narrowing the filter
import warnings
warnings.filterwarnings("ignore")


# using functions from prep and acquire to create titanic dataframe
# PLEASE NOTE: a new prep file was created for this exercise in order to avoid altering the
# contents of the original file (as their requirements differ slightly)
# new file is named 'titanic_model_setup'
df = prep_titanic(new_get_titanic_data())

# display first 5 rows
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,categorical_sex
0,0,3,male,22.0,1,0,7.25,S,2,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0
2,1,3,female,26.0,0,0,7.925,S,2,1,0
3,1,1,female,35.0,1,0,53.1,S,2,0,0
4,0,3,male,35.0,0,0,8.05,S,2,1,1


In [105]:
# features (X) are passenger class and fare; target (y) is survival
X, y = df[['pclass','fare']], df[['survived']]

# hold out 20% of the rows as test, then split 30% of the remainder off as validate
# (both splits use a fixed random_state so the partition is reproducible)
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size = .20, random_state = 123
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size = .30, random_state = 123
)

# show the resulting shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 2) , X_validate:  (214, 2) , X_test:  (179, 2)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [106]:
# This model only uses pclass and fare, so there is no age column to impute here.
# Dropping rows from df at this point would not change the splits created above (they are
# separate objects), so instead of mutating df we simply report how many missing values
# each feature has in each split.
pd.DataFrame({
    'train': X_train.isna().sum(),
    'validate': X_validate.isna().sum(),
    'test': X_test.isna().sum(),
})

### Start by defining your baseline model

In [107]:
# checking which class to use for baseline
# 0 = did not survive | 1 = survived
# since there are fewer survivors than deceased, "did not survive" (0) will be the baseline
y_train.survived.value_counts()

0    302
1    196
Name: survived, dtype: int64

In [108]:
# building the comparison frame from y_train WITHOUT modifying y_train itself:
# rename() and assign() both return new DataFrames, whereas wrapping y_train in
# pd.DataFrame(...) and then adding/renaming columns in place can, depending on the
# pandas version, also change y_train because the two frames share their data.
# 'actual' holds the true survived values; 'baseline' holds the constant baseline
# prediction, i.e. the most frequent class in train (0 = did not survive)
models = (
    y_train
    .rename(columns = {'survived': 'actual'})
    .assign(baseline = y_train.survived.mode()[0])
)
# displaying results
models.head()

Unnamed: 0,actual,baseline
689,1,0
84,1,0
738,0,0
441,0,0
643,1,0


In [109]:
# a row counts as a hit when the baseline prediction equals the actual value;
# the mean of that boolean mask is the share of rows the baseline got right
baseline_hits = models.baseline == models.actual
baseline_accuracy = baseline_hits.mean()

# report the accuracy as a percentage
print(f'The baseline model is {round(baseline_accuracy,2) * 100}% accurate')


The baseline model is 61.0% accurate


#### Create another model that includes age in addition to fare and pclass. 

In [147]:
# rebuilding df from the acquire and prep helpers before defining the next model's features
df = prep_titanic(new_get_titanic_data())

In [148]:
# features (X) are passenger class, fare and - new for this model - age; target (y) is survival
X, y = df[['pclass','fare', 'age']], df[['survived']]

# hold out 20% of the rows as test, then split 30% of the remainder off as validate
# (both splits use a fixed random_state so the partition is reproducible)
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size = .20, random_state = 123
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size = .30, random_state = 123
)

# show the resulting shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 3) , X_validate:  (214, 3) , X_test:  (179, 3)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [149]:
# going to handle empty values in age by imputing with the most common age
# (SimpleImputer is already imported in the imports cell at the top of the notebook)

# setting imputer strategy to impute the most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# the imputer learns the most frequent age from train only, then that same value is
# applied to all 3 splits so nothing from validate/test leaks into the fit.
# .assign() returns new frames (the age column keeps its position), which avoids writing
# into a slice produced by train_test_split and keeps the cell safe to re-run.
X_train = X_train.assign(age = imputer.fit_transform(X_train[['age']]).ravel())
X_validate = X_validate.assign(age = imputer.transform(X_validate[['age']]).ravel())
X_test = X_test.assign(age = imputer.transform(X_test[['age']]).ravel())

In [150]:
# creating logistic regression object with scikit-learn's default settings (no arguments passed)
logit_1 = LogisticRegression()

In [151]:
# fitting model to training data
# y is passed as the 1-D survived column: LogisticRegression.fit expects y of shape
# (n_samples,), and a single-column DataFrame raises a DataConversionWarning
logit_1.fit(X_train, y_train.survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [152]:
# printing model coefficients and intercept
# coef_ holds one value per feature, in the column order of X_train: pclass, fare, age
print('Coefficient: \n', logit_1.coef_)
print('Intercept: \n', logit_1.intercept_)

Coefficient: 
 [[-0.94438149  0.00314466 -0.02819317]]
Intercept: 
 [2.41877072]


In [153]:
# predicted class (0 = did not survive, 1 = survived) for every passenger in the training split
y_pred = logit_1.predict(X_train)

In [154]:
# estimated class probabilities for every passenger in the training split
# (one column per class, in the order of logit_1.classes_)
y_pred_proba = logit_1.predict_proba(X_train)

In [155]:
# mean accuracy of logit_1 on the training split, shown to 2 decimal places
train_accuracy = logit_1.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.71


In [156]:
# create confusion matrix for the training split
# rows are the actual classes (0, 1) and columns are the predicted classes (0, 1)
print(confusion_matrix(y_train, y_pred))

[[259  43]
 [101  95]]


In [157]:
# show classification report (per-class precision, recall, f1-score and support)
# to see details of model performance on the training split
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.86      0.78       302
           1       0.69      0.48      0.57       196

    accuracy                           0.71       498
   macro avg       0.70      0.67      0.68       498
weighted avg       0.71      0.71      0.70       498



#### Does this model perform better than your previous one?

In [158]:
# Answer: Yes. This model has a train accuracy of 71%, which is 10 percentage points higher than the baseline model's 61%.

#### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [159]:
# rebuilding df from the acquire and prep helpers before defining the next model's features
df = prep_titanic(new_get_titanic_data())

In [160]:
# features (X) are passenger class, fare, age and the encoded sex column; target (y) is survival
X, y = df[['pclass','fare', 'age', 'categorical_sex']], df[['survived']]

# hold out 20% of the rows as test, then split 30% of the remainder off as validate
# (both splits use a fixed random_state so the partition is reproducible)
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size = .20, random_state = 123
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size = .30, random_state = 123
)

# show the resulting shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 4) , X_validate:  (214, 4) , X_test:  (179, 4)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [161]:
# going to handle empty values in age by imputing with the most common age
# (SimpleImputer is already imported in the imports cell at the top of the notebook)

# setting imputer strategy to impute the most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# the imputer learns the most frequent age from train only, then that same value is
# applied to all 3 splits so nothing from validate/test leaks into the fit.
# .assign() returns new frames (the age column keeps its position), which avoids writing
# into a slice produced by train_test_split and keeps the cell safe to re-run.
X_train = X_train.assign(age = imputer.fit_transform(X_train[['age']]).ravel())
X_validate = X_validate.assign(age = imputer.transform(X_validate[['age']]).ravel())
X_test = X_test.assign(age = imputer.transform(X_test[['age']]).ravel())

In [162]:
# creating logistic regression object with scikit-learn's default settings (no arguments passed)
logit_2 = LogisticRegression()

In [163]:
# fitting model to training data
# y is passed as the 1-D survived column: LogisticRegression.fit expects y of shape
# (n_samples,), and a single-column DataFrame raises a DataConversionWarning
logit_2.fit(X_train, y_train.survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [164]:
# printing model coefficients and intercept
# coef_ holds one value per feature, in the column order of X_train:
# pclass, fare, age, categorical_sex
print('Coefficient: \n', logit_2.coef_)
print('Intercept: \n', logit_2.intercept_)

Coefficient: 
 [[-1.12616011e+00  1.32701409e-04 -2.12849959e-02 -2.39973517e+00]]
Intercept: 
 [4.18183443]


In [165]:
# predicted class (0 = did not survive, 1 = survived) for every passenger in the training split
y_pred = logit_2.predict(X_train)

In [166]:
# estimated class probabilities for every passenger in the training split
# (one column per class, in the order of logit_2.classes_)
y_pred_proba = logit_2.predict_proba(X_train)

In [167]:
# mean accuracy of logit_2 on the training split, shown to 2 decimal places
train_accuracy = logit_2.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.78


In [168]:
# create confusion matrix for the training split
# rows are the actual classes (0, 1) and columns are the predicted classes (0, 1)
print(confusion_matrix(y_train, y_pred))

[[246  56]
 [ 53 143]]


In [169]:
# show classification report (per-class precision, recall, f1-score and support)
# to see details of model performance on the training split
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.81      0.82       302
           1       0.72      0.73      0.72       196

    accuracy                           0.78       498
   macro avg       0.77      0.77      0.77       498
weighted avg       0.78      0.78      0.78       498



#### Try out other combinations of features and models.

In [188]:
# rebuilding df from the acquire and prep helpers before defining the next model's features
df = prep_titanic(new_get_titanic_data())

In [189]:
# features (X) are passenger class, fare, age, the encoded sex column and the alone flag;
# target (y) is survival
X, y = df[['pclass','fare', 'age', 'categorical_sex', 'alone']], df[['survived']]

# hold out 20% of the rows as test, then split 30% of the remainder off as validate
# (both splits use a fixed random_state so the partition is reproducible)
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size = .20, random_state = 123
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size = .30, random_state = 123
)

# show the resulting shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 5) , X_validate:  (214, 5) , X_test:  (179, 5)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [190]:
# going to handle empty values in age by imputing with the most common age
# (SimpleImputer is already imported in the imports cell at the top of the notebook)

# setting imputer strategy to impute the most frequent value
imputer = SimpleImputer(strategy = 'most_frequent')

# the imputer learns the most frequent age from train only, then that same value is
# applied to all 3 splits so nothing from validate/test leaks into the fit.
# .assign() returns new frames (the age column keeps its position), which avoids writing
# into a slice produced by train_test_split and keeps the cell safe to re-run.
X_train = X_train.assign(age = imputer.fit_transform(X_train[['age']]).ravel())
X_validate = X_validate.assign(age = imputer.transform(X_validate[['age']]).ravel())
X_test = X_test.assign(age = imputer.transform(X_test[['age']]).ravel())

In [191]:
# creating 3 logistic regression objects that differ only in their solver argument
# Note: models 4 and 5 will be used for upcoming question # 5
logit_3 = LogisticRegression(solver = 'lbfgs')
logit_4 = LogisticRegression(solver = 'liblinear')
logit_5 = LogisticRegression(solver = 'newton-cg')

In [192]:
# fitting models to training data
# Note: models 4 and 5 will be used for upcoming question # 5
# y is passed as the 1-D survived column: LogisticRegression.fit expects y of shape
# (n_samples,), and a single-column DataFrame raises a DataConversionWarning
logit_3.fit(X_train, y_train.survived)
logit_4.fit(X_train, y_train.survived)
logit_5.fit(X_train, y_train.survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [193]:
# printing model coefficients and intercept
# coef_ holds one value per feature, in the column order of X_train:
# pclass, fare, age, categorical_sex, alone
print('Coefficient: \n', logit_3.coef_)
print('Intercept: \n', logit_3.intercept_)

Coefficient: 
 [[-1.12726009e+00 -3.21796518e-04 -2.03922298e-02 -2.35677691e+00
  -2.08399581e-01]]
Intercept: 
 [4.26875038]


In [200]:
# predicted class (0 = did not survive, 1 = survived) for every passenger in the training split
y_pred = logit_3.predict(X_train)

In [201]:
# estimated class probabilities for every passenger in the training split
# (one column per class, in the order of logit_3.classes_)
y_pred_proba = logit_3.predict_proba(X_train)

In [202]:
# mean accuracy of logit_3 on the training split, shown to 2 decimal places
train_accuracy = logit_3.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.79


In [203]:
# create confusion matrix for the training split
# rows are the actual classes (0, 1) and columns are the predicted classes (0, 1)
print(confusion_matrix(y_train, y_pred))

[[251  51]
 [ 56 140]]


In [198]:
# show classification report (per-class precision, recall, f1-score and support)
# to see details of model performance on the training split
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       302
           1       0.73      0.71      0.72       196

    accuracy                           0.79       498
   macro avg       0.78      0.77      0.77       498
weighted avg       0.78      0.79      0.78       498



#### Use your best 3 models to predict and evaluate on your validate sample.

In [199]:
# predictions of each candidate model on the validate split
y_pred_3, y_pred_4, y_pred_5 = (
    candidate.predict(X_validate) for candidate in (logit_3, logit_4, logit_5)
)

# (heading, fitted model, validate predictions) for every model being compared
validate_runs = (
    ("Model 3:", logit_3, y_pred_3),
    ("Model 4:", logit_4, y_pred_4),
    ("Model 5:", logit_5, y_pred_5),
)

# printing accuracy, confusion matrix, and classification report for each model in turn
for heading, fitted_model, validate_pred in validate_runs:
    print(heading)
    print('Accuracy: {:.2f}'.format(fitted_model.score(X_validate, y_validate)))
    print(confusion_matrix(y_validate, validate_pred))
    print(classification_report(y_validate, validate_pred))

Model 3:
Accuracy: 0.80
[[114  19]
 [ 23  58]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       133
           1       0.75      0.72      0.73        81

    accuracy                           0.80       214
   macro avg       0.79      0.79      0.79       214
weighted avg       0.80      0.80      0.80       214

Model 4:
Accuracy: 0.81
[[115  18]
 [ 23  58]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       133
           1       0.76      0.72      0.74        81

    accuracy                           0.81       214
   macro avg       0.80      0.79      0.79       214
weighted avg       0.81      0.81      0.81       214

Model 5:
Accuracy: 0.80
[[114  19]
 [ 23  58]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       133
           1       0.75      0.72      0.73        81

    accuracy                           0.

In [None]:
# Answer: Model 4 is ~1 percentage point more accurate on the validate data (0.81) than models 3 and 5 (0.80)

#### Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

In [204]:
# predicting with model 4 on the train, validate and test splits
# (the predictions are scored in the classification reports that follow)
y_pred_4_train = logit_4.predict(X_train)
y_pred_4_val = logit_4.predict(X_validate)
y_pred_4_test = logit_4.predict(X_test)

In [210]:
# printing classification report for model 4's predictions on the train split
print('Classification report of Model 4 applied to train data\n')
print(classification_report(y_train, y_pred_4_train))

Classification report of Model 4 applied to train data

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       302
           1       0.72      0.68      0.70       196

    accuracy                           0.77       498
   macro avg       0.76      0.75      0.76       498
weighted avg       0.77      0.77      0.77       498



In [209]:
# printing classification report for model 4's predictions on the validate split
print('Classification report of Model 4 applied to validate data\n')
print(classification_report(y_validate, y_pred_4_val))

Classification report of Model 4 applied to validate data

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       133
           1       0.76      0.72      0.74        81

    accuracy                           0.81       214
   macro avg       0.80      0.79      0.79       214
weighted avg       0.81      0.81      0.81       214



In [211]:
# printing classification report for model 4's predictions on the test split
print('Classification report of Model 4 applied to test data\n')
print(classification_report(y_test, y_pred_4_test))

Classification report of Model 4 applied to test data

              precision    recall  f1-score   support

           0       0.84      0.85      0.84       114
           1       0.73      0.71      0.72        65

    accuracy                           0.80       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



In [1]:
# Answer: Model 4 was more accurate on the validate (0.81) and test (0.80) data than it was
# on the train data (0.77). Overall it had the highest accuracy on the validate data set.

# DECISION TREE

#### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [122]:
# rebuilding df from the acquire and prep helpers
df = prep_titanic(new_get_titanic_data())

# features (X) for the decision tree are fare, the encoded sex column and passenger class;
# target (y) is survival
X, y = df[['fare','categorical_sex', 'pclass']], df[['survived']]

# hold out 20% of the rows as test, then split 30% of the remainder off as validate
# (both splits use a fixed random_state so the partition is reproducible)
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size = .20, random_state = 123
)
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size = .30, random_state = 123
)

# show the resulting shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 3) , X_validate:  (214, 3) , X_test:  (179, 3)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [123]:
# DecisionTreeClassifier is already imported in the imports cell at the top of the notebook

# creating a decision tree limited to a single split (max_depth = 1);
# random_state is fixed so the fitted tree is reproducible
clf = DecisionTreeClassifier(max_depth = 1, random_state = 123)

In [124]:
# fitting the depth-1 tree to the training split
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [125]:
# making predictions on whether passengers in the training split survived
# (0 = did not survive, 1 = survived), then previewing the first 5 predictions
y_pred_1 = clf.predict(X_train)
y_pred_1[0:5]

array([1, 1, 0, 0, 0])

In [126]:
# class probabilities for each training passenger, then a preview of the first 5 rows
# (one column per class, in the order of clf.classes_)
y_pred_1_proba = clf.predict_proba(X_train)
y_pred_1_proba[0:5]

array([[0.26404494, 0.73595506],
       [0.26404494, 0.73595506],
       [0.796875  , 0.203125  ],
       [0.796875  , 0.203125  ],
       [0.796875  , 0.203125  ]])

#### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [127]:
# mean accuracy of the depth-1 tree (model 1) on the training split, shown to 2 decimal places
tree_train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(tree_train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.78


In [128]:
# creating confusion matrix for model 1 on the training split
# rows are the actual classes (0, 1) and columns are the predicted classes (0, 1)
confusion_matrix(y_train, y_pred_1)

array([[255,  47],
       [ 65, 131]])

In [129]:
# printing classification report (per-class precision, recall, f1-score and support)
# for model 1 on the training split
print(classification_report(y_train, y_pred_1))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82       302
           1       0.74      0.67      0.70       196

    accuracy                           0.78       498
   macro avg       0.77      0.76      0.76       498
weighted avg       0.77      0.78      0.77       498



#### Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [149]:
# setting variables for true negative, false positive, false negative, true positive
# ravel() flattens the 2x2 confusion matrix row by row, which for the classes (0, 1)
# gives the counts in the order tn, fp, fn, tp
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_1).ravel()

In [176]:
# accuracy: share of all predictions, positive and negative, that were correct
n_correct = tp + tn
n_predictions = tp + tn + fp + fn
accuracy = n_correct / n_predictions

# show the value
print('Accuracy of model 1 is',accuracy)

Accuracy of model 1 is 0.7751004016064257


In [175]:
# recall (aka true positive rate, aka sensitivity):
# share of actual positives (survived) that the model predicted as positive
actual_positives = tp + fn
recall = tp / actual_positives

# show the value
print('Recall aka true positive rate of model 1 is',recall)

Recall aka true positive rate of model 1 is 0.6683673469387755


In [178]:
# specificity (aka true negative rate):
# share of actual negatives (did not survive) that the model predicted as negative
actual_negatives = tn + fp
specificity = tn / actual_negatives

# show the value
print('True negative rate of model 1 is',specificity)

True negative rate of model 1 is 0.8443708609271523


In [179]:
# calculating f score for the positive class
# (f1_score is already imported in the imports cell at the top of the notebook)
f1 = f1_score(y_train, y_pred_1)

# displaying results
print('F1 score of model 1 is',f1) # the harmonic mean of precision and recall

F1 score of model 1 is 0.7005347593582888


In [180]:
# precision: share of positive (survived) predictions that were actually positive
predicted_positives = tp + fp
precision = tp / predicted_positives

# show the value
print('precision of model 1 is',precision)

precision of model 1 is 0.7359550561797753


In [181]:
# fall-out (aka false positive rate):
# share of actual negatives (did not survive) that the model wrongly predicted as positive
actual_negatives = fp + tn
fallout = fp / actual_negatives

# show the value
print('False positive rate of model 1 is',fallout)

False positive rate of model 1 is 0.15562913907284767


In [182]:
# miss rate (aka false negative rate):
# share of actual positives (survived) that the model wrongly predicted as negative
actual_positives = fn + tp
miss_rate = fn / actual_positives

# show the value
print('False negative rate of model 1 is',miss_rate)

False negative rate of model 1 is 0.33163265306122447


In [184]:
# calculating support in addition to other values
# (precision_recall_fscore_support is already imported in the imports cell at the top of
# the notebook); each returned array has one entry per class, in label order (0, 1)
p,r,f,support = precision_recall_fscore_support(y_train, y_pred_1)

# displaying support (number of actual rows of each class in the training split)
print('support values of model 1 are',support)

support values of model 1 are [302 196]


#### Run through steps 2-4 using a different max_depth value.

In [130]:
# setting depth and state for function (increasing depth to 3)
# NOTE: this rebinds clf, so the depth-1 tree is no longer reachable through that name
clf = DecisionTreeClassifier(max_depth = 3, random_state = 123)

In [131]:
# fitting the depth-3 tree to the training split
clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')

In [132]:
# predicting whether each passenger in the training split survives or not
# (0 = did not survive, 1 = survived), then previewing the first 5 predictions
y_pred_2 = clf.predict(X_train)
y_pred_2[0:5]

array([1, 1, 0, 0, 0])

In [133]:
# class probabilities for each training passenger, then a preview of the first 5 rows
# (one column per class, in the order of clf.classes_)
y_pred_2_proba = clf.predict_proba(X_train)
y_pred_2_proba[0:5]

array([[0.01587302, 0.98412698],
       [0.1       , 0.9       ],
       [0.93684211, 0.06315789],
       [0.81699346, 0.18300654],
       [0.81699346, 0.18300654]])

In [134]:
# mean accuracy of the depth-3 tree (model 2) on the training split, shown to 2 decimal places
tree_train_accuracy = clf.score(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(tree_train_accuracy))

Accuracy of Decision Tree classifier on training set: 0.79


In [135]:
# creating confusion matrix for model 2 on the training split
# rows are the actual classes (0, 1) and columns are the predicted classes (0, 1)
confusion_matrix(y_train, y_pred_2)

array([[264,  38],
       [ 67, 129]])

In [136]:
# printing classification report (per-class precision, recall, f1-score and support)
# for model 2 on the training split
print(classification_report(y_train, y_pred_2))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       302
           1       0.77      0.66      0.71       196

    accuracy                           0.79       498
   macro avg       0.79      0.77      0.77       498
weighted avg       0.79      0.79      0.79       498



#### Which performs better on your in-sample data?