# LOGISTIC REGRESSION

### In this exercise, we'll continue working with the titanic dataset and building logistic regression models. Throughout this exercise, be sure you are training, evaluating, and comparing models on the train and validate datasets. The test dataset should only be used for your final model.

In [104]:
# importing
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from acquire import new_get_titanic_data
from titanic_model_setup import prep_titanic

# ignore warnings
warnings.filterwarnings("ignore")


# build the titanic dataframe with the acquire and prep helpers
# PLEASE NOTE: a separate prep module ('titanic_model_setup') was created for this exercise
# so the original prep file stays untouched (the requirements differ slightly)
titanic_raw = new_get_titanic_data()
df = prep_titanic(titanic_raw)

# show the first 5 rows
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,embark_town,alone,categorical_sex
0,0,3,male,22.0,1,0,7.25,S,2,0,1
1,1,1,female,38.0,1,0,71.2833,C,0,0,0
2,1,3,female,26.0,0,0,7.925,S,2,1,0
3,1,1,female,35.0,1,0,53.1,S,2,0,0
4,0,3,male,35.0,0,0,8.05,S,2,1,1


In [105]:
# choose the predictor columns (X) and the target column (y)
X = df[['pclass', 'fare']]
y = df[['survived']]

# first hold out 20% of the rows as the test set ...
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size=.20, random_state=123
)

# ... then divide the remaining rows 70/30 into train and validate
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size=.30, random_state=123
)

# report the shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 2) , X_validate:  (214, 2) , X_test:  (179, 2)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [106]:
# NOTE: nothing is imputed in this cell - rows of df that contain any missing value are
# dropped. X and y were already selected from df and split in the previous cell, so the
# train/validate/test frames are not changed by this step.
# df is rebound to the filtered frame rather than mutated with inplace=True.
df = df.dropna()

### Start by defining your baseline model

In [107]:
# count each class in the training target to decide which one the baseline predicts
# 0 = did not survive | 1 = survived
# there are fewer survivors than deceased, so "did not survive" (0) becomes the baseline
y_train['survived'].value_counts()

0    302
1    196
Name: survived, dtype: int64

In [108]:
# build a comparison frame from the training target:
#   actual   - the true 'survived' label from y_train
#   baseline - the constant baseline prediction (0 = did not survive)
# rename/assign return a new DataFrame, so y_train keeps its single 'survived' column;
# wrapping y_train in pd.DataFrame(...) and then adding/renaming columns can leak those
# changes back into y_train on older pandas versions
models = y_train.rename(columns={'survived': 'actual'}).assign(baseline=0)

# displaying results
models.head()

Unnamed: 0,actual,baseline
689,1,0
84,1,0
738,0,0
441,0,0
643,1,0


In [109]:
# baseline accuracy: build a boolean mask that is True where the baseline prediction
# matches the actual value; its mean is the share of rows the baseline got right
baseline_accuracy = (models['baseline'] == models['actual']).mean()

# convert to a percentage before rounding: round(x, 2) * 100 can expose binary
# floating-point noise (e.g. 0.57 * 100 -> 56.99999999999999)
print(f'The baseline model is {round(baseline_accuracy * 100):.1f}% accurate')


The baseline model is 61.0% accurate


#### Create another model that includes age in addition to fare and pclass. 

In [110]:
# re-acquire and re-prep the data so this section starts from a freshly built dataframe
titanic_raw = new_get_titanic_data()
df = prep_titanic(titanic_raw)

In [111]:
# choose the predictor columns (X) and the target column (y)
# age joins pclass and fare this time
X = df[['pclass', 'fare', 'age']]
y = df[['survived']]

# first hold out 20% of the rows as the test set ...
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size=.20, random_state=123
)

# ... then divide the remaining rows 70/30 into train and validate
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size=.30, random_state=123
)

# report the shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 3) , X_validate:  (214, 3) , X_test:  (179, 3)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [112]:
# handle missing ages by imputing the most frequent age
# (SimpleImputer is imported with the other dependencies at the top of the notebook)
imputer = SimpleImputer(strategy = 'most_frequent')

# the split frames are row selections of X; take explicit copies so the writes below
# land on independent frames (the global warnings filter would hide any
# SettingWithCopyWarning raised by assigning into them directly)
X_train, X_validate, X_test = X_train.copy(), X_validate.copy(), X_test.copy()

# the imputer is fit on train only; validate and test are filled with the value
# learned from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [113]:
# create a new, unfitted logistic regression estimator; no arguments are passed, so
# scikit-learn's default hyperparameters apply
logit = LogisticRegression()

In [114]:
# fitting model to training data
# y_train is a one-column DataFrame; scikit-learn expects a 1-d target, so the 'survived'
# Series is passed (otherwise a DataConversionWarning is raised and silently hidden by
# the global warnings filter)
logit.fit(X_train, y_train['survived'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [115]:
# show the fitted coefficients followed by the intercept
for label, fitted_values in (('Coefficient: \n', logit.coef_), ('Intercept: \n', logit.intercept_)):
    print(label, fitted_values)

Coefficient: 
 [[-0.94438149  0.00314466 -0.02819317]]
Intercept: 
 [2.41877072]


In [116]:
# predicted class (0 = did not survive, 1 = survived) for every row of the training set;
# y_pred feeds the confusion matrix and classification report further down
y_pred = logit.predict(X_train)

In [117]:
# estimated class probabilities for every training row (per the scikit-learn docs, one
# column per class, ordered as in logit.classes_)
# NOTE(review): y_pred_proba is not referenced by the cells that follow in this section
y_pred_proba = logit.predict_proba(X_train)

In [118]:
# mean accuracy of the fitted model on the training split
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.71


In [119]:
# confusion matrix on the training split: rows are actual classes, columns are predicted
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[259  43]
 [101  95]]


In [120]:
# per-class precision, recall and f1 for the training predictions
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.72      0.86      0.78       302
           1       0.69      0.48      0.57       196

    accuracy                           0.71       498
   macro avg       0.70      0.67      0.68       498
weighted avg       0.71      0.71      0.70       498



#### Does this model perform better than your previous one?

In [None]:
# Answer: Yes. This model's training accuracy is 71%, which is 10 percentage points higher than the baseline model's 61%.

#### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [122]:
# re-acquire and re-prep the data so this section starts from a freshly built dataframe
titanic_raw = new_get_titanic_data()
df = prep_titanic(titanic_raw)

In [124]:
# choose the predictor columns (X) and the target column (y)
# age and the encoded sex column are included this time
X = df[['pclass', 'fare', 'age', 'categorical_sex']]
y = df[['survived']]

# first hold out 20% of the rows as the test set ...
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size=.20, random_state=123
)

# ... then divide the remaining rows 70/30 into train and validate
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size=.30, random_state=123
)

# report the shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 4) , X_validate:  (214, 4) , X_test:  (179, 4)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [125]:
# handle missing ages by imputing the most frequent age
# (SimpleImputer is imported with the other dependencies at the top of the notebook)
imputer = SimpleImputer(strategy = 'most_frequent')

# the split frames are row selections of X; take explicit copies so the writes below
# land on independent frames (the global warnings filter would hide any
# SettingWithCopyWarning raised by assigning into them directly)
X_train, X_validate, X_test = X_train.copy(), X_validate.copy(), X_test.copy()

# the imputer is fit on train only; validate and test are filled with the value
# learned from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [126]:
# create a new, unfitted logistic regression estimator; no arguments are passed, so
# scikit-learn's default hyperparameters apply
logit = LogisticRegression()

In [127]:
# fitting model to training data
# y_train is a one-column DataFrame; scikit-learn expects a 1-d target, so the 'survived'
# Series is passed (otherwise a DataConversionWarning is raised and silently hidden by
# the global warnings filter)
logit.fit(X_train, y_train['survived'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [128]:
# show the fitted coefficients followed by the intercept
for label, fitted_values in (('Coefficient: \n', logit.coef_), ('Intercept: \n', logit.intercept_)):
    print(label, fitted_values)

Coefficient: 
 [[-1.12616011e+00  1.32701409e-04 -2.12849959e-02 -2.39973517e+00]]
Intercept: 
 [4.18183443]


In [129]:
# predicted class (0 = did not survive, 1 = survived) for every row of the training set;
# y_pred feeds the confusion matrix and classification report further down
y_pred = logit.predict(X_train)

In [130]:
# estimated class probabilities for every training row (per the scikit-learn docs, one
# column per class, ordered as in logit.classes_)
# NOTE(review): y_pred_proba is not referenced by the cells that follow in this section
y_pred_proba = logit.predict_proba(X_train)

In [131]:
# mean accuracy of the fitted model on the training split
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.78


In [132]:
# confusion matrix on the training split: rows are actual classes, columns are predicted
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[246  56]
 [ 53 143]]


In [133]:
# per-class precision, recall and f1 for the training predictions
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.82      0.81      0.82       302
           1       0.72      0.73      0.72       196

    accuracy                           0.78       498
   macro avg       0.77      0.77      0.77       498
weighted avg       0.78      0.78      0.78       498



#### Try out other combinations of features and models.

In [136]:
# re-acquire and re-prep the data so this section starts from a freshly built dataframe
titanic_raw = new_get_titanic_data()
df = prep_titanic(titanic_raw)

In [137]:
# choose the predictor columns (X) and the target column (y)
# age, the encoded sex column and the 'alone' flag are included this time
X = df[['pclass', 'fare', 'age', 'categorical_sex', 'alone']]
y = df[['survived']]

# first hold out 20% of the rows as the test set ...
X_train_validate, X_test, y_train_validate, y_test = train_test_split(
    X, y, test_size=.20, random_state=123
)

# ... then divide the remaining rows 70/30 into train and validate
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_validate, y_train_validate, test_size=.30, random_state=123
)

# report the shape of every split
print("X_train: ", X_train.shape, ", X_validate: ", X_validate.shape, ", X_test: ", X_test.shape)
print("y_train: ", y_train.shape, ", y_validate: ", y_validate.shape, ", y_test: ", y_test.shape)

X_train:  (498, 5) , X_validate:  (214, 5) , X_test:  (179, 5)
y_train:  (498, 1) , y_validate:  (214, 1) , y_test:  (179, 1)


In [138]:
# handle missing ages by imputing the most frequent age
# (SimpleImputer is imported with the other dependencies at the top of the notebook)
imputer = SimpleImputer(strategy = 'most_frequent')

# the split frames are row selections of X; take explicit copies so the writes below
# land on independent frames (the global warnings filter would hide any
# SettingWithCopyWarning raised by assigning into them directly)
X_train, X_validate, X_test = X_train.copy(), X_validate.copy(), X_test.copy()

# the imputer is fit on train only; validate and test are filled with the value
# learned from train
X_train[['age']] = imputer.fit_transform(X_train[['age']])
X_validate[['age']] = imputer.transform(X_validate[['age']])
X_test[['age']] = imputer.transform(X_test[['age']])

In [139]:
# create a new, unfitted logistic regression estimator; no arguments are passed, so
# scikit-learn's default hyperparameters apply
logit = LogisticRegression()

In [140]:
# fitting model to training data
# y_train is a one-column DataFrame; scikit-learn expects a 1-d target, so the 'survived'
# Series is passed (otherwise a DataConversionWarning is raised and silently hidden by
# the global warnings filter)
logit.fit(X_train, y_train['survived'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [141]:
# show the fitted coefficients followed by the intercept
for label, fitted_values in (('Coefficient: \n', logit.coef_), ('Intercept: \n', logit.intercept_)):
    print(label, fitted_values)

Coefficient: 
 [[-1.12726009e+00 -3.21796518e-04 -2.03922298e-02 -2.35677691e+00
  -2.08399581e-01]]
Intercept: 
 [4.26875038]


In [142]:
# predicted class (0 = did not survive, 1 = survived) for every row of the training set;
# y_pred feeds the confusion matrix and classification report further down
y_pred = logit.predict(X_train)

In [143]:
# estimated class probabilities for every training row (per the scikit-learn docs, one
# column per class, ordered as in logit.classes_)
# NOTE(review): y_pred_proba is not referenced by the cells that follow in this section
y_pred_proba = logit.predict_proba(X_train)

In [144]:
# mean accuracy of the fitted model on the training split
train_accuracy = logit.score(X_train, y_train)
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'.format(train_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.79


In [145]:
# confusion matrix on the training split: rows are actual classes, columns are predicted
train_confusion = confusion_matrix(y_train, y_pred)
print(train_confusion)

[[251  51]
 [ 56 140]]


In [146]:
# per-class precision, recall and f1 for the training predictions
train_report = classification_report(y_train, y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       302
           1       0.73      0.71      0.72       196

    accuracy                           0.79       498
   macro avg       0.78      0.77      0.77       498
weighted avg       0.78      0.79      0.78       498



#### Use your best 3 models to predict and evaluate on your validate sample.

In [None]:
#### Choose your best model based on its validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? To train?