# Logistic Regression

In [1076]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from prepare import prep_titanic, theometrics

### 1)
   Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [1077]:
#calling the prep_titanic function from prepare.py to split my set into train, validate and test

train, val, test = prep_titanic()
train.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,0,19.0,1,0,53.1,0,True,True,False,False,False,False,True
45,0,28.0,0,0,8.05,1,True,False,False,True,False,False,True
28,1,28.0,0,0,7.8792,1,False,False,False,True,False,True,False
633,0,28.0,0,0,0.0,1,True,True,False,False,False,False,True
403,0,28.0,1,0,15.85,0,True,False,False,True,False,False,True


In [1078]:
#defining my train and val subsets and only including age, fare, and pclass

X_train = train.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 'sex_male', 
                                'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train = train.survived

X_val = val.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 'sex_male', 
                                'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_val = val.survived

In [1079]:
#transforming age and fare in both the X_train and X_val subsets using MinMaxScaler()

mms = MinMaxScaler()

X_train[['age', 'fare']] = mms.fit_transform(X_train[['age', 'fare']])
X_val[['age', 'fare']] = mms.transform(X_val[['age', 'fare']])

X_train.head()

Unnamed: 0,age,fare,class_First,class_Second,class_Third
748,0.233476,0.103644,True,False,False
45,0.346569,0.015713,False,False,True
28,0.346569,0.015379,False,False,True
633,0.346569,0.0,True,False,False
403,0.346569,0.030937,False,False,True


In [1080]:
#baseline 

(y_train == 0).mean()

0.6163723916532905

In [1081]:
#building the LogisticRegression model

seed = 42

logreg = LogisticRegression(random_state = seed, max_iter = 300, solver = 'liblinear', penalty = 'l1')

In [1082]:
#fitting the model on the train subsets

logreg.fit(X_train, y_train)

In [1083]:
pd.DataFrame({'feature': X_train.columns,
              'coefficient': logreg.coef_[0]})

Unnamed: 0,feature,coefficient
0,age,-2.617549
1,fare,2.046834
2,class_First,1.457
3,class_Second,0.643659
4,class_Third,-0.329663


### model score

In [1084]:
#testing the performance of the train subset

logreg.score(X_train, y_train)

0.709470304975923

In [1085]:
#testing the performance of the val subset

logreg.score(X_val, y_val)

0.7089552238805971

In [1086]:
#generating the x_train and x_val predictions and assigning it to y_pred and v_pred

y_pred = logreg.predict(X_train)
v_pred = logreg.predict(X_val)

### confusion matrix

In [1087]:
confusion_matrix(y_train, y_pred)

array([[333,  51],
       [130, 109]])

### classification report

In [1088]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.87      0.79       384
           1       0.68      0.46      0.55       239

    accuracy                           0.71       623
   macro avg       0.70      0.66      0.67       623
weighted avg       0.70      0.71      0.69       623



### train metrics

In [1089]:
theometrics(333, 109, 130, 51)

Unnamed: 0,metric,value
0,Accuracy,0.70947
1,Recall,0.867188
2,True Positive Rate,0.867188
3,False Positive Rate,0.543933
4,True Negative Rate,0.456067
5,False Negative Rate,0.132812
6,Precision,0.719222
7,F1-Score,0.786305
8,Support,384.0


### val metrics

In [1090]:
vmodel1_cm = confusion_matrix(y_val, v_pred)
vmodel1_cm

array([[71, 11],
       [28, 24]])

In [1091]:
vmodel1 = theometrics(71, 24, 28, 11)
vmodel1

Unnamed: 0,metric,value
0,Accuracy,0.708955
1,Recall,0.865854
2,True Positive Rate,0.865854
3,False Positive Rate,0.538462
4,True Negative Rate,0.461538
5,False Negative Rate,0.134146
6,Precision,0.717172
7,F1-Score,0.78453
8,Support,82.0


### validate model

In [1092]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(logreg.score(X_val, y_val)))

Accuracy of KNN classifier on test set: 0.71


The model performs a little better than the baseline.

### 2)
   Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [1093]:
#defining my train and val subsets and only including age, fare, and pclass

X_train = train.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 
                                'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_train = train.survived

X_val = val.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 
                                'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_val = val.survived

X_test = test.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 
                                'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton'])
y_test = test.survived

In [1094]:
#transforming age and fare in both the X_train and X_val subsets using MinMaxScaler()

mms = MinMaxScaler()

X_train[['age', 'fare']] = mms.fit_transform(X_train[['age', 'fare']])
X_val[['age', 'fare']] = mms.transform(X_val[['age', 'fare']])
X_test[['age', 'fare']] = mms.transform(X_test[['age', 'fare']])

X_train.head()

Unnamed: 0,age,fare,sex_male,class_First,class_Second,class_Third
748,0.233476,0.103644,True,True,False,False
45,0.346569,0.015713,True,False,False,True
28,0.346569,0.015379,False,False,False,True
633,0.346569,0.0,True,True,False,False
403,0.346569,0.030937,True,False,False,True


In [1095]:
#baseline 

(y_train == 0).mean()

0.6163723916532905

In [1096]:
#building the LogisticRegression model

seed = 42

logreg = LogisticRegression(random_state = seed, max_iter = 300, solver = 'liblinear', penalty = 'l1')

In [1097]:
#fitting the model on the train subsets

logreg.fit(X_train, y_train)

In [1098]:
pd.DataFrame({'feature': X_train.columns,
              'coefficient': logreg.coef_[0]})

Unnamed: 0,feature,coefficient
0,age,-2.288738
1,fare,0.0
2,sex_male,-2.54352
3,class_First,2.142358
4,class_Second,0.999868
5,class_Third,0.0


In [1099]:
#testing the performance of the train subset

logreg.score(X_train, y_train)

0.8025682182985554

In [1100]:
#testing the performance of the val subset

logreg.score(X_val, y_val)

0.8208955223880597

In [1101]:
#generating the x_train and x_val predictions and assigning it to y_pred and v_pred

y_pred = logreg.predict(X_train)
v_pred = logreg.predict(X_val)
t_pred = logreg.predict(X_test)

### confusion matrix

In [1102]:
confusion_matrix(y_train, y_pred)

array([[333,  51],
       [ 72, 167]])

### classification report

In [1103]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       384
           1       0.77      0.70      0.73       239

    accuracy                           0.80       623
   macro avg       0.79      0.78      0.79       623
weighted avg       0.80      0.80      0.80       623



### train metrics

In [1104]:
theometrics(333, 167, 72, 51)

Unnamed: 0,metric,value
0,Accuracy,0.802568
1,Recall,0.867188
2,True Positive Rate,0.867188
3,False Positive Rate,0.301255
4,True Negative Rate,0.698745
5,False Negative Rate,0.132812
6,Precision,0.822222
7,F1-Score,0.844106
8,Support,384.0


### val metrics

In [1105]:
vmodel2_cm = confusion_matrix(y_val, v_pred)
vmodel2_cm

array([[70, 12],
       [12, 40]])

In [1106]:
vmodel2 = theometrics(70, 40, 12, 12)
vmodel2

Unnamed: 0,metric,value
0,Accuracy,0.820896
1,Recall,0.853659
2,True Positive Rate,0.853659
3,False Positive Rate,0.230769
4,True Negative Rate,0.769231
5,False Negative Rate,0.146341
6,Precision,0.853659
7,F1-Score,0.853659
8,Support,82.0


### validate model

In [1107]:
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(logreg.score(X_val, y_val)))

Accuracy of KNN classifier on test set: 0.82


In [1108]:
testmodel_3 = print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(logreg.score(X_test, y_test)))
testmodel_3

Accuracy of KNN classifier on test set: 0.74


The model with sex included performs 10% better than the first model.

### 3) a)
   Try out other combinations of features and models.

Trying out a model with age, fare, pclass, sex and embark_town_Queenstown

Also setting penalty='l2' and solver='newton-cg'

In [1109]:
#defining my train and val subsets and only including age, fare, and pclass

X_train = train.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 
                                'embark_town_Cherbourg', 'embark_town_Southampton'])
y_train = train.survived

X_val = val.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 
                                'embark_town_Cherbourg', 'embark_town_Southampton'])
y_val = val.survived

In [1110]:
#transforming age and fare in both the X_train and X_val subsets using MinMaxScaler()

mms = MinMaxScaler()

X_train[['age', 'fare']] = mms.fit_transform(X_train[['age', 'fare']])
X_val[['age', 'fare']] = mms.transform(X_val[['age', 'fare']])

X_train.head()

Unnamed: 0,age,fare,sex_male,class_First,class_Second,class_Third,embark_town_Queenstown
748,0.233476,0.103644,True,True,False,False,False
45,0.346569,0.015713,True,False,False,True,False
28,0.346569,0.015379,False,False,False,True,True
633,0.346569,0.0,True,True,False,False,False
403,0.346569,0.030937,True,False,False,True,False


In [1111]:
#baseline 

(y_train == 0).mean()

0.6163723916532905

In [1112]:
#building the LogisticRegression model

seed = 42

logreg = LogisticRegression(random_state = seed, max_iter = 300, solver = 'newton-cg', penalty = 'l2')

In [1113]:
#fitting the model on the train subsets

logreg.fit(X_train, y_train)

In [1114]:
pd.DataFrame({'feature': X_train.columns,
              'coefficient': logreg.coef_[0]})

Unnamed: 0,feature,coefficient
0,age,-2.006046
1,fare,0.449088
2,sex_male,-2.459091
3,class_First,1.040995
4,class_Second,0.028451
5,class_Third,-1.069446
6,embark_town_Queenstown,0.51957


In [1115]:
#testing the performance of the train subset

logreg.score(X_train, y_train)

0.8009630818619583

In [1116]:
#testing the performance of the val subset

logreg.score(X_val, y_val)

0.8134328358208955

In [1117]:
#generating the x_train and x_val predictions and assigning it to y_pred and v_pred

y_pred = logreg.predict(X_train)
v_pred = logreg.predict(X_val)

### confusion matrix

In [1118]:
confusion_matrix(y_train, y_pred)

array([[333,  51],
       [ 73, 166]])

### classification report

In [1119]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       384
           1       0.76      0.69      0.73       239

    accuracy                           0.80       623
   macro avg       0.79      0.78      0.79       623
weighted avg       0.80      0.80      0.80       623



### train metrics

In [1120]:
theometrics(333, 166, 73, 51)

Unnamed: 0,metric,value
0,Accuracy,0.800963
1,Recall,0.867188
2,True Positive Rate,0.867188
3,False Positive Rate,0.305439
4,True Negative Rate,0.694561
5,False Negative Rate,0.132812
6,Precision,0.820197
7,F1-Score,0.843038
8,Support,384.0


### val metrics

In [1121]:
vmodel3_cm = confusion_matrix(y_val, v_pred)
vmodel3_cm

array([[69, 13],
       [12, 40]])

In [1122]:
vmodel3 = theometrics(69, 40, 12, 13)
vmodel3

Unnamed: 0,metric,value
0,Accuracy,0.813433
1,Recall,0.841463
2,True Positive Rate,0.841463
3,False Positive Rate,0.230769
4,True Negative Rate,0.769231
5,False Negative Rate,0.158537
6,Precision,0.851852
7,F1-Score,0.846626
8,Support,82.0


### b) 

Trying out a model with age, fare, pclass, sex and embark_town_Southampton

Also setting max_iter=400, penalty='l2' and solver='lbfgs'

In [1123]:
#defining my train and val subsets and only including age, fare, and pclass

X_train = train.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 
                                'embark_town_Cherbourg', 'embark_town_Queenstown'])
y_train = train.survived

X_val = val.drop(columns = ['survived', 'sibsp', 'parch', 'alone', 
                                'embark_town_Cherbourg', 'embark_town_Queenstown'])
y_val = val.survived

In [1124]:
#transforming age and fare in both the X_train and X_val subsets using MinMaxScaler()

mms = MinMaxScaler()

X_train[['age', 'fare']] = mms.fit_transform(X_train[['age', 'fare']])
X_val[['age', 'fare']] = mms.transform(X_val[['age', 'fare']])

X_train.head()

Unnamed: 0,age,fare,sex_male,class_First,class_Second,class_Third,embark_town_Southampton
748,0.233476,0.103644,True,True,False,False,True
45,0.346569,0.015713,True,False,False,True,True
28,0.346569,0.015379,False,False,False,True,False
633,0.346569,0.0,True,True,False,False,True
403,0.346569,0.030937,True,False,False,True,True


In [1125]:
#baseline 

(y_train == 0).mean()

0.6163723916532905

In [1126]:
#building the LogisticRegression model

seed = 42

logreg = LogisticRegression(random_state = seed, max_iter = 400, solver = 'newton-cg', penalty = 'l2')

In [1127]:
#fitting the model on the train subsets

logreg.fit(X_train, y_train)

In [1128]:
pd.DataFrame({'feature': X_train.columns,
              'coefficient': logreg.coef_[0]})

Unnamed: 0,feature,coefficient
0,age,-1.959281
1,fare,0.321621
2,sex_male,-2.451129
3,class_First,0.945329
4,class_Second,0.093274
5,class_Third,-1.038603
6,embark_town_Southampton,-0.50302


In [1129]:
#testing the performance of the train subset

logreg.score(X_train, y_train)

0.8025682182985554

In [1130]:
#testing the performance of the val subset

logreg.score(X_val, y_val)

0.8059701492537313

In [1131]:
#generating the x_train and x_val predictions and assigning it to y_pred and v_pred

y_pred = logreg.predict(X_train)
v_pred = logreg.predict(X_val)

### confusion matrix

In [1132]:
confusion_matrix(y_train, y_pred)

array([[329,  55],
       [ 68, 171]])

### classification report

In [1133]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       384
           1       0.76      0.72      0.74       239

    accuracy                           0.80       623
   macro avg       0.79      0.79      0.79       623
weighted avg       0.80      0.80      0.80       623



### train metrics

In [1134]:
theometrics(329, 171, 68, 55)

Unnamed: 0,metric,value
0,Accuracy,0.802568
1,Recall,0.856771
2,True Positive Rate,0.856771
3,False Positive Rate,0.284519
4,True Negative Rate,0.715481
5,False Negative Rate,0.143229
6,Precision,0.828715
7,F1-Score,0.84251
8,Support,384.0


In [1135]:
vmodel4_cm = confusion_matrix(y_val, v_pred)
vmodel4_cm

array([[69, 13],
       [13, 39]])

### val metrics

In [1136]:
vmodel4 = theometrics(69, 39, 13, 13)
vmodel4

Unnamed: 0,metric,value
0,Accuracy,0.80597
1,Recall,0.841463
2,True Positive Rate,0.841463
3,False Positive Rate,0.25
4,True Negative Rate,0.75
5,False Negative Rate,0.158537
6,Precision,0.841463
7,F1-Score,0.841463
8,Support,82.0


### Validate the model

4) Use your best 3 models to predict and evaluate on your validate sample.

Model 2, 3 and 4 are my best performing models

In [1137]:
vmodel2['model_3'] = round(vmodel3.value * 100)
vmodel2['model_4'] = round(vmodel4.value * 100)
vmodel2 = vmodel2.rename(columns = {'value': 'model_2'})
vmodel2 = vmodel2.rename(columns = {'metric': 'OUT-OF-SAMPLE____metric'})
vmodel2.model_2 = round(vmodel2.model_2 * 100)
vmodel2

Unnamed: 0,OUT-OF-SAMPLE____metric,model_2,model_3,model_4
0,Accuracy,82.0,81.0,81.0
1,Recall,85.0,84.0,84.0
2,True Positive Rate,85.0,84.0,84.0
3,False Positive Rate,23.0,23.0,25.0
4,True Negative Rate,77.0,77.0,75.0
5,False Negative Rate,15.0,16.0,16.0
6,Precision,85.0,85.0,84.0
7,F1-Score,85.0,85.0,84.0
8,Support,8200.0,8200.0,8200.0


5) Choose your best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

Using model 2 to evaluate my test dataset

In [1138]:
#model 2 test confusion matrix

confusion_matrix(y_test, t_pred)

array([[66, 17],
       [18, 33]])

In [1139]:
#model 2 test metrics

theometrics(66, 33, 18, 17)

Unnamed: 0,metric,value
0,Accuracy,0.738806
1,Recall,0.795181
2,True Positive Rate,0.795181
3,False Positive Rate,0.352941
4,True Negative Rate,0.647059
5,False Negative Rate,0.204819
6,Precision,0.785714
7,F1-Score,0.790419
8,Support,83.0
