# Classification using Logisitic Regression

In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

from acquire import get_titanic_data
from prepare import prep_titanic    

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_titanic = prep_titanic()

In [3]:
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Queenstown,Southampton,Second,Third,male
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,0,1,0,1,1
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0,0,0,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,0,1,0,1,0
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,0,1,0,0,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,0,1,0,1,1


In [4]:
df_titanic.dropna(how='any', subset=['age'], inplace=True)

# Test Models
## _Model 1_

In [5]:
# X = df_titanic.loc[:, 'fare':'class_Third']
X = df_titanic[['pclass','age','fare','sibsp','parch']]
y = df_titanic['survived']

X_train_validate, X_test, y_train_validate, y_test = train_test_split(X,
                                                                      y,
                                                                      test_size=.2,
                                                                      random_state=123
                                                                      )

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate,
                                                            y_train_validate,
                                                            test_size=.3,
                                                            random_state=123
                                                            )


print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (399, 5) , validate:  (172, 5) , test:  (143, 5)
train:  (399,) , validate:  (172,) , test:  (143,)


In [6]:
logit_test_model_1 = LogisticRegression(C=1,
                                   class_weight={0:1, 1:99},
                                   random_state=123,
                                   intercept_scaling=1,
                                   solver='lbfgs'
                                   )

In [7]:
logit_test_model_1.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={0: 1, 1: 99}, random_state=123)

In [8]:
print('Coefficient: \n', logit_test_model_1.coef_)
print('Intercept: \n', logit_test_model_1.intercept_)

Coefficient: 
 [[-1.16035326 -0.03110108  0.00399115 -0.45129797  0.52083144]]
Intercept: 
 [7.65390045]


In [9]:
y_pred = logit_test_model_1.predict(X_train)
y_pred_proba = logit_test_model_1.predict_proba(X_train)
print(y_pred)
print(y_pred_proba)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[[5.38032613e-03 9.94619674e-01]
 [9.95313098e-03 9.90046869e-01]
 [1.98570853e-03 9.98014291e-01]
 [1.01714994e-02 9.89828501e-01]
 [3.88457002e-02 9.61154300e-01]
 [2.78936731e-02 9.721063

In [10]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit_test_model_1.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.43


In [11]:
print(confusion_matrix(y_train, y_pred))

[[  0 228]
 [  0 171]]


In [12]:
print(classification_report(y_train, y_pred))

precision    recall  f1-score   support

           0       0.00      0.00      0.00       228
           1       0.43      1.00      0.60       171

    accuracy                           0.43       399
   macro avg       0.21      0.50      0.30       399
weighted avg       0.18      0.43      0.26       399



## _Model 2_

In [13]:
logit_test_model_2 = LogisticRegression(C=.1,
class_weight={0:1, 1:99},
random_state=123,
intercept_scaling=1,
solver='lbfgs')

In [14]:
logit_test_model_2.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight={0: 1, 1: 99}, random_state=123)

In [15]:
print('Coefficient: \n', logit_test_model_2.coef_)
print('Intercept: \n', logit_test_model_2.intercept_)

Coefficient: 
 [[-1.00329345 -0.02877517  0.00604013 -0.40518742  0.44919365]]
Intercept: 
 [7.14670159]


In [16]:
y_pred2 = logit_test_model_2.predict(X_train)
y_pred_proba2 = logit_test_model_2.predict_proba(X_train)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit_test_model_2.score(X_train, y_train)))

print(confusion_matrix(y_train, y_pred2))

print(classification_report(y_train, y_pred2))

Accuracy of Logistic Regression classifier on training set: 0.43
[[  0 228]
 [  0 171]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       228
           1       0.43      1.00      0.60       171

    accuracy                           0.43       399
   macro avg       0.21      0.50      0.30       399
weighted avg       0.18      0.43      0.26       399



# Logistic Regression Exercises

## 1.
Start by defining your baseline model. 

## 2.
Create another model that includes `age` in addition to `fare` and `pclass`. Does this model perform better than your previous one?
### Model 1

In [17]:
X = df_titanic[['pclass', 'age', 'fare']]
y = df_titanic['survived']

X_train_validate, X_test, y_train_validate, y_test = train_test_split(X,
                                                                      y,
                                                                      test_size=.2,
                                                                      random_state=123
                                                                      )

X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate,
                                                            y_train_validate,
                                                            test_size=.3,
                                                            random_state=123
                                                            )


print("train: ", X_train.shape, ", validate: ", X_validate.shape, ", test: ", X_test.shape)
print("train: ", y_train.shape, ", validate: ", y_validate.shape, ", test: ", y_test.shape)

train:  (399, 3) , validate:  (172, 3) , test:  (143, 3)
train:  (399,) , validate:  (172,) , test:  (143,)


In [18]:
logit = LogisticRegression(random_state=123, solver='lbfgs')

In [19]:
logit.fit(X_train, y_train)

LogisticRegression(random_state=123)

In [20]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

y_pred = logit.predict(X_train)
y_pred_proba = logit.predict_proba(X_train)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

print(confusion_matrix(y_train, y_pred))

print(classification_report(y_train, y_pred))

Coefficient: 
 [[-1.00444388 -0.02941366  0.00518596]]
Intercept: 
 [2.60928736]
Accuracy of Logistic Regression classifier on training set: 0.70
[[191  37]
 [ 81  90]]
              precision    recall  f1-score   support

           0       0.70      0.84      0.76       228
           1       0.71      0.53      0.60       171

    accuracy                           0.70       399
   macro avg       0.71      0.68      0.68       399
weighted avg       0.70      0.70      0.70       399



## 3.
Include `sex` in your model as well.
> Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

### Model 2

In [21]:
X = df_titanic[['pclass', 'age', 'fare', 'male']]
y = df_titanic['survived']

X_train_validate2, X_test2, y_train_validate2, y_test2 = train_test_split(X,
                                                                      y,
                                                                      test_size=.2,
                                                                      random_state=123
                                                                      )

X_train2, X_validate2, y_train2, y_validate2 = train_test_split(X_train_validate,
                                                            y_train_validate,
                                                            test_size=.3,
                                                            random_state=123
                                                            )


print("train: ", X_train2.shape, ", validate: ", X_validate2.shape, ", test: ", X_test2.shape)
print("train: ", y_train2.shape, ", validate: ", y_validate2.shape, ", test: ", y_test2.shape)

train:  (399, 3) , validate:  (172, 3) , test:  (143, 4)
train:  (399,) , validate:  (172,) , test:  (143,)


In [22]:
logit_2 = LogisticRegression(random_state=123, intercept_scaling=1, solver='lbfgs')

In [23]:
logit_2.fit(X_train2, y_train2)

print('Coefficient: \n', logit_2.coef_)
print('Intercept: \n', logit_2.intercept_)

y_pred2 = logit_2.predict(X_train2)
y_pred_proba2 = logit_2.predict_proba(X_train2)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit_2.score(X_train2, y_train2)))

print(confusion_matrix(y_train2, y_pred2))

print(classification_report(y_train2, y_pred2))

Coefficient: 
 [[-1.00444388 -0.02941366  0.00518596]]
Intercept: 
 [2.60928736]
Accuracy of Logistic Regression classifier on training set: 0.70
[[191  37]
 [ 81  90]]
              precision    recall  f1-score   support

           0       0.70      0.84      0.76       228
           1       0.71      0.53      0.60       171

    accuracy                           0.70       399
   macro avg       0.71      0.68      0.68       399
weighted avg       0.70      0.70      0.70       399



## 4.
Try out other combinations of features and models.

### Model 3

In [24]:
X = df_titanic[['pclass', 'age', 'fare', 'male', 'Queenstown', 'Southampton']]
y = df_titanic['survived']

X_train_validate3, X_test3, y_train_validate3, y_test3 = train_test_split(X,
                                                                          y,
                                                                          test_size=.2,
                                                                          random_state=123
                                                                          )

X_train3, X_validate3, y_train3, y_validate3 = train_test_split(X_train_validate3,
                                                            y_train_validate3,
                                                            test_size=.3,
                                                            random_state=123
                                                            )


print("train: ", X_train3.shape, ", validate: ", X_validate3.shape, ", test: ", X_test3.shape)
print("train: ", y_train3.shape, ", validate: ", y_validate3.shape, ", test: ", y_test3.shape)

train:  (399, 6) , validate:  (172, 6) , test:  (143, 6)
train:  (399,) , validate:  (172,) , test:  (143,)


In [25]:
logit_3 = LogisticRegression(random_state=123, solver='lbfgs')

In [26]:
logit_3.fit(X_train3, y_train3)

print('Coefficient: \n', logit_3.coef_)
print('Intercept: \n', logit_3.intercept_)

y_pred3 = logit_3.predict(X_train3)
y_pred_proba3 = logit_3.predict_proba(X_train3)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit_3.score(X_train3, y_train3)))

print(confusion_matrix(y_train3, y_pred3))

print(classification_report(y_train3, y_pred3))

Coefficient: 
 [[-1.05899184e+00 -2.77741850e-02  2.50423675e-03 -2.53771314e+00
  -9.11492204e-01 -4.04828933e-01]]
Intercept: 
 [4.71610197]
Accuracy of Logistic Regression classifier on training set: 0.81
[[196  32]
 [ 44 127]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       228
           1       0.80      0.74      0.77       171

    accuracy                           0.81       399
   macro avg       0.81      0.80      0.80       399
weighted avg       0.81      0.81      0.81       399



### Model 4

In [27]:
X = df_titanic[['fare', 'male']]
y = df_titanic['survived']

X_train_validate4, X_test4, y_train_validate4, y_test4 = train_test_split(X,
                                                                      y,
                                                                      test_size=.2,
                                                                      random_state=123
                                                                      )

X_train4, X_validate4, y_train4, y_validate4 = train_test_split(X_train_validate4,
                                                            y_train_validate4,
                                                            test_size=.3,
                                                            random_state=123
                                                            )


print("train: ", X_train4.shape, ", validate: ", X_validate4.shape, ", test: ", X_test4.shape)
print("train: ", y_train4.shape, ", validate: ", y_validate4.shape, ", test: ", y_test4.shape)

train:  (399, 2) , validate:  (172, 2) , test:  (143, 2)
train:  (399,) , validate:  (172,) , test:  (143,)


In [28]:
logit_4 = LogisticRegression(random_state=123, solver='lbfgs')

In [30]:
logit_4.fit(X_train4, y_train4)

print('Coefficient: \n', logit_4.coef_)
print('Intercept: \n', logit_4.intercept_)

y_pred4 = logit_4.predict(X_train4)
y_pred_proba4 = logit_4.predict_proba(X_train4)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit_4.score(X_train4, y_train4)))

print(confusion_matrix(y_train4, y_pred4))

print(classification_report(y_train4, y_pred4))

Coefficient: 
 [[ 0.01527599 -2.44659372]]
Intercept: 
 [0.74930988]
Accuracy of Logistic Regression classifier on training set: 0.79
[[196  32]
 [ 51 120]]
              precision    recall  f1-score   support

           0       0.79      0.86      0.83       228
           1       0.79      0.70      0.74       171

    accuracy                           0.79       399
   macro avg       0.79      0.78      0.78       399
weighted avg       0.79      0.79      0.79       399



## 5.
Use the best 3 models to predict and evaluate on the validate sample.

In [33]:
print("Accuracy Scores")
print('-' * 15)
print(f"Model 2: {logit_2.score(X_train2, y_train2):.2%}")
print(f"Model 3: {logit_3.score(X_train3, y_train3):.2%}")
print(f"Model 4: {logit_4.score(X_train4, y_train4):.2%}")

Accuracy Scores
---------------
Model 2: 70.43%
Model 3: 80.95%
Model 4: 79.20%


In [37]:
y_pred2 = logit_2.predict(X_validate2)
y_pred3 = logit_3.predict(X_validate3)
y_pred4 = logit_4.predict(X_validate4)

print("Model 2: solver = lbfgs, c = 1")

print('Accuracy: {:.3f}'.format(logit_2.score(X_validate2, y_validate2)))

print(confusion_matrix(y_validate2, y_pred2))

print(classification_report(y_validate2, y_pred2))

print("Model 3: solver = lbfgs, c = 1")

print('Accuracy: {:.3f}'.format(logit_3.score(X_validate3, y_validate3)))

print(confusion_matrix(y_validate3, y_pred3))

print(classification_report(y_validate3, y_pred3))

print("Model 4: solver = lbfgs, c = 1")

print('Accuracy: {:.3f}'.format(logit_4.score(X_validate4, y_validate4)))

print(confusion_matrix(y_validate4, y_pred4))

print(classification_report(y_validate4, y_pred4))


Model 2: solver = lbfgs, c = 1
Accuracy: 0.727
[[88 21]
 [26 37]]
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       109
           1       0.64      0.59      0.61        63

    accuracy                           0.73       172
   macro avg       0.70      0.70      0.70       172
weighted avg       0.72      0.73      0.72       172

Model 3: solver = lbfgs, c = .1
Accuracy: 0.779
[[85 24]
 [14 49]]
              precision    recall  f1-score   support

           0       0.86      0.78      0.82       109
           1       0.67      0.78      0.72        63

    accuracy                           0.78       172
   macro avg       0.76      0.78      0.77       172
weighted avg       0.79      0.78      0.78       172

Model 4: solver = lbfgs, c = .1
Accuracy: 0.779
[[89 20]
 [18 45]]
              precision    recall  f1-score   support

           0       0.83      0.82      0.82       109
           1       0.69      0.71    

##  6.
Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train? 

In [42]:
y_pred3 = logit_3.predict(X_test3)

print("Model 4: solver = lbfgs, c = 1")

print('Accuracy: {:.3f}'.format(logit_3.score(X_test3, y_test3)))

print(confusion_matrix(y_test3, y_pred3))

print(classification_report(y_test3, y_pred3))

Model 4: solver = lbfgs, c = 1
Accuracy: 0.755
[[68 19]
 [16 40]]
              precision    recall  f1-score   support

           0       0.81      0.78      0.80        87
           1       0.68      0.71      0.70        56

    accuracy                           0.76       143
   macro avg       0.74      0.75      0.75       143
weighted avg       0.76      0.76      0.76       143

