# Logistic Regression

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from prepare import prep_titanic, theometrics

### 1)
   Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?

In [71]:
# Load the prepared Titanic data: prep_titanic (imported from prepare.py)
# hands back three frames that are unpacked as train, validate and test.
(train, val, test) = prep_titanic()

# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
748,0,19.0,1,0,53.1,0,True,True,False,False,False,False,True
45,0,28.0,0,0,8.05,1,True,False,False,True,False,False,True
28,1,28.0,0,0,7.8792,1,False,False,False,True,False,True,False
633,0,28.0,0,0,0.0,1,True,True,False,False,False,False,True
403,0,28.0,1,0,15.85,0,True,False,False,True,False,False,True


In [72]:
# Feature/target split for model 1.
# Everything listed here is removed, which leaves age, fare and the
# class_* dummy columns (the encoded pclass) as the only features.
non_feature_cols = [
    'survived', 'sibsp', 'parch', 'alone', 'sex_male',
    'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton',
]

# The same column list is applied to both splits so they stay aligned.
X_train, y_train = train.drop(columns=non_feature_cols), train['survived']
X_val, y_val = val.drop(columns=non_feature_cols), val['survived']

In [73]:
# Rescale the two continuous features with MinMaxScaler.
# The scaler learns its min/max from the training rows only; the fitted
# scaler is then applied unchanged to both train and validate.
scale_cols = ['age', 'fare']

mms = MinMaxScaler().fit(X_train[scale_cols])

X_train[scale_cols] = mms.transform(X_train[scale_cols])
X_val[scale_cols] = mms.transform(X_val[scale_cols])

# Show the scaled training features.
X_train.head()

Unnamed: 0,age,fare,class_First,class_Second,class_Third
748,0.233476,0.103644,True,False,False
45,0.346569,0.015713,False,False,True
28,0.346569,0.015379,False,False,True
633,0.346569,0.0,True,False,False
403,0.346569,0.030937,False,False,True


In [75]:
# Baseline accuracy: the fraction of training rows whose target is 0,
# i.e. the score obtained by always predicting survived == 0.
y_train.eq(0).mean()

0.6163723916532905

In [76]:
# Configure (but do not yet fit) the logistic regression estimator.
# The seed is kept in its own variable so it is visible and reusable.
seed = 42

logreg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=300,
    random_state=seed,
)

In [77]:
# Train the estimator on the training features and target only;
# validate is held back for evaluation.
logreg.fit(X=X_train, y=y_train)

In [78]:
# Pair every feature name with its fitted coefficient (first row of coef_)
# so the direction and size of each effect can be read off a table.
pd.DataFrame(dict(feature=X_train.columns,
                  coefficient=logreg.coef_[0]))

Unnamed: 0,feature,coefficient
0,age,-2.617549
1,fare,2.046834
2,class_First,1.457
3,class_Second,0.643659
4,class_Third,-0.329663


### model score

In [79]:
# Mean accuracy of the fitted model on the training split.
logreg.score(X=X_train, y=y_train)

0.709470304975923

In [80]:
# Mean accuracy of the fitted model on the validate split (unseen during fit).
logreg.score(X=X_val, y=y_val)

0.7089552238805971

In [81]:
# Class predictions for both splits:
#   y_pred -> predictions on X_train
#   v_pred -> predictions on X_val
y_pred, v_pred = (logreg.predict(split) for split in (X_train, X_val))

### confusion matrix

In [82]:
# Confusion matrix for the training split (actual vs. predicted labels).
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[333,  51],
       [130, 109]])

### classification report

In [83]:
# Per-class precision/recall/F1 on the training split; printed because the
# report is a pre-formatted multi-line string.
train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.72      0.87      0.79       384
           1       0.68      0.46      0.55       239

    accuracy                           0.71       623
   macro avg       0.70      0.66      0.67       623
weighted avg       0.70      0.71      0.69       623



### train metrics

In [84]:
# Derive the four counts from the confusion matrix instead of hand-copying
# them from a previous cell's output, so the metrics cannot go stale when the
# data or model changes and the notebook is re-run.
# The argument order mirrors the original call: cm[0, 0], cm[1, 1], cm[1, 0],
# cm[0, 1]. NOTE(review): judging by the displayed output, theometrics appears
# to treat class 0 as the positive class (TP, TN, FP, FN) -- confirm against
# prepare.theometrics.
train_cm = confusion_matrix(y_train, y_pred)
theometrics(int(train_cm[0, 0]), int(train_cm[1, 1]),
            int(train_cm[1, 0]), int(train_cm[0, 1]))

Unnamed: 0,metric,value
0,Accuracy,0.70947
1,Recall,0.867188
2,True Positive Rate,0.867188
3,False Positive Rate,0.543933
4,True Negative Rate,0.456067
5,False Negative Rate,0.132812
6,Precision,0.719222
7,F1-Score,0.786305
8,Support,384.0


### val metrics

In [85]:
# Confusion matrix for the validate split (actual vs. predicted labels).
confusion_matrix(y_true=y_val, y_pred=v_pred)

array([[71, 11],
       [28, 24]])

In [86]:
# Derive the four counts from the validate confusion matrix instead of
# hand-copying them from a previous cell's output, so the metrics cannot go
# stale when the data or model changes and the notebook is re-run.
# The argument order mirrors the original call: cm[0, 0], cm[1, 1], cm[1, 0],
# cm[0, 1]. NOTE(review): judging by the displayed output, theometrics appears
# to treat class 0 as the positive class (TP, TN, FP, FN) -- confirm against
# prepare.theometrics.
val_cm = confusion_matrix(y_val, v_pred)
theometrics(int(val_cm[0, 0]), int(val_cm[1, 1]),
            int(val_cm[1, 0]), int(val_cm[0, 1]))

Unnamed: 0,metric,value
0,Accuracy,0.708955
1,Recall,0.865854
2,True Positive Rate,0.865854
3,False Positive Rate,0.538462
4,True Negative Rate,0.461538
5,False Negative Rate,0.134146
6,Precision,0.717172
7,F1-Score,0.78453
8,Support,82.0


### validate model

In [87]:
# Report validate-set accuracy rounded to two decimals.
# The label previously read "KNN classifier on test set", but the model is a
# logistic regression and the data scored here is the validate split.
print('Accuracy of logistic regression classifier on validate set: {:.2f}'
     .format(logreg.score(X_val, y_val)))

Accuracy of KNN classifier on test set: 0.71


The model performs better than the baseline: it reaches about 71% accuracy on both train and validate, compared with a baseline accuracy of about 62%.

### 2)
   Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.

In [88]:
# Feature/target split for model 2.
# Unlike model 1, sex_male is NOT dropped here, so the features are
# age, fare, sex_male and the class_* dummy columns (the encoded pclass).
non_feature_cols = [
    'survived', 'sibsp', 'parch', 'alone',
    'embark_town_Cherbourg', 'embark_town_Queenstown', 'embark_town_Southampton',
]

# One shared column list keeps the train and validate features aligned.
X_train = train.drop(columns=non_feature_cols)
y_train = train.survived

X_val = val.drop(columns=non_feature_cols)
y_val = val.survived

In [89]:
# Rescale the two continuous features with a fresh MinMaxScaler.
# The scaler learns its min/max from the training rows only; the fitted
# scaler is then applied unchanged to both train and validate.
scale_cols = ['age', 'fare']

mms = MinMaxScaler().fit(X_train[scale_cols])

X_train[scale_cols] = mms.transform(X_train[scale_cols])
X_val[scale_cols] = mms.transform(X_val[scale_cols])

# Show the scaled training features (now including sex_male).
X_train.head()

Unnamed: 0,age,fare,sex_male,class_First,class_Second,class_Third
748,0.233476,0.103644,True,True,False,False
45,0.346569,0.015713,True,False,False,True
28,0.346569,0.015379,False,False,False,True
633,0.346569,0.0,True,True,False,False
403,0.346569,0.030937,True,False,False,True


In [91]:
# Baseline accuracy: the fraction of training rows whose target is 0,
# i.e. the score obtained by always predicting survived == 0.
y_train.eq(0).mean()

0.6163723916532905

In [92]:
# Configure a new, unfitted logistic regression estimator for model 2,
# using the same settings as model 1 so the two are comparable.
seed = 42

logreg = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=300,
    random_state=seed,
)

In [93]:
# Train the estimator on the training features and target only;
# validate is held back for evaluation.
logreg.fit(X=X_train, y=y_train)

In [94]:
# Pair every feature name with its fitted coefficient (first row of coef_)
# so the direction and size of each effect can be read off a table.
pd.DataFrame(dict(feature=X_train.columns,
                  coefficient=logreg.coef_[0]))

Unnamed: 0,feature,coefficient
0,age,-2.288738
1,fare,0.0
2,sex_male,-2.54352
3,class_First,2.142358
4,class_Second,0.999868
5,class_Third,0.0


In [95]:
# Mean accuracy of the fitted model on the training split.
logreg.score(X=X_train, y=y_train)

0.8025682182985554

In [96]:
# Mean accuracy of the fitted model on the validate split (unseen during fit).
logreg.score(X=X_val, y=y_val)

0.8208955223880597

In [97]:
# Class predictions for both splits:
#   y_pred -> predictions on X_train
#   v_pred -> predictions on X_val
y_pred, v_pred = (logreg.predict(split) for split in (X_train, X_val))

### confusion matrix

In [98]:
# Confusion matrix for the training split (actual vs. predicted labels).
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[333,  51],
       [ 72, 167]])

### classification report

In [99]:
# Per-class precision/recall/F1 on the training split; printed because the
# report is a pre-formatted multi-line string.
train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       384
           1       0.77      0.70      0.73       239

    accuracy                           0.80       623
   macro avg       0.79      0.78      0.79       623
weighted avg       0.80      0.80      0.80       623



### train metrics

In [104]:
# Derive the four counts from the confusion matrix instead of hand-copying
# them from a previous cell's output, so the metrics cannot go stale when the
# data or model changes and the notebook is re-run.
# The argument order mirrors the original call: cm[0, 0], cm[1, 1], cm[1, 0],
# cm[0, 1]. NOTE(review): judging by the displayed output, theometrics appears
# to treat class 0 as the positive class (TP, TN, FP, FN) -- confirm against
# prepare.theometrics.
train_cm = confusion_matrix(y_train, y_pred)
theometrics(int(train_cm[0, 0]), int(train_cm[1, 1]),
            int(train_cm[1, 0]), int(train_cm[0, 1]))

Unnamed: 0,metric,value
0,Accuracy,0.802568
1,Recall,0.867188
2,True Positive Rate,0.867188
3,False Positive Rate,0.301255
4,True Negative Rate,0.698745
5,False Negative Rate,0.132812
6,Precision,0.822222
7,F1-Score,0.844106
8,Support,384.0


### val metrics

In [101]:
# Confusion matrix for the validate split (actual vs. predicted labels).
confusion_matrix(y_true=y_val, y_pred=v_pred)

array([[70, 12],
       [12, 40]])

In [105]:
# Derive the four counts from the validate confusion matrix instead of
# hand-copying them from a previous cell's output, so the metrics cannot go
# stale when the data or model changes and the notebook is re-run.
# The argument order mirrors the original call: cm[0, 0], cm[1, 1], cm[1, 0],
# cm[0, 1]. NOTE(review): judging by the displayed output, theometrics appears
# to treat class 0 as the positive class (TP, TN, FP, FN) -- confirm against
# prepare.theometrics.
val_cm = confusion_matrix(y_val, v_pred)
theometrics(int(val_cm[0, 0]), int(val_cm[1, 1]),
            int(val_cm[1, 0]), int(val_cm[0, 1]))

Unnamed: 0,metric,value
0,Accuracy,0.820896
1,Recall,0.853659
2,True Positive Rate,0.853659
3,False Positive Rate,0.230769
4,True Negative Rate,0.769231
5,False Negative Rate,0.146341
6,Precision,0.853659
7,F1-Score,0.853659
8,Support,82.0


### validate model

In [106]:
# Report validate-set accuracy rounded to two decimals.
# The label previously read "KNN classifier on test set", but the model is a
# logistic regression and the data scored here is the validate split.
print('Accuracy of logistic regression classifier on validate set: {:.2f}'
     .format(logreg.score(X_val, y_val)))

Accuracy of KNN classifier on test set: 0.82


The model with sex included is roughly 10 percentage points more accurate than the first model (train: 0.80 vs. 0.71; validate: 0.82 vs. 0.71).

### 3)
   Try out other combinations of features and models.

Trying out a model with age, fare, pclass, sex and embark_town_Queenstown

