In [3]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from acquire import get_titanic_data
from prepare import train_validate_test_split

In [52]:
df= get_titanic_data()
df=pd.DataFrame(df)
df= get_titanic_data()
df= df.drop(columns=['Unnamed: 0','passenger_id','deck','embarked','pclass'])
df['embark_town']= df.embark_town.fillna(value='Southampton')

df = pd.get_dummies(df, ['sex', 'class', 'embark_town'], drop_first=True)
df.head()

Unnamed: 0,survived,age,sibsp,parch,fare,alone,sex_male,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
0,0,22.0,1,0,7.25,0,1,0,1,0,1
1,1,38.0,1,0,71.2833,0,0,0,0,0,0
2,1,26.0,0,0,7.925,1,0,0,1,0,1
3,1,35.0,1,0,53.1,0,0,0,0,0,1
4,0,35.0,0,0,8.05,1,1,0,1,0,1


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 891 non-null    int64  
 1   age                      714 non-null    float64
 2   sibsp                    891 non-null    int64  
 3   parch                    891 non-null    int64  
 4   fare                     891 non-null    float64
 5   alone                    891 non-null    int64  
 6   sex_male                 891 non-null    uint8  
 7   class_Second             891 non-null    uint8  
 8   class_Third              891 non-null    uint8  
 9   embark_town_Queenstown   891 non-null    uint8  
 10  embark_town_Southampton  891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(5)
memory usage: 46.2 KB


In [54]:
## split the data to train, validate and test
train,validate,test= train_validate_test_split(df, target='survived', seed=123)

In [55]:
## fill the null values of train dataset on age column with mean value
train.age= train.age.fillna(train.age.mean())
validate.age=validate.age.fillna(validate.age.mean())
test.age= test.age.fillna(test.age.mean())


## Model1 with features including age

In [116]:
X_train1= train[['age','fare','class_Second','class_Third']]
y_train1= train.survived

X_validate1=validate[['age','fare','class_Second','class_Third']]
y_validate1= validate.survived


X_test1=test[['age','fare','class_Second','class_Third']]
y_test1= test.survived


In [117]:
X_train1.shape

(498, 4)

In [84]:
# creating model1 as logistic regression model
model1=LogisticRegression(C=1, random_state=123)

In [85]:
# fit the model on train data
model1.fit(X_train1, y_train1)

LogisticRegression(C=1, random_state=123)

In [90]:
#using model1 to make predictions
y_pred1= model1.predict(X_train1)
y_pred1

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,

In [86]:
model1.predict_proba(X_train1)[:3]

array([[0.43263459, 0.56736541],
       [0.65658207, 0.34341793],
       [0.62863948, 0.37136052]])

# checking class attributes
model1.classes_

In [87]:
#feature importance
print('Coefficient: \n', model1.coef_)
print('Intercept: \n', model1.intercept_)


Coefficient: 
 [[-0.02728613  0.00350386 -0.3740661  -1.58725954]]
Intercept: 
 [1.11281848]


In [91]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(model1.score(X_train1, y_train1)))


Accuracy of Logistic Regression classifier on training set: 0.69


In [93]:
print(confusion_matrix(y_train1,y_pred1))

[[249  58]
 [ 98  93]]


In [94]:
print(classification_report(y_train1, y_pred1))

              precision    recall  f1-score   support

           0       0.72      0.81      0.76       307
           1       0.62      0.49      0.54       191

    accuracy                           0.69       498
   macro avg       0.67      0.65      0.65       498
weighted avg       0.68      0.69      0.68       498



## Model 2 with sex column

In [114]:
X_train2= train[['age','fare','class_Second','class_Third','sex_male']]
y_train2= train.survived

X_validate2=validate[['age','fare','class_Second','class_Third','sex_male']]
y_validate2= validate.survived


X_test2=test[['age','fare','class_Second','class_Third','sex_male']]
y_test2= test.survived

In [115]:
X_train2.shape

(498, 5)

In [96]:
##column with sex,class, fare and age 
X_train2.head()

Unnamed: 0,age,fare,class_Second,class_Third,sex_male
583,36.0,40.125,0,0,1
165,9.0,20.525,0,1,1
50,7.0,39.6875,0,1,1
259,50.0,26.0,1,0,0
306,29.678105,110.8833,0,0,0


In [73]:
model2=LogisticRegression(C=1, random_state=123)


In [97]:
model2.fit(X_train2, y_train2)

LogisticRegression(C=1, random_state=123)

In [98]:
y_pred2= model2.predict(X_train2)
y_pred2

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

In [99]:
print(classification_report(y_train2, y_pred2))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       307
           1       0.77      0.69      0.73       191

    accuracy                           0.80       498
   macro avg       0.80      0.78      0.79       498
weighted avg       0.80      0.80      0.80       498



## With all features

In [101]:
X_train3= train.drop(columns='survived')
y_train3= train.survived

X_validate3=validate.drop(columns='survived')
y_validate3= validate.survived


X_test3=test.drop(columns='survived')
y_test3= test.survived



In [118]:
X_train3.shape

(498, 10)

In [119]:
X_train3.head()

Unnamed: 0,age,sibsp,parch,fare,alone,sex_male,class_Second,class_Third,embark_town_Queenstown,embark_town_Southampton
583,36.0,0,0,40.125,1,1,0,0,0,0
165,9.0,0,2,20.525,0,1,0,1,0,1
50,7.0,4,1,39.6875,0,1,0,1,0,1
259,50.0,0,1,26.0,0,0,1,0,0,1
306,29.678105,0,0,110.8833,1,0,0,0,0,0


In [102]:
logit=LogisticRegression(C=0.1).fit(X_train3,y_train3)

In [103]:
y_pred3=logit.predict(X_train3)
y_pred3

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [104]:
print(classification_report(y_train3, y_pred3))

              precision    recall  f1-score   support

           0       0.83      0.91      0.86       307
           1       0.82      0.70      0.75       191

    accuracy                           0.83       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.83      0.82       498



### validate model

In [105]:
y_pred1=model1.predict(X_validate1)
y_pred2= model2.predict(X_validate2)
y_pred3=logit.predict(X_validate3)
print('Validation Model 1')
print(classification_report(y_validate1,y_pred1))

print('validation Model 2')
print(classification_report(y_validate2,y_pred2))


print('validation Model 3')
print(classification_report(y_validate3,y_pred2))



Validation Model 1
              precision    recall  f1-score   support

           0       0.69      0.86      0.77       132
           1       0.63      0.39      0.48        82

    accuracy                           0.68       214
   macro avg       0.66      0.62      0.62       214
weighted avg       0.67      0.68      0.66       214

validation Model 2
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.77      0.78      0.77       214

validation Model 3
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg     

## Test Model
- since Model3 outperformed other two models, model 3 is the chosen one

In [109]:
y_pred= logit.predict(X_test3)
y_pred

array([0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1])

In [110]:
print(classification_report(y_test3, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85       110
           1       0.79      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179



In [113]:
## accuracy of test and train model is near 80% while validation is around 78%
print('Accuracy for train data: {:.2f}'.format(logit.score(X_train3, y_train3)))
print('Accuracy for validate data: {:.2f}'.format(logit.score(X_validate3, y_validate3)))
print('Accuracy for test data: {:.2f}'.format(logit.score(X_test3, y_test3)))

Accuracy for train data: 0.83
Accuracy for validate data: 0.79
Accuracy for test data: 0.82
