In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay


from pydataset import data
import acquire as ac
import env
import prepare as pr


In [2]:
def compute_metrics(TN,FP,FN,TP):
    all_ = (TP + TN + FP + FN)

    accuracy = (TP + TN) / all_

    TPR = recall = TP / (TP + FN)
    FPR = FP / (FP + TN)

    TNR = TN / (FP + TN)
    FNR = FN / (FN + TP)

    precision =  TP / (TP + FP)
    f1 =  2 * ((precision * recall) / ( precision + recall))

    support_pos = TP + FN
    support_neg = FP + TN

    print(f"Accuracy: {accuracy}\n")
    print(f"True Positive Rate/Sensitivity/Recall/Power: {TPR}")
    print(f"False Positive Rate/False Alarm Ratio/Fall-out: {FPR}")
    print(f"True Negative Rate/Specificity/Selectivity: {TNR}")
    print(f"False Negative Rate/Miss Rate: {FNR}\n")
    print(f"Precision/PPV: {precision}")
    print(f"F1 Score: {f1}\n")
    print(f"Support (0): {support_pos}")
    print(f"Support (1): {support_neg}")

#### In these exercises, we'll continue working with the titanic dataset and building logistic regression models. 

#### Throughout this exercise, be sure you are training, evaluation, and comparing models on the train and validate datasets. 

#### The test dataset should only be used for your final model.



In [3]:
titanic = ac.get_titanic_data()

titanic


dummy_df = pd.get_dummies(titanic[['sex','embarked']], drop_first=True).astype(int)
titanic = pd.concat([titanic, dummy_df], axis=1)

titanic = titanic.drop(columns=['passenger_id', 'sex', 'embarked', 'class', 'deck', 'embark_town'])

titanic['age'] = titanic['age'].fillna(29)

In [4]:
titanic.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,3,35.0,0,0,8.05,1,1,0,1


### For all of the models you create, choose a threshold that optimizes for accuracy.



In [5]:
from prepare import split_titanic_data # for some reason this fuction is working but the other ones are not
train, validate, test = split_titanic_data(titanic)


print(f'train data -> {train.shape}')
print(f'validate data -> {validate.shape}')
print(f'test data -> {test.shape}')

train data -> (498, 10)
validate data -> (214, 10)
test data -> (179, 10)


In [6]:
# create a base line using the whole data and not the split data 
# as long as i use either the test or the whole data I should be good for my accuracy 

titanic.survived.value_counts()


survived
0    549
1    342
Name: count, dtype: int64

In [7]:
baseline_accuracy = (titanic.survived == 0).mean()
baseline_accuracy

0.6161616161616161

### Create a model that includes only age, fare, and pclass. Does this model perform better than your baseline?



In [8]:
# because this is my first model i want to train it using the the test data 
# view my columns 

train.columns

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'alone',
       'sex_male', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [9]:
# SPLIT DATA INTO X AND Y DATA SETS
# this will allow us to separate the target variable and the features 

X_train = train.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'sex_male', 'embarked_Q', 'embarked_S']) # all the features, will NOT contain the target variable 

X_validate = validate.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'sex_male', 'embarked_Q', 'embarked_S']) # same as above 

X_test = test.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'sex_male', 'embarked_Q', 'embarked_S']) # same as above 

y_train = train.survived # will only contain the target variable 

y_validate = validate.survived # same as above 

y_test = test.survived # same as above 

In [10]:
# check to see my new X_train data 
X_train.head()

Unnamed: 0,pclass,age,fare
583,1,36.0,40.125
165,3,9.0,20.525
50,3,7.0,39.6875
259,2,50.0,26.0
306,1,29.0,110.8833


In [11]:
# from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(C=1, random_state=123)
logit


In [12]:
logit.fit(X_train, y_train)

In [13]:
y_pred = logit.predict(X_train)
y_pred[:5]

array([1, 0, 0, 0, 1])

In [14]:
y_pred_proba = logit.predict_proba(X_train)
y_pred_proba[:5]

array([[0.37027356, 0.62972644],
       [0.63946249, 0.36053751],
       [0.61888235, 0.38111765],
       [0.70432576, 0.29567424],
       [0.3004738 , 0.6995262 ]])

In [15]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of KNN classifier on training set: 0.70


In [16]:
cm = confusion_matrix(y_train, y_pred )

TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
print(cm)
print('')
print(f'True Positive -> {TP}')
print(f'True Negative -> {TN}')
print(f'False Positive -> {FP}')
print(f'False Negative -> {FN}')

[[267  40]
 [108  83]]

True Positive -> 83
True Negative -> 267
False Positive -> 40
False Negative -> 108


In [17]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.87      0.78       307
           1       0.67      0.43      0.53       191

    accuracy                           0.70       498
   macro avg       0.69      0.65      0.66       498
weighted avg       0.70      0.70      0.69       498



In [18]:
compute_metrics(TN,FP,FN,TP)

Accuracy: 0.7028112449799196

True Positive Rate/Sensitivity/Recall/Power: 0.43455497382198954
False Positive Rate/False Alarm Ratio/Fall-out: 0.13029315960912052
True Negative Rate/Specificity/Selectivity: 0.8697068403908795
False Negative Rate/Miss Rate: 0.5654450261780105

Precision/PPV: 0.6747967479674797
F1 Score: 0.5286624203821656

Support (0): 191
Support (1): 307


### Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model.



In [19]:
# SPLIT DATA INTO X AND Y DATA SETS
# this will allow us to separate the target variable and the features 

X_train2 = train.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'embarked_Q', 'embarked_S']) # all the features, will NOT contain the target variable 

X_validate2 = validate.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'embarked_Q', 'embarked_S']) # same as above 

X_test2 = test.drop(columns=['survived', 'sibsp', 'parch', 'alone', 'embarked_Q', 'embarked_S']) # same as above 

# all the y trains have remained the same

In [20]:
# test new X_train 
X_train2.head()

Unnamed: 0,pclass,age,fare,sex_male
583,1,36.0,40.125,1
165,3,9.0,20.525,1
50,3,7.0,39.6875,1
259,2,50.0,26.0,0
306,1,29.0,110.8833,0


In [21]:
# from sklearn.linear_model import LogisticRegression
logit2 = LogisticRegression(C=1, random_state=123)
logit2

In [22]:
logit2.fit(X_train2, y_train)

In [23]:
y_pred = logit2.predict(X_train2)
y_pred[:5]

array([0, 0, 0, 1, 1])

In [24]:
y_pred_proba = logit2.predict_proba(X_train2)
y_pred_proba[:5]

array([[0.5593917 , 0.4406083 ],
       [0.86082718, 0.13917282],
       [0.8583555 , 0.1416445 ],
       [0.2932214 , 0.7067786 ],
       [0.0730453 , 0.9269547 ]])

In [25]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit2.score(X_train2, y_train)))

Accuracy of KNN classifier on training set: 0.82


In [26]:
cm = confusion_matrix(y_train, y_pred )

TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
print(cm)
print('')
print(f'True Positive -> {TP}')
print(f'True Negative -> {TN}')
print(f'False Positive -> {FP}')
print(f'False Negative -> {FN}')

[[267  40]
 [ 52 139]]

True Positive -> 139
True Negative -> 267
False Positive -> 40
False Negative -> 52


In [27]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85       307
           1       0.78      0.73      0.75       191

    accuracy                           0.82       498
   macro avg       0.81      0.80      0.80       498
weighted avg       0.81      0.82      0.81       498



In [28]:
compute_metrics(TN,FP,FN,TP)

Accuracy: 0.8152610441767069

True Positive Rate/Sensitivity/Recall/Power: 0.7277486910994765
False Positive Rate/False Alarm Ratio/Fall-out: 0.13029315960912052
True Negative Rate/Specificity/Selectivity: 0.8697068403908795
False Negative Rate/Miss Rate: 0.27225130890052357

Precision/PPV: 0.776536312849162
F1 Score: 0.7513513513513513

Support (0): 191
Support (1): 307


### Try out other combinations of features and models.



In [29]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
583,0,1,36.0,0,0,40.125,1,1,0,0
165,1,3,9.0,0,2,20.525,0,1,0,1
50,0,3,7.0,4,1,39.6875,0,1,0,1
259,1,2,50.0,0,1,26.0,0,0,0,1
306,1,1,29.0,0,0,110.8833,1,0,0,0


In [30]:
# SPLIT DATA INTO X AND Y DATA SETS
# this will allow us to separate the target variable and the features 

X_train3 = train.drop(columns=['survived', 'sibsp', 'parch', 'alone']) # all the features, will NOT contain the target variable 

X_validate3 = validate.drop(columns=['survived', 'sibsp', 'parch', 'alone']) # same as above 

X_test3 = test.drop(columns=['survived', 'sibsp', 'parch', 'alone']) # same as above 

# all the y trains have remained the same

In [31]:
X_train3.head()

Unnamed: 0,pclass,age,fare,sex_male,embarked_Q,embarked_S
583,1,36.0,40.125,1,0,0
165,3,9.0,20.525,1,0,1
50,3,7.0,39.6875,1,0,1
259,2,50.0,26.0,0,0,1
306,1,29.0,110.8833,0,0,0


In [32]:
# from sklearn.linear_model import LogisticRegression
logit3 = LogisticRegression(C=1, random_state=123)
logit3

In [33]:
logit3.fit(X_train3, y_train)

In [34]:
y_pred = logit3.predict(X_train3)
y_pred[:5]

array([0, 0, 0, 1, 1])

In [35]:
y_pred_proba = logit3.predict_proba(X_train3)
y_pred_proba[:5]

array([[0.52766831, 0.47233169],
       [0.87271637, 0.12728363],
       [0.8711822 , 0.1288178 ],
       [0.31583322, 0.68416678],
       [0.06878303, 0.93121697]])

In [36]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit3.score(X_train3, y_train)))

Accuracy of KNN classifier on training set: 0.80


In [37]:
cm = confusion_matrix(y_train, y_pred )

TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
print(cm)
print('')
print(f'True Positive -> {TP}')
print(f'True Negative -> {TN}')
print(f'False Positive -> {FP}')
print(f'False Negative -> {FN}')

[[261  46]
 [ 52 139]]

True Positive -> 139
True Negative -> 261
False Positive -> 46
False Negative -> 52


In [38]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       307
           1       0.75      0.73      0.74       191

    accuracy                           0.80       498
   macro avg       0.79      0.79      0.79       498
weighted avg       0.80      0.80      0.80       498



In [39]:
compute_metrics(TN,FP,FN,TP)

Accuracy: 0.8032128514056225

True Positive Rate/Sensitivity/Recall/Power: 0.7277486910994765
False Positive Rate/False Alarm Ratio/Fall-out: 0.1498371335504886
True Negative Rate/Specificity/Selectivity: 0.8501628664495114
False Negative Rate/Miss Rate: 0.27225130890052357

Precision/PPV: 0.7513513513513513
F1 Score: 0.7393617021276596

Support (0): 191
Support (1): 307


In [40]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
583,0,1,36.0,0,0,40.125,1,1,0,0
165,1,3,9.0,0,2,20.525,0,1,0,1
50,0,3,7.0,4,1,39.6875,0,1,0,1
259,1,2,50.0,0,1,26.0,0,0,0,1
306,1,1,29.0,0,0,110.8833,1,0,0,0


In [41]:
# SPLIT DATA INTO X AND Y DATA SETS
# this will allow us to separate the target variable and the features 

X_train4 = train.drop(columns=['survived', 'sibsp', 'alone']) # all the features, will NOT contain the target variable 

X_validate4 = validate.drop(columns=['survived', 'sibsp', 'alone']) # same as above 

X_test4 = test.drop(columns=['survived', 'sibsp', 'alone']) # same as above 

# all the y trains have remained the same

In [42]:
X_test4.head()

Unnamed: 0,pclass,age,parch,fare,sex_male,embarked_Q,embarked_S
561,3,40.0,0,7.8958,1,0,1
641,1,24.0,0,69.3,0,0,0
400,3,39.0,0,7.925,1,0,1
498,1,25.0,2,151.55,0,0,1
875,3,15.0,0,7.225,0,0,0


In [43]:
# from sklearn.linear_model import LogisticRegression
logit4 = LogisticRegression(C=1, random_state=123)
logit4

In [44]:
logit4.fit(X_train4, y_train)

In [45]:
y_pred = logit4.predict(X_train4)
y_pred[:5]

array([0, 0, 0, 1, 1])

In [46]:
y_pred_proba = logit4.predict_proba(X_train4)
y_pred_proba[:5]

array([[0.55474486, 0.44525514],
       [0.88750218, 0.11249782],
       [0.87042344, 0.12957656],
       [0.33592518, 0.66407482],
       [0.06935277, 0.93064723]])

In [47]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit4.score(X_train4, y_train)))

Accuracy of KNN classifier on training set: 0.81


In [48]:
cm = confusion_matrix(y_train, y_pred )

TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
print(cm)
print('')
print(f'True Positive -> {TP}')
print(f'True Negative -> {TN}')
print(f'False Positive -> {FP}')
print(f'False Negative -> {FN}')

[[263  44]
 [ 49 142]]

True Positive -> 142
True Negative -> 263
False Positive -> 44
False Negative -> 49


In [49]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       307
           1       0.76      0.74      0.75       191

    accuracy                           0.81       498
   macro avg       0.80      0.80      0.80       498
weighted avg       0.81      0.81      0.81       498



In [50]:
compute_metrics(TN,FP,FN,TP)

Accuracy: 0.8132530120481928

True Positive Rate/Sensitivity/Recall/Power: 0.743455497382199
False Positive Rate/False Alarm Ratio/Fall-out: 0.14332247557003258
True Negative Rate/Specificity/Selectivity: 0.8566775244299675
False Negative Rate/Miss Rate: 0.25654450261780104

Precision/PPV: 0.7634408602150538
F1 Score: 0.753315649867374

Support (0): 191
Support (1): 307


In [51]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,sex_male,embarked_Q,embarked_S
583,0,1,36.0,0,0,40.125,1,1,0,0
165,1,3,9.0,0,2,20.525,0,1,0,1
50,0,3,7.0,4,1,39.6875,0,1,0,1
259,1,2,50.0,0,1,26.0,0,0,0,1
306,1,1,29.0,0,0,110.8833,1,0,0,0


In [52]:
# SPLIT DATA INTO X AND Y DATA SETS
# this will allow us to separate the target variable and the features 

X_train5 = train.drop(columns=['survived', 'sibsp','embarked_Q', 'embarked_S']) # all the features, will NOT contain the target variable 

X_validate5 = validate.drop(columns=['survived', 'sibsp','embarked_Q', 'embarked_S']) # same as above 

X_test5 = test.drop(columns=['survived', 'sibsp','embarked_Q', 'embarked_S']) # same as above 

# all the y trains have remained the same

In [53]:
X_test5.head()

Unnamed: 0,pclass,age,parch,fare,alone,sex_male
561,3,40.0,0,7.8958,1,1
641,1,24.0,0,69.3,1,0
400,3,39.0,0,7.925,1,1
498,1,25.0,2,151.55,0,0
875,3,15.0,0,7.225,1,0


In [54]:
# from sklearn.linear_model import LogisticRegression
logit5 = LogisticRegression(C=1, random_state=123)
logit5

In [55]:
logit5.fit(X_train5, y_train)

In [56]:
y_pred = logit5.predict(X_train5)
y_pred[:5]

array([0, 0, 0, 1, 1])

In [57]:
y_pred_proba = logit5.predict_proba(X_train5)
y_pred_proba[:5]

array([[0.56411743, 0.43588257],
       [0.88847363, 0.11152637],
       [0.86096645, 0.13903355],
       [0.30362208, 0.69637792],
       [0.06813351, 0.93186649]])

In [58]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit5.score(X_train5, y_train)))

Accuracy of KNN classifier on training set: 0.81


In [59]:
cm = confusion_matrix(y_train, y_pred )

TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
print(cm)
print('')
print(f'True Positive -> {TP}')
print(f'True Negative -> {TN}')
print(f'False Positive -> {FP}')
print(f'False Negative -> {FN}')

[[265  42]
 [ 54 137]]

True Positive -> 137
True Negative -> 265
False Positive -> 42
False Negative -> 54


In [60]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       307
           1       0.77      0.72      0.74       191

    accuracy                           0.81       498
   macro avg       0.80      0.79      0.79       498
weighted avg       0.81      0.81      0.81       498



In [61]:
compute_metrics(TN,FP,FN,TP)

Accuracy: 0.8072289156626506

True Positive Rate/Sensitivity/Recall/Power: 0.7172774869109948
False Positive Rate/False Alarm Ratio/Fall-out: 0.13680781758957655
True Negative Rate/Specificity/Selectivity: 0.8631921824104235
False Negative Rate/Miss Rate: 0.28272251308900526

Precision/PPV: 0.7653631284916201
F1 Score: 0.7405405405405405

Support (0): 191
Support (1): 307


### Use you best 3 models to predict and evaluate on your validate sample.



In [62]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))


Accuracy of KNN classifier on training set: 0.70


In [63]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit2.score(X_train2, y_train)))


Accuracy of KNN classifier on training set: 0.82


In [64]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit3.score(X_train3, y_train)))


Accuracy of KNN classifier on training set: 0.80


In [65]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit4.score(X_train4, y_train)))


Accuracy of KNN classifier on training set: 0.81


In [66]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(logit5.score(X_train5, y_train)))


Accuracy of KNN classifier on training set: 0.81


In [74]:
#best models in order
# 1 logit2 0.82
# 2 logit4 0.81
# 3 logit5 0.81

In [72]:
# Make prediction for validate dataset

y_pred_validate = logit2.predict(X_validate2)
y_pred_validate2 = logit4.predict(X_validate4)
y_pred_validate3 = logit5.predict(X_validate5)

In [73]:
print("Model 1:c = 1, randomstate = 123")

print('Accuracy: {:.2f}'.format(logit2.score(X_validate2, y_validate)))

print(classification_report(y_validate, y_pred_validate))

print('--------------------------------------------------')

print("Model 2:c = 1, randomstate = 123")

print('Accuracy: {:.2f}'.format(logit4.score(X_validate4, y_validate)))

print(classification_report(y_validate, y_pred_validate))

print("Model 3:c = 1, randomstate = 123")

print('Accuracy: {:.2f}'.format(logit5.score(X_validate5, y_validate)))

print(classification_report(y_validate, y_pred_validate))


Model 1:c = 1, randomstate = 123
Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.77      0.78      0.77       214

--------------------------------------------------
Model 2:c = 1, randomstate = 123
Accuracy: 0.76
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.70        82

    accuracy                           0.78       214
   macro avg       0.76      0.76      0.76       214
weighted avg       0.77      0.78      0.77       214

Model 3:c = 1, randomstate = 123
Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       132
           1       0.72      0.67      0.7

### Choose you best model from the validation performation, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?



In [76]:
y_pred_test = logit5.predict(X_test5)

print('Accuracy: {:.2f}'.format(logit5.score(X_test5, y_test)))

print(classification_report(y_test, y_pred_test))

Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       110
           1       0.74      0.71      0.73        69

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



In [None]:
# model 3 performed the best 
# train 81 percent 
# validate 78 percent 
# test 79 percrnt 

# all the meterics are close to each other. 