In [1]:
# DS Libraries
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# knn submodules from scikit learn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.linear_model import LogisticRegression

# Data Acquisition
from pydataset import data
import env
import acquire as acq
import prepare as prp

In [2]:
# 1 Fit a LogisticRegression classifier to the TRAIN data
# Load the Titanic data through the project's acquire module, then run the
# project's prep step on it.
df = acq.get_titanic_data('titanic_db')
df = prp.prep_titanic(df)

# Split into train / validate / test. 'survived' is passed as the second
# argument (presumably the stratification target -- confirm in prepare.py).
train, validate, test = prp.split_data(df, 'survived')

# Determine candidate drivers of the target: every column except the last two.
# NOTE(review): this slice still contains the target 'survived' itself, so it
# ends up in cat_cols -- confirm that is intended for exploration.
candidate_cols = train.columns[:-2]

# Object-dtype columns and columns with fewer than 10 distinct values are
# treated as categorical; everything else is treated as numeric.
cat_cols, num_cols = [], []
for col in candidate_cols:
    if train[col].dtype == 'O' or train[col].nunique() < 10:
        cat_cols.append(col)
    else:
        num_cols.append(col)

In [3]:
# Show the categorical and numeric column groups side by side.
(cat_cols, num_cols)

(['survived',
  'pclass',
  'sex',
  'sibsp',
  'parch',
  'embark_town',
  'alone',
  'sex_male'],
 ['age', 'fare'])

In [4]:
# Combined list of columns to explore: categorical first, then numeric.
explore_cols = cat_cols + num_cols

# Class balance of the target in the training split.
train.survived.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [5]:
# Baseline model: always predict the most frequent class of the target.
# Deriving the class from the data (rather than hard-coding 0) keeps the
# baseline correct if the split or the class balance ever changes.
train['baseline_pred'] = train.survived.mode()[0]

# Rows = baseline prediction, columns = actual survival.
pd.crosstab(train.baseline_pred, train.survived)

survived,0,1
baseline_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,307,191


In [6]:
# Baseline accuracy: share of training rows where the constant baseline
# prediction equals the actual label (about 61.6%).
baseline_acc = train.baseline_pred.eq(train.survived).mean()
baseline_acc

0.6164658634538153

### 1 Create a model with age, fare, and pclass

In [7]:
# Features for model 1, listed explicitly.
# An allow-list is used instead of removing unwanted columns from
# train.columns: with a deny-list, any column added to `train` later (as
# happened with 'baseline_pred') silently becomes a model feature.
X_cols = ['pclass', 'age', 'fare']

# Name of the target column.
y_cols = 'survived'

X_cols

['pclass', 'age', 'fare']

In [8]:
# Inspect the training split: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 542 to 522
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   survived                 498 non-null    int64  
 1   pclass                   498 non-null    int64  
 2   sex                      498 non-null    object 
 3   age                      498 non-null    float64
 4   sibsp                    498 non-null    int64  
 5   parch                    498 non-null    int64  
 6   fare                     498 non-null    float64
 7   embark_town              498 non-null    object 
 8   alone                    498 non-null    int64  
 9   sex_male                 498 non-null    uint8  
 10  embark_town_Queenstown   498 non-null    uint8  
 11  embark_town_Southampton  498 non-null    uint8  
 12  baseline_pred            498 non-null    int64  
dtypes: float64(2), int64(6), object(2), uint8(3)
memory usage: 44.3+ KB


In [9]:
# Feature matrix (model 1 columns) and target vector for each split.
X_train, y_train = train[X_cols], train['survived']
X_validate, y_validate = validate[X_cols], validate['survived']
X_test, y_test = test[X_cols], test['survived']

# Preview the training features.
X_train.head()

Unnamed: 0,pclass,age,fare
542,3,11.0,31.275
457,1,29.699118,51.8625
205,3,2.0,10.4625
208,3,16.0,7.75
485,3,29.699118,25.4667


In [10]:
# Model 1: L2-penalised logistic regression (lbfgs solver) fitted on the
# training split; fit() returns the estimator, so it can be chained.
logit = LogisticRegression(penalty='l2', solver='lbfgs').fit(X_train, y_train)

# In-sample predictions, used by the crosstab / classification report below.
y_pred = logit.predict(X_train)

In [11]:
# Display the labels model 1 predicted for the training rows.
y_pred

array([0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,

In [12]:
# Mean accuracy of model 1 on the training split.
logit_score = logit.score(X=X_train, y=y_train)
logit_score

0.7048192771084337

In [13]:
# Confusion table: rows are the predictions, columns are the actual
# survived status.
pd.crosstab(index=y_pred, columns=y_train)

survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,265,105
1,42,86


In [14]:
# Per-class precision / recall / f1 for model 1 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.72      0.86      0.78       307
           1       0.67      0.45      0.54       191

    accuracy                           0.70       498
   macro avg       0.69      0.66      0.66       498
weighted avg       0.70      0.70      0.69       498



In [15]:
# Train vs. validate accuracy for model 1 (validate rounded to 4 places).
print(
    '\nPerformance in accuracy of LOGIT on training data:\n'
    f'Accuracy (train): {logit.score(X_train, y_train)}\n'
    f'Accuracy (validate): {round(logit.score(X_validate, y_validate), 4)}'
)


Performance in accuracy of LOGIT on training data:
Accuracy (train): 0.7048192771084337
Accuracy (validate): 0.6963


### 2 Create a model with age, fare, and pclass, and add sex_male

In [36]:
# Features for model 2: model 1's features plus the encoded sex column.
# Listed explicitly rather than derived by removing columns from
# train.columns, so that columns added to `train` later cannot leak in.
X_cols2 = ['pclass', 'age', 'fare', 'sex_male']

# Name of the target column.
y_cols = 'survived'

X_cols2

['pclass', 'age', 'fare', 'sex_male']

In [17]:
# Feature matrix (model 2 columns) and target vector for each split.
X_train, y_train = train[X_cols2], train['survived']
X_validate, y_validate = validate[X_cols2], validate['survived']
X_test, y_test = test[X_cols2], test['survived']

# Preview the training features.
X_train.head()

Unnamed: 0,pclass,age,fare,sex_male
542,3,11.0,31.275,0
457,1,29.699118,51.8625,0
205,3,2.0,10.4625,0
208,3,16.0,7.75,0
485,3,29.699118,25.4667,0


In [18]:
# Model 2: same estimator settings as model 1, fitted on the model 2
# feature set; fit() returns the estimator, so it can be chained.
logit2 = LogisticRegression(penalty='l2', solver='lbfgs').fit(X_train, y_train)

# In-sample predictions, used by the crosstab / classification report below.
y_pred = logit2.predict(X_train)

In [19]:
# Mean accuracy of model 2 on the training split.
logit_score2 = logit2.score(X=X_train, y=y_train)
logit_score2

0.7971887550200804

In [20]:
# Confusion table: rows are the predictions, columns are the actual
# survived status.
pd.crosstab(index=y_pred, columns=y_train)

survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,259,53
1,48,138


In [21]:
# Per-class precision / recall / f1 for model 2 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.84      0.84       307
           1       0.74      0.72      0.73       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



In [22]:
# Train vs. validate accuracy for model 2 (validate rounded to 4 places).
print(
    '\nPerformance in accuracy of LOGIT2 on training data:\n'
    f'Accuracy (train): {logit2.score(X_train, y_train)}\n'
    f'Accuracy (validate): {round(logit2.score(X_validate, y_validate), 4)}'
)


Performance in accuracy of LOGIT2 on training data:
Accuracy (train): 0.7971887550200804
Accuracy (validate): 0.757


### 3 Try other combinations of features and models: add the 'alone' column

In [23]:
# Features for model 3: model 2's features plus the 'alone' flag, in the same
# order the columns appear in `train`.
# Listed explicitly rather than derived by removing columns from
# train.columns, so that columns added to `train` later cannot leak in.
X_cols3 = ['pclass', 'age', 'fare', 'alone', 'sex_male']

# Name of the target column.
y_cols = 'survived'

X_cols3

['pclass', 'age', 'fare', 'alone', 'sex_male']

In [24]:
# Feature matrix (model 3 columns) and target vector for each split.
X_train, y_train = train[X_cols3], train['survived']
X_validate, y_validate = validate[X_cols3], validate['survived']
X_test, y_test = test[X_cols3], test['survived']

# Preview the training features.
X_train.head()

Unnamed: 0,pclass,age,fare,alone,sex_male
542,3,11.0,31.275,0,0
457,1,29.699118,51.8625,0,0
205,3,2.0,10.4625,0,0
208,3,16.0,7.75,1,0
485,3,29.699118,25.4667,0,0


In [25]:
# Model 3: same estimator settings as models 1 and 2, fitted on the model 3
# feature set; fit() returns the estimator, so it can be chained.
logit3 = LogisticRegression(penalty='l2', solver='lbfgs').fit(X_train, y_train)

# In-sample predictions, used by the crosstab / classification report below.
y_pred = logit3.predict(X_train)

In [26]:
# Mean accuracy of model 3 on the training split.
# `logit_score3` follows the naming used for the other models (`logit_score`,
# `logit_score2`); `logit3_score3` is kept as an alias so any existing
# reference to the old name keeps working.
logit_score3 = logit3_score3 = logit3.score(X_train, y_train)
logit_score3

0.7951807228915663

In [27]:
# Confusion table: rows are the predictions, columns are the actual
# survived status.
pd.crosstab(index=y_pred, columns=y_train)

survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,256,51
1,51,140


In [28]:
# Per-class precision / recall / f1 for model 3 on the training split.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       307
           1       0.73      0.73      0.73       191

    accuracy                           0.80       498
   macro avg       0.78      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



In [29]:
# Train vs. validate accuracy for model 3 (validate rounded to 4 places).
print(
    '\nPerformance in accuracy of LOGIT3 on training data:\n'
    f'Accuracy (train): {logit3.score(X_train, y_train)}\n'
    f'Accuracy (validate): {round(logit3.score(X_validate, y_validate), 4)}'
)


Performance in accuracy of LOGIT3 on training data:
Accuracy (train): 0.7951807228915663
Accuracy (validate): 0.7523


### 4 EVALUATE TOP 3 on Validate

In [32]:
# Re-bind the split matrices to model 1's feature set before scoring it;
# the X_*/y_* names are shared by all three models.
X_train, y_train = train[X_cols], train['survived']
X_validate, y_validate = validate[X_cols], validate['survived']
X_test, y_test = test[X_cols], test['survived']

# Train vs. validate accuracy for model 1 (validate rounded to 4 places).
print(
    '\nPerformance in accuracy of LOGIT on training data:\n'
    f'Accuracy (train): {logit.score(X_train, y_train)}\n'
    f'Accuracy (validate): {round(logit.score(X_validate, y_validate), 4)}'
)


Performance in accuracy of LOGIT on training data:
Accuracy (train): 0.7048192771084337
Accuracy (validate): 0.6963


In [37]:
# Re-bind the split matrices to model 2's feature set before scoring it;
# the X_*/y_* names are shared by all three models.
X_train, y_train = train[X_cols2], train['survived']
X_validate, y_validate = validate[X_cols2], validate['survived']
X_test, y_test = test[X_cols2], test['survived']

# Train vs. validate accuracy for model 2 (validate rounded to 4 places).
print(
    '\nPerformance in accuracy of LOGIT2 on training data:\n'
    f'Accuracy (train): {logit2.score(X_train, y_train)}\n'
    f'Accuracy (validate): {round(logit2.score(X_validate, y_validate), 4)}'
)


Performance in accuracy of LOGIT2 on training data:
Accuracy (train): 0.7971887550200804
Accuracy (validate): 0.757


In [38]:
# Re-bind the split matrices to model 3's feature set before scoring it;
# the X_*/y_* names are shared by all three models.
X_train, y_train = train[X_cols3], train['survived']
X_validate, y_validate = validate[X_cols3], validate['survived']
X_test, y_test = test[X_cols3], test['survived']

# Train vs. validate accuracy for model 3 (validate rounded to 4 places).
print(
    '\nPerformance in accuracy of LOGIT3 on training data:\n'
    f'Accuracy (train): {logit3.score(X_train, y_train)}\n'
    f'Accuracy (validate): {round(logit3.score(X_validate, y_validate), 4)}'
)


Performance in accuracy of LOGIT3 on training data:
Accuracy (train): 0.7951807228915663
Accuracy (validate): 0.7523


### 5 EVALUATE BEST on TEST ... LOGIT 1 has the smallest drop-off from train to validate, but also the lowest accuracy. LOGIT 2 (0.757) and LOGIT 3 (0.7523) are nearly tied on validate accuracy, with LOGIT 2 marginally higher; LOGIT 3, despite its slight overfit, is the model evaluated on test below.

In [42]:
# Final evaluation of model 3 on all three splits.
# The splits are indexed with X_cols3 directly instead of going through the
# shared X_train / X_validate / X_test names: several cells rebind those to
# different feature sets, so relying on them would score logit3 on whichever
# columns happened to be bound by the last cell that was run.
print(f'''
Performance in accuracy of LOGIT3 on training and test data:
Accuracy (train): {logit3.score(train[X_cols3], train['survived'])}
Accuracy (validate): {round(logit3.score(validate[X_cols3], validate['survived']), 4)}
Accuracy (TEST): {round(logit3.score(test[X_cols3], test['survived']), 4)}''')


Performance in accuracy of LOGIT3 on training and test data:
Accuracy (train): 0.7951807228915663
Accuracy (validate): 0.7523
Accuracy (TEST): 0.8045


In [40]:
# Test accuracy (0.8045) is close to train accuracy (0.7952); validate accuracy was lower (0.7523).