In [1]:
# import required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

In [2]:
# grab the raw dataset
titanic = sns.load_dataset('titanic')

# remove columns we don't need
titanic = titanic.drop(['class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone'], axis=1)

# take care of missing age entries by filling them in with the median age
titanic['age'].fillna(titanic.groupby('pclass')['age'].transform("median"), inplace=True)

# drop rows with missing elements
titanic.dropna(inplace=True)

# Convert categorical variable into dummy/indicator variables (1's and 0's, easier to work with)
new_sex = pd.get_dummies(titanic['sex'], drop_first=True)

# Convert categorical variable into dummy/indicator variables (1's and 0's, easier to work with)
new_embarked = pd.get_dummies(titanic['embarked'], drop_first=True)

# add dummy variables (columns) to dataframe
titanic = pd.concat([titanic, new_sex, new_embarked], axis=1)

# drop original categorical variables (columns)
titanic.drop(['sex', 'embarked'],axis=1, inplace=True)

# show first 5 rows as a demo
titanic.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,male,Q,S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


In [3]:
# library to split dataset into train and test
from sklearn.model_selection import train_test_split

# X is all possible features/independent variables
X = titanic.drop('survived', axis=1)
# Y is what we are modeling
Y = titanic['survived']

# split the data with 30% of rows going to the training set and a state seed of 1000 for consistency
trainX, testX, trainY, testY = train_test_split(X,Y,test_size=0.3, random_state = 1000)

# print the shape of the split dataframes (rows, columns)
trainX.shape, trainY.shape, testX.shape, testY.shape

# we started with 889 observations - 622 are for training and 267 for testing

((622, 8), (622,), (267, 8), (267,))

In [4]:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# import library for logistic regression
from sklearn.linear_model import LogisticRegression

# the C parameter is for avoiding overfitting using penalty terms
# this is not needed for this project, so a very large value is used to 'disable' it
# build the model
logreg = LogisticRegression(C=1e10)

# fit the model with data
fit = logreg.fit(trainX, trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
#Question 1: Finding the optimal value

accuracyTemp = 0
maxAccuracy = 0
maxCutoff = 0
    
# for each possible cutoff value between 0 and 1 by increments of 0.01
for i in range(1, 100, 1):
    cutoff = i/100
    # make array of predictions for each row, either a 0 for death or 1 for survival
    # the [:,1] returns only prediction of survival which is compared to cutoff value to determine binary value
    prob_of_1 = (logreg.predict_proba(trainX)[:,1] >= cutoff).astype(bool)
    # find accuracy by dividing the number of correct predictions by the total number of records
    accuracyTemp = np.sum(trainY == prob_of_1)/622
    # if highest accuracy so far, replace accuracy and cutoff value in optimal
    if (accuracyTemp >= maxAccuracy):
        maxCutoff = cutoff
        maxAccuracy = accuracyTemp
        
#optimal cutoff
print("the optimal cutoff value is: " + str(maxCutoff) + " and the accuracy with this value is: " + str(maxAccuracy))


the optimal cutoff value is: 0.61 and the accuracy with this value is: 0.8344051446945338


In [6]:
#Question 2: use optimal cutoff value to see how the model performs on the test data

# default cutoff accuracy
print("Default accuracy with a cutoff value of 0.5: ", fit.score(testX, testY))

# custom cutoff accuracy
prob_of_1 = (logreg.predict_proba(testX)[:,1] >= 0.61).astype(bool)
print("Accuracy for cutoff of 0.61: ", np.sum(testY == prob_of_1)/267, "\n")

# note the slightly improved accuracy when using the custom cutoff
# there is also a decline of 4% from training to testing

Default accuracy with a cutoff value of 0.5:  0.7790262172284644
Accuracy for cutoff of 0.61:  0.7902621722846442 



In [7]:
# custom cutoff confusion matrix
metrics.confusion_matrix(testY, prob_of_1)

array([[144,  18],
       [ 38,  67]], dtype=int64)

In [8]:
# default cutoff confusion matrix
metrics.confusion_matrix(testY, fit.predict(testX))

# the default cutoff has more false positives and less false negatives

array([[138,  24],
       [ 35,  70]], dtype=int64)