In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the preprocessed Titanic table from disk, then preview its first rows.
csv_path = 'datasets/titanic_processed.csv'
titanic_df = pd.read_csv(csv_path)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,1,0.83,0,2,29.0,0,0,1
1,1,3,1,29.0,0,0,9.5,0,0,1
2,1,3,1,6.0,0,1,12.475,0,0,1
3,0,1,1,36.0,1,0,78.85,0,0,1
4,0,3,1,4.0,4,2,31.275,0,0,1


In [4]:
# Features: every column except the target. 'Unnamed: 0' is also dropped when
# present -- it appears to be a row index written out with the CSV rather than
# a passenger attribute, so it should not be fed to the models.
# errors='ignore' keeps this cell working if that column is absent.
X = titanic_df.drop(columns=['Survived', 'Unnamed: 0'], errors='ignore')
Y = titanic_df['Survived']

# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore every downstream score, reproducible on Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

In [5]:
def summarize_classification(y_test, y_pred):
    """Print a one-line summary of classification quality.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, compared element-wise against ``y_test``.

    Prints a dict with the keys 'accuracy' (fraction of correct predictions),
    'precision', 'recall' and 'accuracy_count' (number of correct
    predictions). Nothing is returned.
    """
    # accuracy_score gives a fraction with normalize=True and a raw count
    # of correct predictions with normalize=False.
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    count_correct = accuracy_score(y_test, y_pred, normalize=False)

    summary = dict(
        accuracy=fraction_correct,
        precision=precision_score(y_test, y_pred),
        recall=recall_score(y_test, y_pred),
        accuracy_count=count_correct,
    )
    print(summary)

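cv=3: use 3-fold cross-validation to find the best model - the training data is split into 3 parts, and each part is held out once for validation while the model is fit on the other two.
By default, accuracy is used as the cross-validation evaluation metric.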

In [7]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to compare.
parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

# 3-fold cross-validated search over the depths above. The tree is seeded
# because scikit-learn permutes the features at every split, so an unseeded
# tree can rank near-tied depths differently from one run to the next.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_search.best_params_

{'max_depth': 2}

In [9]:
# Report every candidate the grid search evaluated, in evaluation order.
# Walking the result arrays directly (instead of a hard-coded count) keeps
# this report in sync with however many parameter combinations were searched.
cv_results = grid_search.cv_results_
for params, mean_score, rank in zip(cv_results['params'],
                                    cv_results['mean_test_score'],
                                    cv_results['rank_test_score']):
    print("Parameters: ", params)
    print("Mean Test Score: ", mean_score)
    print("Rank: ", rank)

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7855657662675206
Rank:  1
Parameters:  {'max_depth': 4}
Mean Test Score:  0.785547201336675
Rank:  2
Parameters:  {'max_depth': 5}
Mean Test Score:  0.7837556855100715
Rank:  3
Parameters:  {'max_depth': 7}
Mean Test Score:  0.782010582010582
Rank:  4
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7662118258609487
Rank:  5
Parameters:  {'max_depth': 10}
Mean Test Score:  0.7556762276060521
Rank:  6


In [10]:
# Refit a tree on the training split at the depth chosen by the grid search.
# Seeded so the fitted tree (and its test scores) are identical on every run.
decision_tree_model = DecisionTreeClassifier(
    max_depth=grid_search.best_params_['max_depth'],
    random_state=42,
).fit(x_train, y_train)

In [11]:
# Predict labels for the held-out test features with the fitted decision tree.
y_pred = decision_tree_model.predict(x_test)

In [12]:
# Print accuracy, precision, recall and the correct-prediction count for the
# current y_pred against the test labels.
summarize_classification(y_test, y_pred)

{'accuracy': 0.8041958041958042, 'precision': 1.0, 'recall': 0.5087719298245614, 'accuracy_count': 115}


In [13]:
# Search space: both penalty types crossed with six values of C
# (2 x 6 = 12 candidate models).
parameters = dict(
    penalty=['l1', 'l2'],
    C=[0.1, 0.4, 0.8, 1, 2, 5],
)

# 3-fold cross-validated grid search; the liblinear solver is used because
# the grid includes the 'l1' penalty.
grid_search = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Last expression of the cell: display the winning parameter combination.
grid_search.best_params_

{'C': 2, 'penalty': 'l1'}

In [15]:
# Report every candidate the grid search evaluated, in evaluation order.
# Walking the result arrays directly (instead of a hard-coded count) keeps
# this report in sync with however many parameter combinations were searched.
cv_results = grid_search.cv_results_
for params, mean_score, rank in zip(cv_results['params'],
                                    cv_results['mean_test_score'],
                                    cv_results['rank_test_score']):
    print("Parameters: ", params)
    print("Mean Test Score: ", mean_score)
    print("Rank: ", rank)

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7468300380581082
Rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7573934837092732
Rank:  11
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7732572171168662
Rank:  7
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.7609115381045205
Rank:  10
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7785203750116031
Rank:  6
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7697113153253504
Rank:  9
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7802654785110925
Rank:  4
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7732293697205979
Rank:  8
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.785547201336675
Rank:  1
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.782010582010582
Rank:  2
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7802747609765154
Rank:  3
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test Sco

In [18]:
# Refit logistic regression on the training split using the penalty and C
# that the grid search selected.
best_params = grid_search.best_params_
logistic_model = LogisticRegression(
    solver='liblinear',
    penalty=best_params['penalty'],
    C=best_params['C'],
).fit(x_train, y_train)

In [19]:
# Score the tuned logistic regression on the held-out test split.
# Note: this rebinds y_pred, replacing the decision tree's predictions.
y_pred = logistic_model.predict(x_test)
summarize_classification(y_test, y_pred)

{'accuracy': 0.8671328671328671, 'precision': 0.8958333333333334, 'recall': 0.7543859649122807, 'accuracy_count': 124}
