In [16]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [17]:
titanic_data = pd.read_csv('datasets/titanic_train_processes.csv')
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,2,1,52.0,0,0,13.5,0,0,1
1,0,1,1,24.0,0,0,79.2,1,0,0
2,1,1,0,36.0,1,2,120.0,0,0,1
3,0,1,1,28.0,0,0,47.1,0,0,1
4,0,3,1,36.0,0,0,7.8958,0,0,1


In [18]:
X = titanic_data.drop('Survived', axis = 1)
Y = titanic_data['Survived']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [19]:
def summary(y_test, y_pred):
    #This value will be between 0.0 and 1.0
    acc = accuracy_score(y_test, y_pred, normalize=True) 
    #The number of correctly predicted samples (raw count)
    acc_num = accuracy_score(y_test, y_pred, normalize=False) 

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f'Test data count: {len(y_test)}')
    print(f'Accuracy_count: {acc_num}')
    print(f'Accuracy score: {acc}')
    print(f'Precision: {precision}')
    print(f'Recall : {recall}')
    print()

#### Grid Search for Decision Tree

In [20]:
from sklearn.model_selection import GridSearchCV 
#grid search cross validation

parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}
grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
#GS builds different models one for each of parameter. 6 models here.
# uses 3 fold cross validation to find best model.

grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 4}

##### max_depth = 4 performs best.

In [21]:
for i in range(6):
    print(f'parameters : {grid_search.cv_results_["params"][i]}')
    print(f'Mean Test Score: {grid_search.cv_results_["mean_test_score"][i]}')
    print(f'Rank : {grid_search.cv_results_["rank_test_score"][i]} ')

parameters : {'max_depth': 2}
Mean Test Score: 0.7705428492697713
Rank : 4 
parameters : {'max_depth': 4}
Mean Test Score: 0.812565445026178
Rank : 1 
parameters : {'max_depth': 5}
Mean Test Score: 0.7950491411775512
Rank : 2 
parameters : {'max_depth': 7}
Mean Test Score: 0.7582897033158814
Rank : 6 
parameters : {'max_depth': 9}
Mean Test Score: 0.7740516211995958
Rank : 3 
parameters : {'max_depth': 10}
Mean Test Score: 0.7653164324423626
Rank : 5 


In [23]:
# using the above data, we can build the model using that parameter.

decision_tree = DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [24]:
y_pred = decision_tree.predict(x_test)

In [25]:
summary(y_test, y_pred)

Test data count: 143
Accuracy_count: 113
Accuracy score: 0.7902097902097902
Precision: 0.7083333333333334
Recall : 0.68



### Gris search for Logistic regression

In [26]:
parameters = {
    'penalty': ['l1', 'l2'],
    'C':[0.1, 0.4, 0.8, 1, 2, 5]}

    # 2* 5 = 10 models.

grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 2, 'penalty': 'l1'}

In [27]:
for i in range(12):
    print(f'parameters : {grid_search.cv_results_["params"][i]}')
    print(f'Mean Test Score: {grid_search.cv_results_["mean_test_score"][i]}')
    print(f'Rank : {grid_search.cv_results_["rank_test_score"][i]} ')

parameters : {'C': 0.1, 'penalty': 'l1'}
Mean Test Score: 0.7775420225957564
Rank : 11 
parameters : {'C': 0.1, 'penalty': 'l2'}
Mean Test Score: 0.7705336640029392
Rank : 12 
parameters : {'C': 0.4, 'penalty': 'l1'}
Mean Test Score: 0.7845503811885735
Rank : 9 
parameters : {'C': 0.4, 'penalty': 'l2'}
Mean Test Score: 0.7845503811885735
Rank : 9 
parameters : {'C': 0.8, 'penalty': 'l1'}
Mean Test Score: 0.7880499678515661
Rank : 6 
parameters : {'C': 0.8, 'penalty': 'l2'}
Mean Test Score: 0.7863047671534859
Rank : 7 
parameters : {'C': 1, 'penalty': 'l1'}
Mean Test Score: 0.793313125746303
Rank : 5 
parameters : {'C': 1, 'penalty': 'l2'}
Mean Test Score: 0.7863047671534859
Rank : 7 
parameters : {'C': 2, 'penalty': 'l1'}
Mean Test Score: 0.7985854689078717
Rank : 1 
parameters : {'C': 2, 'penalty': 'l2'}
Mean Test Score: 0.7968127124092955
Rank : 4 
parameters : {'C': 5, 'penalty': 'l1'}
Mean Test Score: 0.7968310829429596
Rank : 2 
parameters : {'C': 5, 'penalty': 'l2'}
Mean Test Sco

In [28]:
lr = LogisticRegression(solver='liblinear', penalty=grid_search.best_params_['penalty'], C=grid_search.best_params_['C'])

In [29]:
lr.fit(x_train, y_train)

In [30]:
y_pred=lr.predict(x_test)

In [31]:
summary(y_test, y_pred)

Test data count: 143
Accuracy_count: 111
Accuracy score: 0.7762237762237763
Precision: 0.68
Recall : 0.68

