In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier 
from xgboost import XGBClassifier


In [2]:
cwd= os.getcwd() # current working directory
path = os.path.join(cwd,'data\\titanic') 

df = pd.read_csv(os.path.join(path, 'titanic_cleaned.csv'))
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Family_size,Survived.1,Sex,Cabin_availability
0,0,3,22.0,7.25,1,0,0,0
1,1,1,38.0,71.2833,1,1,1,1
2,1,3,26.0,7.925,0,1,1,0
3,1,1,35.0,53.1,1,1,1,1
4,0,3,35.0,8.05,0,0,0,0


In [3]:
features = df.drop('Survived', axis=1)
labels = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42) #create validation set

In [4]:
for data in [X_train, X_val, X_test]:
    print(round(len(data) / len(labels), 2))

0.6
0.2
0.2


In [None]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler


def run_GridSearchCV(clf, grid_values, X_train_scaled, X_test_scaled, y_train, y_test= None):
#     print ('Running GridSearchCV.')
#     grid_clf = GridSearchCV(clf, param_grid=grid_values,scoring='roc_auc')
#     grid_clf.fit(X_train_scaled, y_train)
#     print('Grid best parameter (max.roc_auc ): ', grid_clf.best_params_) 
#     print('Grid best score (roc_auc): ', grid_clf.best_score_) 

    # if not y_test is None:
    #     test_score= grid_clf.score(X_test_scaled, y_test)
    #     print("test roc_auc= {}".format(test_score))

    print ('Running GridSearchCV.')
    grid_clf = GridSearchCV(clf, param_grid=grid_values,scoring='f1')
    grid_clf.fit(X_train_scaled, y_train)
    print('Grid best parameter (max f1 ): ', grid_clf.best_params_) 
    print('Grid best score (f1): ', grid_clf.best_score_) 

    if not y_test is None:
        test_score= grid_clf.score(X_test_scaled, y_test)
        print("test f1= {}".format(test_score))


def run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test=None, list_classifiers= None):
    if list_classifiers is None or 'LogisticRegression' in list_classifiers:
        print ('\nLogisticRegression.')
        clf = LogisticRegression(max_iter=10000)
        grid_values = {'C': [0.005, 0.01,0.1, 1, 100, 10000, 100000]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled,  y_train,  y_test= y_test)

    if list_classifiers is None or 'DecisionTreeClassifier' in list_classifiers:
        print ('\nDecisionTreeClassifier')
        clf = DecisionTreeClassifier()       
        grid_values = {'max_depth': [2,5,7, 20, 50]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'RandomForestClassifier' in list_classifiers:
        print ('\nRandomForestClassifier.')
        clf = RandomForestClassifier()       
        grid_values = {'n_estimators': [5,20,50,100], 'max_depth': [2, 12, 24, None]} 
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'SVC_poly' in list_classifiers:
        print ('\nSVC_poly')
        clf = SVC(kernel='poly')           
        grid_values = {'C': [0.01]}# , 0.1, 1, 100, ]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'SVC_rbf' in list_classifiers:
        print ('\nSVC_rbf')
        clf = SVC(kernel='rbf')
        grid_values = {'C': [0.005, 0.01]}# , 0.02, 0.03, 0.1, 1, 100, 10000], 'gamma':[0.001, 0.01, 0.1]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'NB' in list_classifiers:
        print ('\nNB')
        clf =  GaussianNB().fit(X_train_scaled, y_train)
        train_f1 = f1_score(y_train, clf.predict(X_train_scaled))
        print("train set f1= {}".format(train_f1))
        if not y_test is None:
            test_f1 = f1_score(y_test, clf.predict(X_test_scaled))
            print("train set f1= {}".format(test_f1))

            
    if list_classifiers is None or 'GradientBoostingClassifier' in list_classifiers:
        print ('\nGradientBoostingClassifier.')
        clf = GradientBoostingClassifier() # learning_rate = 0.03)       
        grid_values = {'max_depth': [3,5,7]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'MLP' in list_classifiers:
        print ('\nMLP.')
        clf = MLPClassifier(hidden_layer_sizes = [50]) #, 100])
        grid_values = {'alpha' : [0.001, 0.01, 0.1, 1, 10]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'xgboost' in list_classifiers:
        print ('\nxgboost.')
        clf = XGBClassifier().fit(X_train_scaled, y_train)
        y_predicted = clf.predict(X_test_scaled)
        print ('f1_score  = {:.2}'.format(f1_score(y_test, y_predicted)))
        
def normalize(X_train, X_test):
    print ('normalizing.')
    scaler= MinMaxScaler()
    X_train_scaled= scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled 

In [6]:
X_train_scaled, X_test_scaled = normalize(X_train, X_test)
run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test=y_test, list_classifiers=None)

normalizing.

LogisticRegression.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'C': 0.1}
Grid best score (f1):  1.0
test f1= 1.0

DecisionTreeClassifier
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 2}
Grid best score (f1):  1.0
test f1= 1.0

RandomForestClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'n_estimators': 5}
Grid best score (f1):  1.0
test f1= 1.0

SVC_poly
Running GridSearchCV.
Grid best parameter (max f1 ):  {'C': 0.01}
Grid best score (f1):  0.9879457707043914
test f1= 0.9946524064171123

SVC_rbf
Running GridSearchCV.
Grid best parameter (max f1 ):  {'C': 0.01}
Grid best score (f1):  0.9966386554621849
test f1= 1.0

NB
train set f1= 1.0
train set f1= 1.0

GradientBoostingClassifier.
Running GridSearchCV.
Grid best parameter (max f1 ):  {'max_depth': 3}
Grid best score (f1):  1.0
test f1= 1.0

MLP.
Running GridSearchCV.




Grid best parameter (max f1 ):  {'alpha': 0.001}
Grid best score (f1):  1.0
test f1= 1.0

xgboost.
f1_score  = 1.0


##### Conclusion: Let's exploit the RandomForestClassifier. {'n_estimators': 5}

In [12]:
clf = RandomForestClassifier(n_estimators=5)
clf.fit(X_train_scaled, y_train)
X_val_scaled = MinMaxScaler().fit_transform(X_val)  # Normalize validation set
y_val_predicted = clf.predict(X_val_scaled)

print(classification_report(y_val, y_val_predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       157
           1       1.00      1.00      1.00       105

    accuracy                           1.00       262
   macro avg       1.00      1.00      1.00       262
weighted avg       1.00      1.00      1.00       262



In [13]:
y_test_predicted = clf.predict(X_test_scaled)

print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       168
           1       1.00      0.99      0.99        94

    accuracy                           1.00       262
   macro avg       1.00      0.99      1.00       262
weighted avg       1.00      1.00      1.00       262

