## Setting functions 

In [84]:
def train_preprocessing(df):# Setting gender 1 - male, 0 female
    df['Sex'] = df['Sex'].map({'male' : 1, 'female' : 0})
    # Setting landing place Southampton - 1, Cherbourg - 2, Queenstown - 3
    df['Embarked'] = df['Embarked'].map({'S' : 1, 'С' : 2, 'Q' : 3}).fillna(value = 1)
    # Fill Nan values average values
    df.fillna({'Age' : df['Age'].mean()}, inplace = True) 
    # Changing type columns to int64
    df[['Age', 'Fare', 'Embarked']] = df[['Age', 'Fare', 'Embarked']].astype(int)
    print('Preprocessing successfully completed')
    return df

In [85]:
def test_preprocessing(df):
# Setting gender 1 - male, 0 female
    df['Sex'] = df['Sex'].map({'male' : 1, 'female' : 0}).fillna(1)
    # Setting landing place Southampton - 1, Cherbourg - 2, Queenstown - 3
    df['Embarked'] = df['Embarked'].map({'S' : 1, 'С' : 2, 'Q' : 3}).fillna(value = 1)
    # Fill Nan values average values
    df.fillna({'Age' : df['Age'].mean()}, inplace = True) 
    df.fillna({'Fare' : df['Fare'].mean()}, inplace = True)
    # Changing type columns to int64
    df[['Age', 'Fare', 'Embarked']] = df[['Age', 'Fare', 'Embarked']].astype(int)
    print('Preprocessing successfully completed')
    return df

In [86]:
# Normalize X_train X_test
def normalize(X_train, X_test):
    print ('normalizing.')
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print('Preprocessing successfully completed')
    return X_train_scaled, X_test_scaled 

In [27]:
# Running all classifiers
def run_all_classifiers(X_train_scaled, X_test_scaled, y_train = None, y_test=None, list_classifiers= None):
    if list_classifiers is None or 'LogisticRegression' in list_classifiers:
        print ('\nLogisticRegression.')
        clf = LogisticRegression(max_iter=10000)
        grid_values = {'C': [0.005, 0.01,0.1, 1, 100, 10000, 100000]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled,  y_train,  y_test= y_test)

    if list_classifiers is None or 'DecisionTreeClassifier' in list_classifiers:
        print ('\nDecisionTreeClassifier')
        clf = DecisionTreeClassifier()       
        grid_values = {'max_depth': [2,5,7, 20, 50]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'RandomForestClassifier' in list_classifiers:
        print ('\nRandomForestClassifier.')
        clf = RandomForestClassifier()       
        grid_values = {'n_estimators': [20,50]} #,200,300]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'SVC_poly' in list_classifiers:
        print ('\nSVC_poly')
        clf = SVC(kernel='poly')           
        grid_values = {'C': [0.01]}# , 0.1, 1, 100, ]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'SVC_rbf' in list_classifiers:
        print ('\nSVC_rbf')
        clf = SVC(kernel='rbf')
        grid_values = {'C': [0.005, 0.01]}# , 0.02, 0.03, 0.1, 1, 100, 10000], 'gamma':[0.001, 0.01, 0.1]}
        run_GridSearchCV(clf,grid_values, X_train_scaled, X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'NB' in list_classifiers:
        print ('\nNB')
        clf =  GaussianNB().fit(X_train_scaled, y_train)
        train_f1 = f1_score(y_train, clf.predict(X_train_scaled))
        print("train set f1= {}".format(train_f1))
        if not y_test is None:
            test_f1 = f1_score(y_test, clf.predict(X_test_scaled))
            print("train set f1= {}".format(test_f1))

            
    if list_classifiers is None or 'GradientBoostingClassifier' in list_classifiers:
        print ('\nGradientBoostingClassifier.')
        clf = GradientBoostingClassifier() # learning_rate = 0.03)       
        grid_values = {'max_depth': [3,5,7]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'MLP' in list_classifiers:
        print ('\nMLP.')
        clf = MLPClassifier(hidden_layer_sizes = [50]) #, 100])
        grid_values = {'alpha' : [0.001, 0.01, 0.1, 1, 10]}
        run_GridSearchCV(clf,grid_values, X_train_scaled,X_test_scaled, y_train,  y_test= y_test)   

    if list_classifiers is None or 'xgboost' in list_classifiers:
        print ('\nxgboost.')
        clf = XGBClassifier().fit(X_train_scaled, y_train, eval_metric = 'logloss')
        y_predicted = clf.predict(X_test_scaled)
        print ('f1_score  = {:.2}'.format(f1_score(y_test, y_predicted)))

In [157]:
def run_GridSearchCV(clf, grid_values, X_train_scaled, X_test_scaled, y_train, y_test= None):
    print ('Running GridSearchCV.')
    grid_clf = GridSearchCV(clf, param_grid=grid_values,scoring='roc_auc')
    grid_clf.fit(X_train_scaled, y_train)
    print('Grid best parameter (max.roc_auc ): ', grid_clf.best_params_) 
    print('Grid best score (roc_auc): ', grid_clf.best_score_) 

    if not y_test is None:
        test_score= grid_clf.score(X_test_scaled, y_test)
        print("test roc_auc= {}".format(test_score))

 

In [87]:
# List of classifiers to use
list_classifiers= [
    'LogisticRegression',
    'DecisionTreeClassifier',
'RandomForestClassifier',
    'NB',
'GradientBoostingClassifier', 
    'MLP', 
    'xgboost']

## Import Frameworks

In [88]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np 

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier 

## Load Data

In [170]:
titanic_test = pd.read_csv('./data/test.csv',
                           index_col = 'PassengerId',
                           usecols = ['PassengerId', 'Pclass', 'Sex',
                                       'Age', 'SibSp', 'Parch','Fare', 'Embarked'])
titanic_submission = pd.read_csv('./data/gender_submission.csv')
titanic_train = pd.read_csv('./data/train.csv',
                            index_col = 'PassengerId',
                            usecols = ['PassengerId','Survived', 'Pclass', 'Sex',
                                       'Age', 'SibSp', 'Parch','Fare', 'Embarked'])
titanic_train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


# Preprocessing

In [173]:
titanic_train = train_preprocessing(titanic_train)
titanic_test = test_preprocessing(titanic_test)


Preprocessing successfully completed
Preprocessing successfully completed


In [174]:
titanic_train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

# Division into train and testing sets and predict

In [175]:
y = titanic_train['Survived']
X = titanic_train.drop(columns = 'Survived')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 4381)

# Normalize data
X_train_scaled, X_test_scaled = normalize(X_train, X_test)

#Running all classifiers to comparsion result
run_all_classifiers(X_train_scaled, X_test_scaled, y_train, y_test, list_classifiers)

normalizing.
Preprocessing successfully completed

LogisticRegression.
Running GridSearchCV.
Grid best parameter (max.roc_auc ):  {'C': 10000}
Grid best score (roc_auc):  0.8425578349033407
test roc_auc= 0.8740264797507789

DecisionTreeClassifier
Running GridSearchCV.
Grid best parameter (max.roc_auc ):  {'max_depth': 5}
Grid best score (roc_auc):  0.8245890553474823
test roc_auc= 0.8315160955347872

RandomForestClassifier.
Running GridSearchCV.
Grid best parameter (max.roc_auc ):  {'n_estimators': 50}
Grid best score (roc_auc):  0.8316174384292362
test roc_auc= 0.8570223260643822

NB
train set f1= 0.7058823529411765
train set f1= 0.732394366197183

GradientBoostingClassifier.
Running GridSearchCV.
Grid best parameter (max.roc_auc ):  {'max_depth': 3}
Grid best score (roc_auc):  0.8545870691938108
test roc_auc= 0.8853842159916927

MLP.
Running GridSearchCV.
Grid best parameter (max.roc_auc ):  {'alpha': 0.1}
Grid best score (roc_auc):  0.8508996803238376
test roc_auc= 0.876233125649013

In [178]:
# GradientBoostingClassifier
clf = GradientBoostingClassifier() # learning_rate = 0.03)       
grid_values = {'max_depth': [3,5,7]}

grid_clf = GridSearchCV(clf, param_grid = grid_values,scoring = 'roc_auc')
grid_clf.fit(X, y)

predict = grid_clf.predict(titanic_test)
print('Grid best parameter (max.roc_auc ): ', grid_clf.best_params_) 
print('Grid best score (roc_auc): ', grid_clf.best_score_) 


Grid best parameter (max.roc_auc ):  {'max_depth': 3}
Grid best score (roc_auc):  0.8631204943448896


In [185]:
from sklearn.model_selection import GridSearchCV
estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42,
    eval_metric = 'logloss'
)
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 10,
    cv = 10,
    verbose=True
)
grid_search.fit(X, y)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     eval_metric='logloss', gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None

In [190]:
predict = grid_search.predict(titanic_test)
predict
# print ('f1_score  = {:.2}'.format(f1_score(y, y_predicted)))

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [191]:
d = {'PassengerId': titanic_test.index, 'Survived': predict}
result_df = pd.DataFrame(data = d)
result_df.set_index('PassengerId')

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
...,...
1305,0
1306,1
1307,0
1308,0


In [192]:
result_df.to_csv('camp2021.csv', index=False)
