In [1]:
## In this notebook I have trained the model by finding best working parameters.

In [19]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from vecstack import stacking
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score , f1_score , precision_score , recall_score


In [20]:
train = pd.read_csv(r"../Data/PreparedData/train.csv")
test = pd.read_csv(r"../Data/PreparedData/test.csv")

In [21]:
## first let's test the model on train data
X = train.drop(["Survived"] , axis = 1 )
y = train["Survived"]
x_train_org , x_test_org , y_train_org , y_test_org = train_test_split(X , y , test_size = 0.3 , random_state = 42)

In [23]:
RF_params =  {"criterion" :["entropy", "giin"] , "n_estimators" : range(100, 2000 , 200) , "min_samples_leaf" : range(1 , 10),
         "max_depth": range(4 , 20) }

RF = RandomForestClassifier( random_state = 42 , n_jobs = -1)

RF_grid = GridSearchCV(RF , RF_params , cv = 10 , n_jobs = -1 , verbose = 3)
RF_grid.fit(x_train_org , y_train_org)

Fitting 10 folds for each of 2880 candidates, totalling 28800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 28.6min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 37.4min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed: 42.0min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 45.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 49.8min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed: 54.2min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed: 84.6min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed: 89.7min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed: 233.5min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | 

GridSearchCV(cv=10,
             estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1,
             param_grid={'criterion': ['entropy', 'giin'],
                         'max_depth': range(4, 20),
                         'min_samples_leaf': range(1, 10),
                         'n_estimators': range(100, 2000, 200)},
             verbose=3)

In [29]:
RF_grid.best_score_

0.8410138248847925

In [32]:
RF_grid.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=7, min_samples_leaf=6,
                       n_jobs=-1, random_state=42)

In [26]:
std = StandardScaler()
x_train_scl = std.fit_transform(x_train_org)

In [27]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
#Convert to dictionary
knn_params = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

## create pipeline
# knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_jobs=-1))])
knn_m = KNeighborsClassifier()
knn_grid = GridSearchCV(knn_m, knn_params,
                        cv=5, n_jobs=-1, verbose=True)


knn_grid.fit(x_train_org, y_train_org)

knn_grid.best_params_, knn_grid.best_score_

Fitting 5 folds for each of 2842 candidates, totalling 14210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 1536 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 2936 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 4736 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 6936 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done 9536 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 12536 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 14195 out of 14210 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 14210 out of 14210 | elapsed:  1.2min finished


({'leaf_size': 16, 'n_neighbors': 12, 'p': 1}, 0.8315483870967743)

In [30]:
knn_grid.best_score_

0.8315483870967743

In [31]:
knn_grid.best_estimator_

KNeighborsClassifier(leaf_size=16, n_neighbors=12, p=1)

In [28]:
# knn_params

In [34]:
models = [
         KNeighborsClassifier( n_jobs=-1 , leaf_size=16, n_neighbors=12, p=1),
         RandomForestClassifier( n_estimators=1000, criterion='entropy', max_depth=7, min_samples_leaf=6,
                       n_jobs=-1, random_state=42),
         XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, n_estimators=1000, max_depth=10) 
         ]

In [35]:
def Stacking(models , X_train , y_train , X_test):
    """
    This function will return new feature set which is the combination of given model prediction called stacking.
        
    Parameters
    ----------
    models : list, 
        This is list of models we want to stack.
        
    X_train : DataFrame
        Index:
            rangeindex
        columns:
            contains features of data
            
    y_train:
        index:
            rangeIndex
        columns:
            target variable
            
    X_test:
        Test data
        
    Returns
    -------
        new train and test set which are the predictions of given models
    """
    S_train, S_test = stacking(models, X_train, y_train, X_test, regression=False,mode='oof_pred_bag', 
                               needs_proba=False, save_dir=None, metric=accuracy_score, n_folds=4, 
                               stratified=True,shuffle=True,  random_state=0,  verbose=2)
    return S_train , S_test

In [36]:
S_train , S_test = Stacking(models= models , X_train= x_train_org , y_train=y_train_org , X_test=x_test_org)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.80128205]
    fold  1:  [0.84615385]
    fold  2:  [0.80769231]
    fold  3:  [0.85806452]
    ----
    MEAN:     [0.82829818] + [0.02428648]
    FULL:     [0.82825040]

model  1:     [RandomForestClassifier]
    fold  0:  [0.78846154]
    fold  1:  [0.87179487]
    fold  2:  [0.81410256]
    fold  3:  [0.83870968]
    ----
    MEAN:     [0.82826716] + [0.03077674]
    FULL:     [0.82825040]

model  2:     [XGBClassifier]
    fold  0:  [0.77564103]
    fold  1:  [0.85256410]
    fold  2:  [0.80769231]
    fold  3:  [0.83225806]
    ----
    MEAN:     [0.81703888] + [0.02870020]
    FULL:     [0.81701445]



In [37]:
## evaluate the model

In [38]:
xgb = XGBClassifier(random_state = 42 , n_jobs = -1 , n_estimators = 1000 , max_depth = 5)
xgb.fit(S_train , y_train_org)

# accuracy_score()

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [39]:
accuracy_score(y_test_org , xgb.predict(S_test))

0.8283582089552238

In [40]:
f1_score(y_test_org , xgb.predict(S_test))

0.7722772277227723

In [41]:
precision_score(y_test_org , xgb.predict(S_test))

0.8571428571428571

In [42]:
recall_score(y_test_org , xgb.predict(S_test))

0.7027027027027027

In [23]:
#### we are getting good performances let's try it on test set

In [43]:
X_train = train.drop(["Survived"] , axis = 1)
y_train = train["Survived"]
X_test = test

In [45]:
S_train , S_test = Stacking(models , X_train= X_train , y_train= y_train , X_test = X_test )

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.84304933]
    fold  1:  [0.84304933]
    fold  2:  [0.76681614]
    fold  3:  [0.84234234]
    ----
    MEAN:     [0.82381429] + [0.03290916]
    FULL:     [0.82379349]

model  1:     [RandomForestClassifier]
    fold  0:  [0.86547085]
    fold  1:  [0.85201794]
    fold  2:  [0.77578475]
    fold  3:  [0.81081081]
    ----
    MEAN:     [0.82602109] + [0.03530992]
    FULL:     [0.82603816]

model  2:     [XGBClassifier]
    fold  0:  [0.84753363]
    fold  1:  [0.80717489]
    fold  2:  [0.79372197]
    fold  3:  [0.83333333]
    ----
    MEAN:     [0.82044096] + [0.02115490]
    FULL:     [0.82042649]



In [46]:
xgb = XGBClassifier(random_state = 42 , n_jobs = -1 , n_estimators = 1000 , max_depth = 5)
xgb.fit(S_train , y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [47]:
y_pred = xgb.predict(S_test)

In [48]:
submit = pd.read_csv(r"../Data/gender_submission.csv")

In [49]:
submit.columns

Index(['PassengerId', 'Survived'], dtype='object')

In [50]:
submitDf = pd.DataFrame({"PassengerId" : submit["PassengerId"] , "Survived" : y_pred})

In [51]:
submitDf.to_csv(r"../Data/Predictions/Stacking_v3.csv" , index = False)