In [6]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from vecstack import stacking

from sklearn.metrics import accuracy_score , f1_score , precision_score , recall_score


In [7]:
# Read the prepared train and test CSV files into DataFrames.
train , test = (
    pd.read_csv(csv_path)
    for csv_path in (r"../Data/PreparedData/train.csv" , r"../Data/PreparedData/test.csv")
)

In [9]:
## Hold-out check first: split the labelled training data 70/30 and evaluate on the held-out part
y = train["Survived"]
X = train.drop(columns = ["Survived"])
x_train_org , x_test_org , y_train_org , y_test_org = train_test_split(
    X ,
    y ,
    test_size = 0.3 ,
    random_state = 42 ,
)

In [11]:
# Base learners whose predictions are stacked by Stacking().
models = [
    KNeighborsClassifier(n_jobs=-1, n_neighbors=5),
    RandomForestClassifier(
        n_estimators=1000,
        max_depth=3,
        random_state=0,
        n_jobs=-1,
    ),
    XGBClassifier(
        n_estimators=1000,
        max_depth=3,
        learning_rate=0.1,
        random_state=0,
        n_jobs=-1,
    ),
]

In [25]:
def Stacking(models , X_train , y_train , X_test , n_folds = 4 , random_state = 0 , metric = None , verbose = 2):
    """
    Build stacked features from the predictions of several base models.

    Thin wrapper around ``vecstack.stacking`` configured for classification:
    it always passes ``regression=False``, ``needs_proba=False`` (class labels,
    not probabilities), ``mode='oof_pred_bag'``, ``stratified=True``,
    ``shuffle=True`` and ``save_dir=None``.

    Parameters
    ----------
    models : list
        Estimators to stack; passed to ``vecstack.stacking`` unchanged.

    X_train : DataFrame or array-like
        Training features. It need not have a RangeIndex: the hold-out call in
        this notebook passes a shuffled ``train_test_split`` subset.

    y_train : Series or array-like
        Target values for ``X_train``.

    X_test : DataFrame or array-like
        Features of the data to predict.

    n_folds : int, default 4
        Number of cross-validation folds used for each model.

    random_state : int, default 0
        Seed forwarded to ``vecstack.stacking`` for the shuffled fold split.

    metric : callable, default None
        Scoring function forwarded to ``vecstack.stacking`` for the per-fold
        report. ``None`` selects ``accuracy_score``.

    verbose : int, default 2
        Verbosity level forwarded to ``vecstack.stacking``.

    Returns
    -------
    S_train , S_test
        The two arrays returned by ``vecstack.stacking``. Per the vecstack
        documentation for ``mode='oof_pred_bag'``, ``S_train`` holds the
        out-of-fold predictions on ``X_train`` and ``S_test`` the predictions
        on ``X_test`` aggregated over the fold models, one column per model.
    """
    if metric is None:
        # Resolved at call time so the signature does not depend on import order.
        metric = accuracy_score
    S_train, S_test = stacking(
        models, X_train, y_train, X_test,
        regression=False,
        mode='oof_pred_bag',
        needs_proba=False,
        save_dir=None,
        metric=metric,
        n_folds=n_folds,
        stratified=True,
        shuffle=True,
        random_state=random_state,
        verbose=verbose,
    )
    return S_train , S_test

In [14]:
# Stacked features for the hold-out experiment (fit on the 70% split, predict the 30% split).
S_train , S_test = Stacking(
    models ,
    x_train_org ,
    y_train_org ,
    x_test_org ,
)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.77564103]
    fold  1:  [0.83333333]
    fold  2:  [0.80128205]
    fold  3:  [0.84516129]
    ----
    MEAN:     [0.81385443] + [0.02728496]
    FULL:     [0.81380417]

model  1:     [RandomForestClassifier]
    fold  0:  [0.80128205]
    fold  1:  [0.86538462]
    fold  2:  [0.80769231]
    fold  3:  [0.82580645]
    ----
    MEAN:     [0.82504136] + [0.02496827]
    FULL:     [0.82504013]

model  2:     [XGBClassifier]
    fold  0:  [0.77564103]
    fold  1:  [0.82051282]
    fold  2:  [0.80769231]
    fold  3:  [0.81935484]
    ----
    MEAN:     [0.80580025] + [0.01812005]
    FULL:     [0.80577849]



In [15]:
## Evaluate the stacked model: fit an XGBoost meta-learner on the stacked features and score it on the hold-out split

In [17]:
# Meta-learner: trained on the stacked predictions of the base models (hold-out experiment).
xgb = XGBClassifier(
    n_estimators = 1000 ,
    max_depth = 5 ,
    random_state = 42 ,
    n_jobs = -1 ,
)
xgb.fit(S_train , y_train_org)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [18]:
# Hold-out accuracy of the stacked model.
accuracy_score(y_true = y_test_org , y_pred = xgb.predict(S_test))

0.835820895522388

In [20]:
# Hold-out F1 score of the stacked model.
f1_score(y_true = y_test_org , y_pred = xgb.predict(S_test))

0.7962962962962963

In [21]:
# Hold-out precision of the stacked model.
precision_score(y_true = y_test_org , y_pred = xgb.predict(S_test))

0.819047619047619

In [22]:
# Hold-out recall of the stacked model.
recall_score(y_true = y_test_org , y_pred = xgb.predict(S_test))

0.7747747747747747

In [23]:
#### The hold-out performance looks good, so retrain on the full training set and predict the test set

In [24]:
# The final fit uses every labelled row; the prepared test file is what gets predicted.
y_train = train["Survived"]
X_train = train.drop(columns = ["Survived"])
X_test = test

In [28]:
# Rebuild the stacked features on the full training data; this overwrites the
# hold-out S_train / S_test created earlier in the notebook.
S_train , S_test = Stacking(
    models = models ,
    X_train = X_train ,
    y_train = y_train ,
    X_test = X_test ,
)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.82959641]
    fold  1:  [0.83408072]
    fold  2:  [0.78475336]
    fold  3:  [0.83333333]
    ----
    MEAN:     [0.82044096] + [0.02067416]
    FULL:     [0.82042649]

model  1:     [RandomForestClassifier]
    fold  0:  [0.85650224]
    fold  1:  [0.86995516]
    fold  2:  [0.75336323]
    fold  3:  [0.80630631]
    ----
    MEAN:     [0.82153173] + [0.04595248]
    FULL:     [0.82154882]

model  2:     [XGBClassifier]
    fold  0:  [0.85201794]
    fold  1:  [0.81614350]
    fold  2:  [0.78026906]
    fold  3:  [0.81081081]
    ----
    MEAN:     [0.81481033] + [0.02547194]
    FULL:     [0.81481481]



In [29]:
# Final meta-learner: same settings as in the hold-out experiment, now fit on the
# stacked features of the full training set. Rebinds `xgb`.
xgb = XGBClassifier(
    n_estimators = 1000 ,
    max_depth = 5 ,
    random_state = 42 ,
    n_jobs = -1 ,
)
xgb.fit(S_train , y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [30]:
# Predict labels for the test set from its stacked features.
y_pred = xgb.predict(S_test)

In [31]:
# Read gender_submission.csv; its PassengerId column is reused when building the submission frame.
submit = pd.read_csv(r"../Data/gender_submission.csv")

In [32]:
# Inspect which columns the file provides.
submit.columns

Index(['PassengerId', 'Survived'], dtype='object')

In [33]:
# Pair each PassengerId with its predicted label.
# NOTE(review): assumes the rows of the prepared test set are in the same order as
# gender_submission.csv — confirm against the data-preparation step.
submitDf = submit[["PassengerId"]].assign(Survived = y_pred)

In [34]:
# Write the submission file; index=False keeps the row index out of the CSV.
submitDf.to_csv(r"../Data/Predictions/Stacking_v2.csv" , index = False)