In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold,train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

import optuna
import pickle

import warnings

# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

* CatBoost (cb) can handle categorical data without any transforms, so I passed the columns directly to CatBoost unchanged.
* LightGBM (lgb) can handle categorical data, but only if each label has first been transformed to an integer value, so I passed the label-encoded values to LightGBM.
* XGBoost (xgb), Ridge (ridge), and Stochastic Gradient Descent (sgd) each require continuous data to work.

In [None]:
# Read the train and test CSVs from the ../input dataset directory.
train, test = map(
    pd.read_csv,
    [
        '../input/training-tabular-apr-2021/train2.csv',
        '../input/training-tabular-apr-2021/test2.csv',
    ],
)

In [None]:
# Preview the first rows of the training frame as loaded.
train.head()

In [None]:
# Preview the first rows of the test frame as loaded.
test.head()

In [None]:
# Columns excluded from modelling. A single shared list guarantees that train and
# test always lose exactly the same columns (the list used to be written out twice).
DROP_COLUMNS = ['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Embarked','Sex']

# Reassign instead of mutating with inplace=True so each frame's lineage is explicit.
train = train.drop(columns=DROP_COLUMNS)
test = test.drop(columns=DROP_COLUMNS)

In [None]:
# Cast every train column except the first one, and the whole test frame, to float32.
float_cols = train.columns[1:]
train[float_cols] = train[float_cols].astype(np.float32)
test = test.astype(np.float32)

In [None]:
# Column dtypes and non-null counts of the training frame after the cast.
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame after the cast.
test.info()

In [None]:
# Per column: True if it holds at least one non-finite value (inf, -inf or NaN);
# all-False means the frame is clean. The earlier `np.isfinite(train).any()` was True
# as soon as a column had a single finite value, so it could never reveal a problem.
(~np.isfinite(train)).any()

In [None]:
# Per column: True if it holds at least one non-finite value (inf, -inf or NaN);
# all-False means the frame is clean. The earlier `np.isfinite(test).any()` was True
# as soon as a column had a single finite value, so it could never reveal a problem.
(~np.isfinite(test)).any()

In [None]:
# Per column: True if the training frame holds at least one NaN in it.
train.isna().any()

In [None]:
# Per column: True if the test frame holds at least one NaN in it.
test.isna().any()

In [None]:
# Preview the training frame after dropping columns and casting.
train.head()

In [None]:
# Preview the test frame after dropping columns and casting.
test.head()

<h2 style="background-color:azure; text-align:center; font-size:300%">5. Model Tuning</h2>

<h2 style="background-color:azure; text-align:center; font-size:200%">5.1.1. Random Forest Tuning</h2>

In [None]:
# Every column after the first is a feature; the target is 'Survived'.
features = train.columns[1:]
X, y = train[features], train['Survived']

# Seeded 80/20 split: x_train / y_train feed the CV search, x_test / y_test are held out.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=39,
)

In [None]:
# Cross-validation setup for the Random Forest search: 3 shuffled, stratified folds.
folds, SEED = 3, 793
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
# Empty table; later cells add one column of test-set predictions per model.
score = pd.DataFrame()

In [None]:
Trial = 0  # running count of objective calls, shown in the per-trial banner


def objective(trial, x=x_train, y=y_train):
    """Optuna objective: out-of-fold ROC AUC of a Random Forest.

    Draws one hyper-parameter set from `trial`, fits a forest on every fold of the
    module-level `kf` splitter and scores the stitched out-of-fold probabilities.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        x: Feature frame; defaults to the `x_train` that existed when this cell ran.
        y: Target series aligned with `x`; defaults to `y_train` likewise.

    Returns:
        ROC AUC of the out-of-fold class-1 probabilities over all rows of `x`.
    """
    global Trial

    # Keyword order matches the original dict so the suggest_* calls run in the same order.
    para = dict(
        n_estimators=trial.suggest_int('n_estimators', 10, 1000),
        criterion='entropy',
        max_depth=trial.suggest_int('max_depth', 3, 200),
        min_samples_split=trial.suggest_float('min_samples_split', 1e-4, 1e-1),
        min_samples_leaf=trial.suggest_float('min_samples_leaf', 1e-4, 1e-1),
        max_features=trial.suggest_categorical("max_features", ['sqrt', 'log2']),
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 8, 10000),
        min_impurity_decrease=trial.suggest_float('min_impurity_decrease', 0, 0.1),
        bootstrap=True,
        n_jobs=-1,
        verbose=True,
        class_weight=trial.suggest_categorical("class_weight", ['balanced', 'balanced_subsample']),
        max_samples=trial.suggest_float('max_samples', 0.5, 1),
        random_state=SEED,
    )

    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial += 1

    oof_preds = np.zeros(len(y),)
    for fold_no, (fit_idx, held_idx) in enumerate(kf.split(x, y), start=1):
        print("--> Fold {}".format(fold_no))
        x_fit, x_held = x.iloc[fit_idx], x.iloc[held_idx]
        y_fit, y_held = y.iloc[fit_idx], y.iloc[held_idx]

        forest = RandomForestClassifier(**para).fit(x_fit, y_fit)
        proba_fit = forest.predict_proba(x_fit)[:, 1]
        proba_held = forest.predict_proba(x_held)[:, 1]
        # Each row is predicted by the one model that did not train on it.
        oof_preds[held_idx] = proba_held

        auc_fit = roc_auc_score(y_fit, proba_fit)
        auc_held = roc_auc_score(y_held, proba_held)
        print('Fold {} AUC Train: {} Validation: {}'.format(fold_no, auc_fit, auc_held))

    auc = roc_auc_score(y, oof_preds)
    print('OOF AUC: {}'.format(auc))
    return auc

In [None]:
# Seed the sampler explicitly: without it the sampler is seeded randomly on every
# run, so the sequence of suggested hyper-parameters could not be reproduced.
study = optuna.create_study(
    study_name="Random Forest Optimization",
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

In [None]:
# The objective returns the out-of-fold ROC AUC, so report it as AUC
# (it used to be printed under the label "Accuracy").
trial = study.best_trial
print('Best OOF AUC: {}'.format(trial.value))

In [None]:
# Hyper-parameters sampled by the best trial (only the values Optuna suggested).
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Objective value of every trial in the Random Forest study, in trial order.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Objective value against each sampled hyper-parameter, one panel per parameter.
optuna.visualization.plot_slice(study)

In [None]:
# Show every trial, then persist the table as CSV and the whole study as a pickle.
trials_table = study.trials_dataframe()
print(trials_table)
trials_table.to_csv("trial_parameters.csv", index=False)

with open('./study.pickle', 'wb') as f:
    pickle.dump(study, f)

In [None]:
# CV setup for the tuned Random Forest fit: 3 shuffled, stratified folds, new seed.
folds, SEED = 3, 5661
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
score = pd.DataFrame()  # reset the per-model prediction table

<h2 style="background-color:azure; text-align:center; font-size:200%">5.1.2. Random Forest tuned model</h2>

In [None]:
# Fixed Random Forest settings for the final cross-validated fit.
para = dict(
    n_estimators=898,
    criterion='entropy',
    max_depth=8,
    max_leaf_nodes=9058,
    min_samples_split=0.0671810090247945,
    min_samples_leaf=0.04742472303688006,
    max_features='sqrt',
    min_impurity_decrease=0.00010583321874846287,
    bootstrap=True,
    n_jobs=-1,
    verbose=True,
    class_weight='balanced_subsample',
    max_samples=0.8634669615516827,
    random_state=SEED,
)

In [None]:
# Out-of-fold predictions for the training split, plus fold-averaged predictions for
# the hold-out split (x_test) and the competition test frame (test).
rf_train_preds = np.zeros(len(y_train),)
rf_test_preds = np.zeros(len(y_test), )
rf_TEST_preds = np.zeros(len(test), )
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    rf=RandomForestClassifier(**para)

    model =  rf.fit(xtrain, ytrain)
    pred_train = model.predict_proba(xtrain)[:,1]
    pred_val = model.predict_proba(xval)[:,1]
    pred_test = model.predict_proba(x_test)[:,1]
    pred_TEST = model.predict_proba(test)[:,1]
    # Validation rows are filled in once; the other two arrays average over the folds.
    rf_train_preds[val_ind]=pred_val
    rf_test_preds+= pred_test/folds
    rf_TEST_preds+= pred_TEST/folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    # score3 (hold-out AUC of this fold's model) used to be computed and discarded;
    # it is now reported with the other two metrics.
    print('Fold {} AUC Train: {:.4f} Validation: {:.4f} Test: {:.4f}'.format(fold+1, score1, score2, score3))


In [None]:
# Overall AUC: out-of-fold on the training split, fold-averaged on the hold-out split.
oof_auc = roc_auc_score(y_train, rf_train_preds)
holdout_auc = roc_auc_score(y_test, rf_test_preds)
print('OOF AUC Train: {:.2f} Test: {:.2f}'.format(oof_auc, holdout_auc))

# Keep the fold-averaged competition-test probabilities for the submission cell.
score['rf'] = rf_TEST_preds

In [None]:
# Reload the original competition test file; its PassengerId column (dropped from
# `test` earlier) is needed to build the submission.
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# Build the submission: attach the ids, threshold the averaged probabilities at 0.5
# into hard 0/1 labels, and write the result to CSV.
df = pd.DataFrame()
df['PassengerId'] = test_['PassengerId'].values
df['Survived'] = score['rf']
df['Survived'] = (df['Survived'] >= 0.5).astype(int)
df.to_csv('./rf_tuned.csv', index=False)

<h2 style="background-color:azure; text-align:center; font-size:200%">5.1.3. Random Forest pseudo labelling</h2>

In [None]:
# Load 0/1 predictions whose 'Survived' column is used below as pseudo-labels for the
# test rows. NOTE(review): presumably the rf_tuned.csv written by the tuned Random
# Forest above, re-uploaded as an input dataset — confirm.
test_rf=pd.read_csv('../input/score/rf_tuned.csv')

In [None]:
# CV setup for the pseudo-labelling run: 5 shuffled, stratified folds.
folds, SEED = 5, 793
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
score = pd.DataFrame()  # reset the per-model prediction table

In [None]:
# Rebuild features and target from the labelled training frame only.
features = train.columns[1:]
X, y = train[features], train['Survived']

# Seeded 90/10 split of the labelled rows.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=39,
)

In [None]:
# Pseudo-labelling: extend the labelled data with the test rows, using the
# predictions in test_rf['Survived'] as their labels.
# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
# Building from `train` rather than appending to X / y in place keeps the cell
# idempotent: re-running it no longer stacks the test rows a second time.
X = pd.concat([train[features], test])
y = pd.concat([train['Survived'], test_rf['Survived']])

# NOTE(review): the 10% hold-out drawn here can contain pseudo-labelled test rows, so
# AUC on x_test / y_test is partly measured against model-generated labels.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=39)

In [None]:
# Row counts of the current training and hold-out splits.
len(x_train), len(x_test)

In [None]:
# Fixed Random Forest settings for the pseudo-labelled fit (4 worker processes).
para = dict(
    n_estimators=898,
    criterion='entropy',
    max_depth=8,
    max_leaf_nodes=9058,
    min_samples_split=0.0671810090247945,
    min_samples_leaf=0.04742472303688006,
    max_features='sqrt',
    min_impurity_decrease=0.00010583321874846287,
    bootstrap=True,
    n_jobs=4,
    verbose=True,
    class_weight='balanced_subsample',
    max_samples=0.8634669615516827,
    random_state=SEED,
)

In [None]:
# Out-of-fold predictions for the training split, plus fold-averaged predictions for
# the hold-out split (x_test) and the competition test frame (test).
rf_pseudo_train_preds = np.zeros(len(y_train),)
rf_pseudo_test_preds = np.zeros(len(y_test), )
rf_pseudo_TEST_preds = np.zeros(len(test), )
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    rf=RandomForestClassifier(**para)

    model =  rf.fit(xtrain, ytrain)
    pred_train = model.predict_proba(xtrain)[:,1]
    pred_val = model.predict_proba(xval)[:,1]
    pred_test = model.predict_proba(x_test)[:,1]
    pred_TEST = model.predict_proba(test)[:,1]
    # Validation rows are filled in once; the other two arrays average over the folds.
    rf_pseudo_train_preds[val_ind]=pred_val
    rf_pseudo_test_preds+= pred_test/folds
    rf_pseudo_TEST_preds+= pred_TEST/folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    # score3 (hold-out AUC of this fold's model) used to be computed and discarded;
    # it is now reported with the other two metrics.
    print('Fold {} AUC Train: {:.5f} Validation: {:.5f} Test: {:.5f}'.format(fold+1, score1, score2, score3))

In [None]:
# Overall AUC: out-of-fold on the training split, fold-averaged on the hold-out split.
oof_auc = roc_auc_score(y_train, rf_pseudo_train_preds)
holdout_auc = roc_auc_score(y_test, rf_pseudo_test_preds)
print('OOF AUC Train: {:.2f} Test: {:.2f}'.format(oof_auc, holdout_auc))

# Keep the fold-averaged competition-test probabilities for the submission cell.
score['rf_pseudo'] = rf_pseudo_TEST_preds

In [None]:
# Reload the original competition test file; its PassengerId column is used to
# build the submission.
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# Build the submission: attach the ids, threshold the averaged probabilities at 0.5
# into hard 0/1 labels, and write the result to CSV.
df = pd.DataFrame()
df['PassengerId'] = test_['PassengerId'].values
df['Survived'] = score['rf_pseudo']
df['Survived'] = (df['Survived'] >= 0.5).astype(int)
df.to_csv('./rf_pseudo_tuned1.csv', index=False)

<h2 style="background-color:azure; text-align:center; font-size:200%">5.1.4. Random Forest pseudo labelling pruning</h2>

In [None]:
# CV setup for the pruning search: 3 shuffled, stratified folds.
folds, SEED = 3, 793
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
score = pd.DataFrame()  # reset the per-model prediction table

In [None]:
# Go back to the labelled training rows only for the ccp_alpha search.
features = train.columns[1:]
X, y = train[features], train['Survived']

# Seeded 80/20 split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=39,
)

In [None]:
Trial = 0  # running count of objective calls, shown in the per-trial banner


def objective(trial, x=x_train, y=y_train):
    """Optuna objective that searches only `ccp_alpha` of the Random Forest.

    Every other forest setting is fixed inside this function; `ccp_alpha` is drawn
    from [0, 0.1]. One forest is fitted per fold of the module-level `kf` splitter
    and the stitched out-of-fold probabilities are scored.

    Args:
        trial: Optuna trial used to sample `ccp_alpha`.
        x: Feature frame; defaults to the `x_train` that existed when this cell ran.
        y: Target series aligned with `x`; defaults to `y_train` likewise.

    Returns:
        ROC AUC of the out-of-fold class-1 probabilities over all rows of `x`.
    """
    global Trial

    para = dict(
        n_estimators=898,
        criterion='entropy',
        max_depth=8,
        max_leaf_nodes=9058,
        min_samples_split=0.0671810090247945,
        min_samples_leaf=0.04742472303688006,
        max_features='sqrt',
        min_impurity_decrease=0.00010583321874846287,
        bootstrap=True,
        n_jobs=4,
        verbose=True,
        class_weight='balanced_subsample',
        max_samples=0.8634669615516827,
        random_state=SEED,
        ccp_alpha=trial.suggest_float('ccp_alpha', 0, 1e-1),
    )

    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial += 1

    oof_preds = np.zeros(len(y),)
    for fold_no, (fit_idx, held_idx) in enumerate(kf.split(x, y), start=1):
        print("--> Fold {}".format(fold_no))
        x_fit, x_held = x.iloc[fit_idx], x.iloc[held_idx]
        y_fit, y_held = y.iloc[fit_idx], y.iloc[held_idx]

        forest = RandomForestClassifier(**para).fit(x_fit, y_fit)
        proba_fit = forest.predict_proba(x_fit)[:, 1]
        proba_held = forest.predict_proba(x_held)[:, 1]
        # Each row is predicted by the one model that did not train on it.
        oof_preds[held_idx] = proba_held

        auc_fit = roc_auc_score(y_fit, proba_fit)
        auc_held = roc_auc_score(y_held, proba_held)
        print('Fold {} AUC Train: {} Validation: {}'.format(fold_no, auc_fit, auc_held))

    auc = roc_auc_score(y, oof_preds)
    print('OOF AUC: {}'.format(auc))
    return auc

In [None]:
# Seed the sampler explicitly: without it the sampler is seeded randomly on every
# run, so the sequence of suggested ccp_alpha values could not be reproduced.
study = optuna.create_study(
    study_name="Random Forest pseudo labelling Optimization",
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

In [None]:
# The objective returns the out-of-fold ROC AUC, so report it as AUC
# (it used to be printed under the label "Accuracy").
trial = study.best_trial
print('Best OOF AUC: {}'.format(trial.value))

In [None]:
# Hyper-parameters sampled by the best trial (only the values Optuna suggested).
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Pseudo-labelling: extend the labelled data with the test rows, using the
# predictions in test_rf['Survived'] as their labels.
# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
# Building from `train` rather than appending to X / y in place keeps the cell
# idempotent: re-running it no longer stacks the test rows a second time.
X = pd.concat([train[features], test])
y = pd.concat([train['Survived'], test_rf['Survived']])

# NOTE(review): the 10% hold-out drawn here can contain pseudo-labelled test rows, so
# AUC on x_test / y_test is partly measured against model-generated labels.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=39)

In [None]:
# Row counts of the current training and hold-out splits.
len(x_train), len(x_test)

In [None]:
# CV setup for the pruned pseudo-labelled fit: 3 shuffled, stratified folds.
folds, SEED = 3, 793
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
score = pd.DataFrame()  # reset the per-model prediction table

In [None]:
# Fixed Random Forest settings for the pruned fit; identical to the earlier
# pseudo-labelling settings apart from the added ccp_alpha.
para = dict(
    n_estimators=898,
    criterion='entropy',
    max_depth=8,
    max_leaf_nodes=9058,
    min_samples_split=0.0671810090247945,
    min_samples_leaf=0.04742472303688006,
    max_features='sqrt',
    min_impurity_decrease=0.00010583321874846287,
    bootstrap=True,
    n_jobs=4,
    verbose=True,
    class_weight='balanced_subsample',
    max_samples=0.8634669615516827,
    ccp_alpha=0.0004776691764536976,
    random_state=SEED,
)

In [None]:
# Out-of-fold predictions for the training split, plus fold-averaged predictions for
# the hold-out split (x_test) and the competition test frame (test).
rf_pseudo1_train_preds = np.zeros(len(y_train),)
rf_pseudo1_test_preds = np.zeros(len(y_test), )
rf_pseudo1_TEST_preds = np.zeros(len(test), )
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    rf=RandomForestClassifier(**para)

    model =  rf.fit(xtrain, ytrain)
    pred_train = model.predict_proba(xtrain)[:,1]
    pred_val = model.predict_proba(xval)[:,1]
    pred_test = model.predict_proba(x_test)[:,1]
    pred_TEST = model.predict_proba(test)[:,1]
    # Validation rows are filled in once; the other two arrays average over the folds.
    rf_pseudo1_train_preds[val_ind]=pred_val
    rf_pseudo1_test_preds+= pred_test/folds
    rf_pseudo1_TEST_preds+= pred_TEST/folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    # score3 (hold-out AUC of this fold's model) used to be computed and discarded;
    # it is now reported with the other two metrics.
    print('Fold {} AUC Train: {:.5f} Validation: {:.5f} Test: {:.5f}'.format(fold+1, score1, score2, score3))

In [None]:
# Overall AUC: out-of-fold on the training split, fold-averaged on the hold-out split.
oof_auc = roc_auc_score(y_train, rf_pseudo1_train_preds)
holdout_auc = roc_auc_score(y_test, rf_pseudo1_test_preds)
print('OOF AUC Train: {:.2f} Test: {:.2f}'.format(oof_auc, holdout_auc))

# Keep the fold-averaged competition-test probabilities for the submission cell.
score['rf_pseudo_tuned_pruned'] = rf_pseudo1_TEST_preds

In [None]:
# Build the submission: attach the ids (test_ comes from an earlier cell), threshold
# the averaged probabilities at 0.5 into hard 0/1 labels, and write the result to CSV.
df = pd.DataFrame()
df['PassengerId'] = test_['PassengerId'].values
df['Survived'] = score['rf_pseudo_tuned_pruned']
df['Survived'] = (df['Survived'] >= 0.5).astype(int)
df.to_csv('./rf_pseudo_tuned_pruned.csv', index=False)

<h2 style="background-color:azure; text-align:center; font-size:200%">5.2.1. AdaBoost tuning</h2>

In [None]:
Trial = 0  # running count of objective calls, shown in the per-trial banner


def objective(trial, x=x_train, y=y_train):
    """Optuna objective: out-of-fold ROC AUC of an AdaBoost classifier.

    Samples `n_estimators` and `learning_rate`, fits one model per fold of the
    module-level `kf` splitter and scores the stitched out-of-fold probabilities.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        x: Feature frame; defaults to the `x_train` that existed when this cell ran.
        y: Target series aligned with `x`; defaults to `y_train` likewise.

    Returns:
        ROC AUC of the out-of-fold class-1 probabilities over all rows of `x`.

    NOTE(review): run top-to-bottom, the `x_train` / `y_train` captured as defaults
    are the pseudo-labelled split created in the pruning section — confirm that
    tuning on pseudo-labelled rows is intended.
    """
    global Trial

    # Keyword order matches the original dict so the suggest_* calls run in the same order.
    para = dict(
        n_estimators=trial.suggest_int('n_estimators', 10, 10000),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 3),
        algorithm='SAMME.R',
        random_state=SEED,
    )

    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial += 1

    oof_preds = np.zeros(len(y),)
    for fold_no, (fit_idx, held_idx) in enumerate(kf.split(x, y), start=1):
        print("--> Fold {}".format(fold_no))
        x_fit, x_held = x.iloc[fit_idx], x.iloc[held_idx]
        y_fit, y_held = y.iloc[fit_idx], y.iloc[held_idx]

        booster = AdaBoostClassifier(**para).fit(x_fit, y_fit)
        proba_fit = booster.predict_proba(x_fit)[:, 1]
        proba_held = booster.predict_proba(x_held)[:, 1]
        # Each row is predicted by the one model that did not train on it.
        oof_preds[held_idx] = proba_held

        auc_fit = roc_auc_score(y_fit, proba_fit)
        auc_held = roc_auc_score(y_held, proba_held)
        print('Fold {} AUC Train: {} Validation: {}'.format(fold_no, auc_fit, auc_held))

    auc = roc_auc_score(y, oof_preds)
    print('OOF AUC: {}'.format(auc))
    return auc

In [None]:
# Seed the sampler explicitly: without it the sampler is seeded randomly on every
# run, so the sequence of suggested hyper-parameters could not be reproduced.
study = optuna.create_study(
    study_name="AdaBoost Optimization",
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=10)

In [None]:
# The objective returns the out-of-fold ROC AUC, so report it as AUC
# (it used to be printed under the label "Accuracy").
trial = study.best_trial
print('Best OOF AUC: {}'.format(trial.value))

In [None]:
# Hyper-parameters sampled by the best trial (only the values Optuna suggested).
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Objective value of every trial in the AdaBoost study, in trial order.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Objective value against each sampled hyper-parameter, one panel per parameter.
optuna.visualization.plot_slice(study)

In [None]:
# Show every trial, then persist the table as CSV and the whole study as a pickle.
# The files carry an "ad_" prefix because the Random Forest section already writes
# trial_parameters.csv and study.pickle; reusing those names overwrote its results.
ad_trials = study.trials_dataframe()
print(ad_trials)
ad_trials.to_csv("ad_trial_parameters.csv", index=False)
with open('./ad_study.pickle', 'wb') as f:
    pickle.dump(study, f)

<h2 style="background-color:azure; text-align:center; font-size:200%">5.2.2. AdaBoost tuned model</h2>

In [None]:
# Fixed AdaBoost settings for the final cross-validated fit.
para = dict(
    n_estimators=3291,
    learning_rate=0.06936047632110985,
    algorithm='SAMME.R',
    random_state=SEED,
)

In [None]:
# Out-of-fold predictions for the training split, plus fold-averaged predictions for
# the hold-out split (x_test) and the competition test frame (test).
# NOTE(review): run top-to-bottom, x_train / y_train here are still the pseudo-labelled
# split from the pruning section — confirm that is the intended training data.
ad_train_preds = np.zeros(len(y_train),)
ad_test_preds = np.zeros(len(y_test), )
ad_TEST_preds = np.zeros(len(test), )
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    ad=AdaBoostClassifier(**para)

    model =  ad.fit(xtrain, ytrain)
    pred_train = model.predict_proba(xtrain)[:,1]
    pred_val = model.predict_proba(xval)[:,1]
    pred_test = model.predict_proba(x_test)[:,1]
    pred_TEST = model.predict_proba(test)[:,1]
    # Validation rows are filled in once; the other two arrays average over the folds.
    ad_train_preds[val_ind]=pred_val
    ad_test_preds+= pred_test/folds
    ad_TEST_preds+= pred_TEST/folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    # score3 (hold-out AUC of this fold's model) used to be computed and discarded;
    # it is now reported with the other two metrics.
    print('Fold {} AUC Train: {:.4f} Validation: {:.4f} Test: {:.4f}'.format(fold+1, score1, score2, score3))

In [None]:
# Overall AUC: out-of-fold on the training split, fold-averaged on the hold-out split.
oof_auc = roc_auc_score(y_train, ad_train_preds)
holdout_auc = roc_auc_score(y_test, ad_test_preds)
print('OOF AUC Train: {:.5f} Test: {:.5f}'.format(oof_auc, holdout_auc))

# Keep the fold-averaged competition-test probabilities for the submission cell.
score['ad'] = ad_TEST_preds

In [None]:
# Reload the original competition test file; its PassengerId column is used to
# build the submission.
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# Build the submission: attach the ids, threshold the averaged probabilities at 0.5
# into hard 0/1 labels, and write the result to CSV.
df = pd.DataFrame()
df['PassengerId'] = test_['PassengerId'].values
df['Survived'] = score['ad']
df['Survived'] = (df['Survived'] >= 0.5).astype(int)
df.to_csv('./ad_tuned.csv', index=False)

<h2 style="background-color:azure; text-align:center; font-size:200%">5.3.1. XGBoost tuning</h2>

In [None]:
# Back to the labelled training rows only: features are every column after the first.
features = train.columns[1:]
X, y = train[features], train['Survived']

# Seeded 80/20 split for the XGBoost search.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=39,
)

In [None]:
# CV setup for the XGBoost search: 3 shuffled, stratified folds.
folds, SEED = 3, 793
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
score = pd.DataFrame()  # reset the per-model prediction table

In [None]:
Trial=0  # running count of objective calls, shown in the per-trial banner
def objective(trial, x=x_train, y=y_train):
    """Optuna objective: out-of-fold ROC AUC of an XGBoost classifier.

    Samples one hyper-parameter set from `trial`, fits one model per fold of the
    module-level `kf` splitter (early-stopped on that fold's validation part) and
    scores the stitched out-of-fold probabilities.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        x: Feature frame; defaults to the `x_train` that existed when this cell ran.
        y: Target series aligned with `x`; defaults to `y_train` likewise.

    Returns:
        ROC AUC of the out-of-fold class-1 probabilities over all rows of `x`.
    """
    global Trial
    
    para={
        'verbosity': 1,
        'eval_metric': "auc",
        'tree_method':trial.suggest_categorical("tree_method", ['exact', 'approx', 'hist']),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform and
        # samples from the same log-scaled range.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators',500, 20000),
        'max_depth': trial.suggest_int('max_depth', 5, 1000),
        'random_state': 2021,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 1000),
    }
    
    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial=Trial + 1
    xgboost_train_preds = np.zeros(len(y),)
    for fold, (train_ind, val_ind) in enumerate(kf.split(x, y)):
        print("--> Fold {}".format(fold + 1))
        xtrain, xval = x.iloc[train_ind], x.iloc[val_ind]
        ytrain, yval = y.iloc[train_ind], y.iloc[val_ind]
        xgboost = XGBClassifier(**para)
        
        # The validation fold drives early stopping and is also the fold being scored.
        # NOTE(review): recent XGBoost releases deprecate passing early_stopping_rounds
        # to fit() — confirm against the installed version.
        model =  xgboost.fit(xtrain, ytrain, eval_set=[(xval, yval)], verbose=1, early_stopping_rounds=50)
        pred_train = model.predict_proba(xtrain)[:,1]
        pred_val = model.predict_proba(xval)[:,1]
        # Each row is predicted by the one model that did not train on it.
        xgboost_train_preds[val_ind]=pred_val
        score1 = roc_auc_score(ytrain, pred_train)
        score2 = roc_auc_score(yval, pred_val)
        print('Fold {} AUC Train: {} Validation: {}'.format(fold+1, score1, score2))
    
    auc=roc_auc_score(y, xgboost_train_preds)
    print('OOF AUC: {}'.format(auc))
    return auc

In [None]:
# Seed the sampler explicitly: without it the sampler is seeded randomly on every
# run, so the sequence of suggested hyper-parameters could not be reproduced.
study = optuna.create_study(
    study_name="XGBoost Optimization",
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=15)

In [None]:
# The objective returns the out-of-fold ROC AUC, so report it as AUC
# (it used to be printed under the label "Accuracy").
trial = study.best_trial
print('Best OOF AUC: {}'.format(trial.value))

In [None]:
# Hyper-parameters sampled by the best trial (only the values Optuna suggested).
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Objective value of every trial in the XGBoost study, in trial order.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Objective value against each sampled hyper-parameter, one panel per parameter.
optuna.visualization.plot_slice(study)

<h2 style="background-color:azure; text-align:center; font-size:200%">5.3.2. XGBoost tuned model</h2>

In [None]:
# Features are every column after the first; the target is 'Survived'.
features = train.columns[1:]
X, y = train[features], train['Survived']

# Seeded 80/20 split for the tuned XGBoost fit.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=39,
)

In [None]:
# CV setup for the tuned XGBoost fit: 10 shuffled, stratified folds.
folds, SEED = 10, 793
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
score = pd.DataFrame()  # reset the per-model prediction table

In [None]:
# Fixed XGBoost settings for the final cross-validated fit, one key per line.
# Kept as a dict literal because 'lambda' is a Python keyword and cannot be a kwarg.
para = {
    'eval_metric': "auc",
    'tree_method': 'hist',
    'lambda': 0.015235209064507677,
    'alpha': 0.015624821169143542,
    'colsample_bytree': 0.9102330759766059,
    'subsample': 0.5282153709855245,
    'learning_rate': 0.023541287073875223,
    'n_estimators': 14379,
    'max_depth': 794,
    'min_child_weight': 161,
    'random_state': 2021,
}

In [None]:
# Out-of-fold predictions for the training split, plus fold-averaged predictions for
# the hold-out split (x_test) and the competition test frame (test).
xgboost_train_preds = np.zeros(len(y_train),)
xgboost_test_preds = np.zeros(len(y_test), )
xgboost_TEST_preds = np.zeros(len(test), )
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    xgboost = XGBClassifier(**para)
        
    # The validation fold drives early stopping and is also the fold being scored.
    # NOTE(review): recent XGBoost releases deprecate passing early_stopping_rounds
    # to fit() — confirm against the installed version.
    model =  xgboost.fit(xtrain, ytrain, eval_set=[(xval, yval)], verbose=1, early_stopping_rounds=50)
    pred_train = model.predict_proba(xtrain)[:,1]
    pred_val = model.predict_proba(xval)[:,1]
    pred_test = model.predict_proba(x_test)[:,1]
    pred_TEST = model.predict_proba(test)[:,1]
    # Validation rows are filled in once; the other two arrays average over the folds.
    xgboost_train_preds[val_ind]=pred_val
    xgboost_test_preds+= pred_test/folds
    xgboost_TEST_preds+= pred_TEST/folds
    score1 = roc_auc_score(ytrain, pred_train)
    score2 = roc_auc_score(yval, pred_val)
    score3 = roc_auc_score(y_test, pred_test)
    # score3 (hold-out AUC of this fold's model) used to be computed and discarded;
    # it is now reported with the other two metrics.
    print('Fold {} AUC Train: {:.4f} Validation: {:.4f} Test: {:.4f}'.format(fold+1, score1, score2, score3))

In [None]:
# Overall AUC: out-of-fold on the training split, fold-averaged on the hold-out split.
oof_auc = roc_auc_score(y_train, xgboost_train_preds)
holdout_auc = roc_auc_score(y_test, xgboost_test_preds)
print('OOF AUC Train: {:.5f} Test: {:.5f}'.format(oof_auc, holdout_auc))

# Keep the fold-averaged competition-test probabilities for the submission cell.
score['xgboost'] = xgboost_TEST_preds

In [None]:
# Reload the original competition test file; its PassengerId column is used to
# build the submission.
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# Build the submission: attach the ids, threshold the averaged probabilities at 0.5
# into hard 0/1 labels, and write the result to CSV.
df = pd.DataFrame()
df['PassengerId'] = test_['PassengerId'].values
df['Survived'] = score['xgboost']
df['Survived'] = (df['Survived'] >= 0.5).astype(int)
df.to_csv('./xgboost_tuned.csv', index=False)