In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns

import optuna

import random

import warnings

warnings.filterwarnings('ignore')

<h2 style="background-color:azure; text-align:center; font-size:300%">Level1 Classification</h2>

<h2 style="background-color:azure; text-align:center; font-size:200%">XGBOOST</h2>

In [None]:
train1=pd.read_csv('../input/training-apr/train.csv')
test1=pd.read_csv('../input/training-apr/test.csv')

train2=train1.copy()
test2=test1.copy()

train1.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_cat'], inplace=True)
test1.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_cat'], inplace=True)

train2.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_log'], inplace=True)
test2.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_log'], inplace=True)

print("set 1:", train1.columns, test1.columns)
print("set 2:", train2.columns, test2.columns)

In [None]:
print("is finite set1 Train:{} Test:{}".format(np.isfinite(train1[['Age','Fare_log','related_log']]).any()[0],np.isfinite(test1[['Age','Fare_log','related_log']]).any()[0]))
print("is finite set2 Train:{} Test:{}".format(np.isfinite(train2[['Age','Fare_log']]).any()[0],np.isfinite(test2[['Age','Fare_log']]).any()[0]))

print("is nan set1 Train:{} Test:{}".format(sum(train1.isnull().sum()),sum(test1.isnull().sum())))
print("is nan set2 Train:{} Test:{}".format(sum(train2.isnull().sum()),sum(train2.isnull().sum())))

In [None]:
score_train=pd.DataFrame(columns=['xg','rf','lg','cb']) # train data
score_test=pd.DataFrame(columns=['xg','rf','lg','cb']) # leadeboard data

<h2 style="background-color:azure; text-align:center; font-size:200%">XGBOOST set1</h2>

In [None]:
ohe=OneHotEncoder()
col=['Sex','Embarked']
ohe.fit(train1[col])
print(ohe.get_feature_names(col))
df1=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(train1[col]).toarray())
df2=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(test1[col]).toarray())

train1=train1.join(df1)
test1=test1.join(df2)

train1.drop(columns=['Sex','Embarked'], inplace=True)
test1.drop(columns=['Sex','Embarked'], inplace=True)

In [None]:
train1.head()

In [None]:
test1.head()

In [None]:
folds=5
SEED=random.randint(937,8641)

kf=StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

features=train1.columns[1:]
X = train1[features]
y = train1['Survived']


imbalanced_ratio=(train1[train1['Survived']==0]['Survived'].count()/train1[train1['Survived']==1]['Survived'].count()).round(2)
print("Imbalnce ratio: {:}".format(imbalanced_ratio))
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED)

print("Distribution of train and test:", len(x_train), len(y_train), len(x_test), len(x_test))

In [None]:
Trial=0
def objective(trial, x=x_train, y=y_train):
    global Trial
    
    para={
        'verbosity': 1,
        'objective': 'binary:logistic',
        'random_state': SEED,
        'seed': SEED,
        'tree_method':'hist',
        'scale_pos_weight': imbalanced_ratio,
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators',500, 20000),
        'max_depth': trial.suggest_int('max_depth', 1, 31),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 1000)
    }
    
    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial=Trial + 1
    xgboost_train_preds = np.zeros(len(y),)
    for fold, (train_ind, val_ind) in enumerate(kf.split(x, y)):
        print("--> Fold {}".format(fold + 1))
        xtrain, xval = x.iloc[train_ind], x.iloc[val_ind]
        ytrain, yval = y.iloc[train_ind], y.iloc[val_ind]
        xgboost = XGBClassifier(**para)
        
        model =  xgboost.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], 
                             eval_metric=["error", "logloss"],verbose=0, early_stopping_rounds=50)
        pred_train = model.predict(xtrain)
        pred_val = model.predict(xval)
        xgboost_train_preds[val_ind]=pred_val
        score1 = accuracy_score(ytrain, pred_train)
        score2 = accuracy_score(yval, pred_val)
        print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))
        
        results = model.evals_result()
        df=pd.DataFrame({
                        "validation_train_ll":results["validation_0"]["logloss"],
                        "validation_test_ll":results["validation_1"]["logloss"],
                        "validation_train_acc":results["validation_0"]["error"],
                        "validation_test_acc":results["validation_1"]["error"],
                        
        })
        df['validation_train_acc']=(1-df['validation_train_acc'])*100.0
        df['validation_test_acc']=(1-df['validation_test_acc'])*100.0
#         print(df.head())
        
        fig, ax = plt.subplots(ncols=2, figsize=(12,6))
        sns.lineplot(data=df, x=df.index, y="validation_train_ll", ax=ax[0], label="Train loss")
        sns.lineplot(data=df, x=df.index, y="validation_test_ll", ax=ax[0], label="Test loss")
        sns.lineplot(data=df, x=df.index, y="validation_train_acc", ax=ax[1], label="Train acc.")
        sns.lineplot(data=df, x=df.index, y="validation_test_acc", ax=ax[1], label="Test acc.")
        ax[0].set_title("Loss curve")
        ax[1].set_title("Accuracy curve")
        ax[0].set_ylabel("Loss")
        ax[0].set_xlabel("Itertation")
        ax[1].set_ylabel("Accuracy")
        ax[1].set_xlabel("Itertation")
        fig.suptitle("XGBoost Loss/Accuracy.")
        plt.show()
    acc=accuracy_score(y, xgboost_train_preds)
    print('OOF ACCURACY: {}'.format(acc))
    return acc

In [None]:
study=optuna.create_study(study_name="XGBoost set 1 Optimization", direction='maximize')
study.optimize(objective, n_trials=25)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

print("Best hyperparameters: {}".format(trial.params))

In [None]:
#Tuning keeping the N-estimator same as the above best parameter
Trial=0
def objective(trial, x=x_train, y=y_train):
    global Trial
    
    para={
        'verbosity': 1,
        'objective': 'binary:logistic',
        'random_state': SEED,
        'seed': SEED,
        'tree_method':'hist',
        'scale_pos_weight': imbalanced_ratio,
        'n_estimators': 7657,
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 1, 31),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 1000)
    }
    
    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial=Trial + 1
    xgboost_train_preds = np.zeros(len(y),)
    for fold, (train_ind, val_ind) in enumerate(kf.split(x, y)):
        print("--> Fold {}".format(fold + 1))
        xtrain, xval = x.iloc[train_ind], x.iloc[val_ind]
        ytrain, yval = y.iloc[train_ind], y.iloc[val_ind]
        xgboost = XGBClassifier(**para)
        
        model =  xgboost.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], 
                             eval_metric=["error", "logloss"],verbose=0, early_stopping_rounds=50)
        pred_train = model.predict(xtrain)
        pred_val = model.predict(xval)
        xgboost_train_preds[val_ind]=pred_val
        score1 = accuracy_score(ytrain, pred_train)
        score2 = accuracy_score(yval, pred_val)
        print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))
        
        results = model.evals_result()
        df=pd.DataFrame({
                        "validation_train_ll":results["validation_0"]["logloss"],
                        "validation_test_ll":results["validation_1"]["logloss"],
                        "validation_train_acc":results["validation_0"]["error"],
                        "validation_test_acc":results["validation_1"]["error"],
                        
        })
        df['validation_train_acc']=(1-df['validation_train_acc'])*100.0
        df['validation_test_acc']=(1-df['validation_test_acc'])*100.0
#         print(df.head())
        
        fig, ax = plt.subplots(ncols=2, figsize=(12,6))
        sns.lineplot(data=df, x=df.index, y="validation_train_ll", ax=ax[0], label="Train loss")
        sns.lineplot(data=df, x=df.index, y="validation_test_ll", ax=ax[0], label="Test loss")
        sns.lineplot(data=df, x=df.index, y="validation_train_acc", ax=ax[1], label="Train acc.")
        sns.lineplot(data=df, x=df.index, y="validation_test_acc", ax=ax[1], label="Test acc.")
        ax[0].set_title("Loss curve")
        ax[1].set_title("Accuracy curve")
        ax[0].set_ylabel("Loss")
        ax[0].set_xlabel("Itertation")
        ax[1].set_ylabel("Accuracy")
        ax[1].set_xlabel("Itertation")
        fig.suptitle("XGBoost Loss/Accuracy.")
        plt.show()
    acc=accuracy_score(y, xgboost_train_preds)
    print('OOF ACCURACY: {}'.format(acc))
    return acc

In [None]:
study=optuna.create_study(study_name="XGBoost Optimization", direction='maximize')
study.optimize(objective, n_trials=15)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

print("Best hyperparameters: {}".format(trial.params))

<h2 style="background-color:azure; text-align:center; font-size:200%">XGBOOST tuned model set1</h2>

In [None]:
para = {
        'verbosity': 1,
        'objective': 'binary:logistic',
        'random_state': SEED,
        'seed': SEED,
        'tree_method':'hist',
        'scale_pos_weight': imbalanced_ratio,
        'lambda': 0.040336438299178316, 
        'alpha': 1.6115451006296893, 
        'colsample_bytree': 0.6421153349186888, 
        'subsample': 0.9948174596370332, 
        'learning_rate': 0.08617507766633564, 
        'n_estimators': 7657, 
        'max_depth': 18, 
        'min_child_weight': 114
      }

In [None]:
xg_train_preds = np.zeros(len(y_train),)
xg_test_preds = np.zeros(len(y_test),)
xg_test = np.zeros(len(test1),)
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    
    xgboost = XGBClassifier(**para)

    model = xgboost.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], 
                         eval_metric=["error", "logloss"], verbose=0, early_stopping_rounds=50)
    pred_train = model.predict_proba(xtrain)[:,1]
    pred_val = model.predict_proba(xval)[:,1]
    xg_train_preds[val_ind] = pred_val
    xg_test_preds += (model.predict_proba(x_test)[:,1])/folds
    xg_test += (model.predict_proba(test1)[:,1])/folds
    score1 = accuracy_score(ytrain, np.where(pred_train<=0.5, 0, 1))
    score2 = accuracy_score(yval, np.where(pred_val<=0.5, 0, 1))
    print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))

    results = model.evals_result()
    df=pd.DataFrame({
                    "validation_train_ll":results["validation_0"]["logloss"],
                    "validation_test_ll":results["validation_1"]["logloss"],
                    "validation_train_acc":results["validation_0"]["error"],
                    "validation_test_acc":results["validation_1"]["error"],

    })
    df['validation_train_acc']=(1-df['validation_train_acc'])*100.0
    df['validation_test_acc']=(1-df['validation_test_acc'])*100.0
    #         print(df.head())

    fig, ax = plt.subplots(ncols=2, figsize=(12,6))
    sns.lineplot(data=df, x=df.index, y="validation_train_ll", ax=ax[0], label="Train loss")
    sns.lineplot(data=df, x=df.index, y="validation_test_ll", ax=ax[0], label="Test loss")
    sns.lineplot(data=df, x=df.index, y="validation_train_acc", ax=ax[1], label="Train acc.")
    sns.lineplot(data=df, x=df.index, y="validation_test_acc", ax=ax[1], label="Test acc.")
    ax[0].set_title("Loss curve")
    ax[1].set_title("Accuracy curve")
    ax[0].set_ylabel("Loss")
    ax[0].set_xlabel("Itertation")
    ax[1].set_ylabel("Accuracy")
    ax[1].set_xlabel("Itertation")
    fig.suptitle("XGBoost Loss/Accuracy.")
    plt.show()
    
acc1 = accuracy_score(y_train, np.where(xg_train_preds<=0.5, 0, 1))
acc2 = accuracy_score(y_test, np.where(xg_test_preds<=0.5, 0, 1))
print('OOF ACCURACY Train: {} Test: {}'.format(acc1, acc2))

score_train['xg']=np.concatenate([xg_train_preds,xg_test_preds])
score_test['xg']=xg_test

In [None]:
score_train.head()

In [None]:
score_test.head()

In [None]:
score_train.to_csv('./score_train.csv',index=False)
score_test.to_csv('./score_test.csv',index=False)

In [None]:
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
df=pd.DataFrame()
df['PassengerId']=test_['PassengerId'].values
df['Survived']=score_test['xg']
df['Survived']=df['Survived'].apply(lambda x:0 if x<=0.5 else 1)
df.to_csv('./xg_tuned.csv',index=False)

<h2 style="background-color:azure; text-align:center; font-size:200%">XGBOOST set2</h2>

In [None]:
ohe=OneHotEncoder()
col=['Sex','Embarked']
ohe.fit(train2[col])
print(ohe.get_feature_names(col))
df1=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(train2[col]).toarray())
df2=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(test2[col]).toarray())

train2=train2.join(df1)
test2=test2.join(df2)

train2['related_cat'] = train2['related_cat'].apply(lambda x:1 if x in ['low'] else 2)
test2['related_cat'] = test2['related_cat'].apply(lambda x:1 if x in ['low'] else 2)

train2.drop(columns=['Sex','Embarked'], inplace=True)
test2.drop(columns=['Sex','Embarked'], inplace=True)

In [None]:
train2.head()

In [None]:
test2.head()

In [None]:
folds=5
SEED=random.randint(937,8641)

kf=StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

features=train2.columns[1:]
X = train2[features]
y = train2['Survived']


imbalanced_ratio=(train2[train2['Survived']==0]['Survived'].count()/train2[train2['Survived']==1]['Survived'].count()).round(2)
print("Imbalnce ratio: {:}".format(imbalanced_ratio))

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED)
print("Distribution of train and test:", len(x_train), len(y_train), len(x_test), len(x_test))

In [None]:
Trial=0
def objective(trial, x=x_train, y=y_train):
    global Trial
    
    para={
        'verbosity': 1,
        'objective': 'binary:logistic',
        'random_state': SEED,
        'seed': SEED,
        'tree_method':'hist',
        'scale_pos_weight': imbalanced_ratio,
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'n_estimators': trial.suggest_int('n_estimators',500, 20000),
        'max_depth': trial.suggest_int('max_depth', 1, 31),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 1000)
    }
    
    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial=Trial + 1
    xgboost_train_preds = np.zeros(len(y),)
    for fold, (train_ind, val_ind) in enumerate(kf.split(x, y)):
        callbacks=[early_stopping_round]
        print("--> Fold {}".format(fold + 1))
        xtrain, xval = x.iloc[train_ind], x.iloc[val_ind]
        ytrain, yval = y.iloc[train_ind], y.iloc[val_ind]
        xgboost = XGBClassifier(**para)
        
        model =  xgboost.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], 
                             eval_metric=["error", "logloss"],verbose=0,callbacks=[early_stopping_round])
        pred_train = model.predict(xtrain)
        pred_val = model.predict(xval)
        xgboost_train_preds[val_ind]=pred_val
        score1 = accuracy_score(ytrain, pred_train)
        score2 = accuracy_score(yval, pred_val)
        print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))
        
        results = model.evals_result()
        df=pd.DataFrame({
                        "validation_train_ll":results["validation_0"]["logloss"],
                        "validation_test_ll":results["validation_1"]["logloss"],
                        "validation_train_acc":results["validation_0"]["error"],
                        "validation_test_acc":results["validation_1"]["error"],
                        
        })
        df['validation_train_acc']=(1-df['validation_train_acc'])*100.0
        df['validation_test_acc']=(1-df['validation_test_acc'])*100.0
#         print(df.head())
        
        fig, ax = plt.subplots(ncols=2, figsize=(12,6))
        sns.lineplot(data=df, x=df.index, y="validation_train_ll", ax=ax[0], label="Train loss")
        sns.lineplot(data=df, x=df.index, y="validation_test_ll", ax=ax[0], label="Test loss")
        sns.lineplot(data=df, x=df.index, y="validation_train_acc", ax=ax[1], label="Train acc.")
        sns.lineplot(data=df, x=df.index, y="validation_test_acc", ax=ax[1], label="Test acc.")
        ax[0].set_title("Loss curve")
        ax[1].set_title("Accuracy curve")
        ax[0].set_ylabel("Loss")
        ax[0].set_xlabel("Itertation")
        ax[1].set_ylabel("Accuracy")
        ax[1].set_xlabel("Itertation")
        fig.suptitle("XGBoost Loss/Accuracy.")
        plt.show()
    acc=accuracy_score(y, xgboost_train_preds)
    print('OOF ACCURACY: {}'.format(acc))
    return acc

In [None]:
study=optuna.create_study(study_name="XGBoost set 2 Optimization", direction='maximize')
study.optimize(objective, n_trials=50)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

print("Best hyperparameters: {}".format(trial.params))

<h2 style="background-color:azure; text-align:center; font-size:200%">XGBOOST set2 tuned model</h2>

In [None]:
para = {
        'verbosity': 1,
        'objective': 'binary:logistic',
        'random_state': SEED,
        'seed': SEED,
        'tree_method':'hist',
        'scale_pos_weight': imbalanced_ratio,
        'lambda': 0.23870865587316725, 
        'alpha': 0.05851635206035666, 
        'colsample_bytree': 0.1050482977664344, 
        'subsample': 0.9719852687976757, 
        'learning_rate': 0.06744568400126143, 
        'n_estimators': 19537, 'max_depth': 3, 
        'min_child_weight': 498
       }

In [None]:
xg_train_preds = np.zeros(len(y_train),)
xg_test_preds = np.zeros(len(y_test),)
xg_test = np.zeros(len(test2),)
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    
    xgboost = XGBClassifier(**para)

    model = xgboost.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], 
                         eval_metric=["error", "logloss"], verbose=0, early_stopping_rounds=50)
    pred_train = model.predict_proba(xtrain)[:,1]
    pred_val = model.predict_proba(xval)[:,1]
    xg_train_preds[val_ind] = pred_val
    xg_test_preds += (model.predict_proba(x_test)[:,1])/folds
    xg_test += (model.predict_proba(test2)[:,1])/folds
    score1 = accuracy_score(ytrain, np.where(pred_train<=0.5, 0, 1))
    score2 = accuracy_score(yval, np.where(pred_val<=0.5, 0, 1))
    print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))

    results = model.evals_result()
    df=pd.DataFrame({
                    "validation_train_ll":results["validation_0"]["logloss"],
                    "validation_test_ll":results["validation_1"]["logloss"],
                    "validation_train_acc":results["validation_0"]["error"],
                    "validation_test_acc":results["validation_1"]["error"],

    })
    df['validation_train_acc']=(1-df['validation_train_acc'])*100.0
    df['validation_test_acc']=(1-df['validation_test_acc'])*100.0
    #         print(df.head())

    fig, ax = plt.subplots(ncols=2, figsize=(12,6))
    sns.lineplot(data=df, x=df.index, y="validation_train_ll", ax=ax[0], label="Train loss")
    sns.lineplot(data=df, x=df.index, y="validation_test_ll", ax=ax[0], label="Test loss")
    sns.lineplot(data=df, x=df.index, y="validation_train_acc", ax=ax[1], label="Train acc.")
    sns.lineplot(data=df, x=df.index, y="validation_test_acc", ax=ax[1], label="Test acc.")
    ax[0].set_title("Loss curve")
    ax[1].set_title("Accuracy curve")
    ax[0].set_ylabel("Loss")
    ax[0].set_xlabel("Itertation")
    ax[1].set_ylabel("Accuracy")
    ax[1].set_xlabel("Itertation")
    fig.suptitle("XGBoost Loss/Accuracy.")
    plt.show()
    
acc1 = accuracy_score(y_train, np.where(xg_train_preds<=0.5, 0, 1))
acc2 = accuracy_score(y_test, np.where(xg_test_preds<=0.5, 0, 1))
print('OOF ACCURACY Train: {} Test: {}'.format(acc1, acc2))

# score_train['xg']=np.concatenate([xg_train_preds,xg_test_preds])
# score_test['xg']=xg_test

In [None]:
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
df=pd.DataFrame()
df['PassengerId']=test_['PassengerId'].values
df['Survived']=xg_test
df['Survived']=df['Survived'].apply(lambda x:0 if x<=0.5 else 1)
df.to_csv('./xg_tuned_set2.csv',index=False)

<span style="background-color:orange; font-size:150%">**Observation**</span>
* For XGBOOST train set 1 gave better results in public leaderboard compared to train set 2.  

<h2 style="background-color:azure; text-align:center; font-size:200%">LightGBM</h2>

In [None]:
train1=pd.read_csv('../input/training-apr/train.csv')
test1=pd.read_csv('../input/training-apr/test.csv')

train2=train1.copy()
test2=test1.copy()

train1.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_cat'], inplace=True)
test1.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_cat'], inplace=True)

train2.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_log'], inplace=True)
test2.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_log'], inplace=True)

print("set 1:", train1.columns, test1.columns)
print("set 2:", train2.columns, test2.columns)

<span style="background-color:orange; font-size:150%">**Important Point**</span>
* The cat features of lightgbm should only be used when you have high cardinality in categorical features. 
* It is common to represent categorical features with one-hot encoding, but this approach is suboptimal for tree learners. Particularly for high-cardinality categorical features, a tree built on one-hot features tends to be unbalanced and needs to grow very deep to achieve good accuracy.Instead of one-hot encoding, the optimal solution is to split on a categorical feature by partitioning its categories into 2 subsets. If the feature has k categories, there are 2^(k-1) - 1 possible partitions. But there is an efficient solution for regression trees. It needs about O(k * log(k)) to find the optimal partition.

<h2 style="background-color:azure; text-align:center; font-size:200%">LightGBM set1</h2>

In [None]:
ohe=OneHotEncoder()
col=['Sex','Embarked']
ohe.fit(train1[col])
print(ohe.get_feature_names(col))
df1=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(train1[col]).toarray())
df2=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(test1[col]).toarray())

train1=train1.join(df1)
test1=test1.join(df2)

train1.drop(columns=['Sex','Embarked'], inplace=True)
test1.drop(columns=['Sex','Embarked'], inplace=True)

In [None]:
train1.head()

In [None]:
test1.head()

In [None]:
folds=5
SEED=random.randint(937,8641)

kf=StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

features=train1.columns[1:]
X = train1[features]
y = train1['Survived']


imbalanced_ratio=(train1[train1['Survived']==0]['Survived'].count()/train1[train1['Survived']==1]['Survived'].count()).round(2)
print("Imbalnce ratio: {:}".format(imbalanced_ratio))
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED)

print("Distribution of train and test:", len(x_train), len(y_train), len(x_test), len(x_test))

In [None]:
Trial=0

def objective(trial, x=x_train, y=y_train):
    global Trial
    
    para = {
              'verbosity': 1,
              'random_state': SEED,
              'n_jobs': -1,
              'is_unbalance': True,
              'bagging_seed': SEED,
              'feature_fraction_seed': SEED,
              'objective': 'binary', 
              'boosting': trial.suggest_categorical('boosting', ['gbdt','rf']),
              'n_estimators': trial.suggest_int('n_estimators',500, 20000),
              'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
              'max_depth': trial.suggest_int('max_depth', 6, 127),
              'num_leaves': trial.suggest_int('num_leaves', 31, 128),
              'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
              'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
              'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 0.9),
              'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
              'bagging_freq': trial.suggest_int('bagging_freq', 50, 15000),
              'bagging_fraction': trial.suggest_float('bagging_fraction', 0, 0.9),
              'max_bin': trial.suggest_int('max_bin', 128, 1024)
            }
    
    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial=Trial + 1
    lgbm_train_preds = np.zeros(len(y),)
    
    for fold, (train_ind, val_ind) in enumerate(kf.split(x, y)):
        print("--> Fold {}".format(fold + 1))
        xtrain, xval = x.iloc[train_ind], x.iloc[val_ind]
        ytrain, yval = y.iloc[train_ind], y.iloc[val_ind]
        
        lgbm = LGBMClassifier(**para)
        
        early_stopping = lightgbm.early_stopping(25, first_metric_only=True, verbose=True)
        
        model =  lgbm.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], categorical_feature=None,
                          eval_metric = ['binary_logloss', 'binary_error'], verbose=100, callbacks=[early_stopping])
        
        pred_train = model.predict(xtrain, num_iteration=model.best_iteration_)
        pred_val = model.predict(xval, num_iteration=model.best_iteration_)
        lgbm_train_preds[val_ind]=pred_val
        score1 = accuracy_score(ytrain, pred_train)
        score2 = accuracy_score(yval, pred_val)
        print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))
        
        results = model.evals_result_
        df=pd.DataFrame({
                        "train_ll":results["training"]["binary_logloss"],
                        "validation_ll":results["valid_1"]["binary_logloss"],
                        "train_acc":results["training"]["binary_error"],
                        "test_acc":results["valid_1"]["binary_error"],
        })
        df['train_acc']=(1-df['train_acc'])*100.0
        df['test_acc']=(1-df['test_acc'])*100.0
#         print(df.head())
        
        fig, ax = plt.subplots(ncols=2, figsize=(12,6))
        sns.lineplot(data=df, x=df.index, y="train_ll", ax=ax[0], label="Train loss")
        sns.lineplot(data=df, x=df.index, y="validation_ll", ax=ax[0], label="Validation loss")
        sns.lineplot(data=df, x=df.index, y="train_acc", ax=ax[1], label="Train acc.")
        sns.lineplot(data=df, x=df.index, y="test_acc", ax=ax[1], label="Validation acc.")
        ax[0].set_title("Loss curve")
        ax[1].set_title("Accuracy curve")
        ax[0].set_ylabel("Loss")
        ax[0].set_xlabel("Itertation")
        ax[1].set_ylabel("Accuracy")
        ax[1].set_xlabel("Itertation")
        fig.suptitle("LightGBM Loss/Accuracy.")
        plt.show()
    acc=accuracy_score(y, lgbm_train_preds)
    print('OOF ACCURACY: {}'.format(acc))
    return acc

In [None]:
study=optuna.create_study(study_name="LightGBM set 1 Optimization", direction='maximize')
study.optimize(objective, n_trials=25)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

print("Best hyperparameters: {}".format(trial.params))

<h2 style="background-color:azure; text-align:center; font-size:200%">LightGBM set1 tuned model</h2>

In [None]:
para = {
        'verbosity': 1,
        'random_state': SEED,
        'n_jobs': -1,
        'is_unbalance': True,
        'bagging_seed': SEED,
        'feature_fraction_seed': SEED,
        'objective': 'binary',
        'boosting': 'gbdt', 
        'n_estimators': 12792, 
        'learning_rate': 0.0998109583959103, 
        'max_depth': 67, 
        'num_leaves': 81, 
        'reg_alpha': 8.95628715211493, 
        'reg_lambda': 2.5201711907717668, 
        'feature_fraction': 0.591553893731912, 
        'min_child_samples': 206, 
        'bagging_freq': 2628, 
        'bagging_fraction': 0.661876388933661, 
        'max_bin': 230
       }

In [None]:
lg_train_preds = np.zeros(len(y_train),)
lg_test_preds = np.zeros(len(y_test),)
lg_test = np.zeros(len(test1),)
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    
    early_stopping_round = lightgbm.early_stopping(25, first_metric_only=True, verbose=True)
    
    lgbm = LGBMClassifier(**para)
        
    model =  lgbm.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], categorical_feature=None,
                      eval_metric = ['binary_logloss', 'binary_error'], verbose=100, callbacks=[early_stopping_round])
    pred_train = model.predict_proba(xtrain, num_iteration=model.best_iteration_)[:,1]
    pred_val = model.predict_proba(xval, num_iteration=model.best_iteration_)[:,1]
    lg_train_preds[val_ind] = pred_val
    lg_test_preds += (model.predict_proba(x_test, num_iteration=model.best_iteration_)[:,1])/folds
    lg_test += (model.predict_proba(test1, num_iteration=model.best_iteration_)[:,1])/folds
    score1 = accuracy_score(ytrain, np.where(pred_train<=0.5, 0, 1))
    score2 = accuracy_score(yval, np.where(pred_val<=0.5, 0, 1))
    print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))

    results = model.evals_result_
    df=pd.DataFrame({
                    "train_ll":results["training"]["binary_logloss"],
                    "validation_ll":results["valid_1"]["binary_logloss"],
                    "train_acc":results["training"]["binary_error"],
                    "test_acc":results["valid_1"]["binary_error"],

    })
    df['train_acc']=(1-df['train_acc'])*100.0
    df['test_acc']=(1-df['test_acc'])*100.0
#         print(df.head())

    fig, ax = plt.subplots(ncols=2, figsize=(12,6))
    sns.lineplot(data=df, x=df.index, y="train_ll", ax=ax[0], label="Train loss")
    sns.lineplot(data=df, x=df.index, y="validation_ll", ax=ax[0], label="Validation loss")
    sns.lineplot(data=df, x=df.index, y="train_acc", ax=ax[1], label="Train acc.")
    sns.lineplot(data=df, x=df.index, y="test_acc", ax=ax[1], label="Validation acc.")
    ax[0].set_title("Loss curve")
    ax[1].set_title("Accuracy curve")
    ax[0].set_ylabel("Loss")
    ax[0].set_xlabel("Itertation")
    ax[1].set_ylabel("Accuracy")
    ax[1].set_xlabel("Itertation")
    fig.suptitle("LightGBM Loss/Accuracy.")
    plt.show()
    
acc1 = accuracy_score(y_train, np.where(lg_train_preds<=0.5, 0, 1))
acc2 = accuracy_score(y_test, np.where(lg_test_preds<=0.5, 0, 1))

print('OOF ACCURACY Train: {} Test: {}'.format(acc1, acc2))

score_train=pd.read_csv('../input/score-apr21/score_train.csv')
score_test=pd.read_csv('../input/score-apr21/score_test.csv')

score_train['lg']=np.concatenate([lg_train_preds,lg_test_preds])
score_test['lg']=lg_test

In [None]:
score_train.head()

In [None]:
score_test.head()

In [None]:
score_train.to_csv('./score_train.csv',index=False)
score_test.to_csv('./score_test.csv',index=False)

In [None]:
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
df=pd.DataFrame()
df['PassengerId']=test_['PassengerId'].values
df['Survived']=lg_test
df['Survived']=df['Survived'].apply(lambda x:0 if x<=0.5 else 1)
df.to_csv('./lg_tuned.csv',index=False)

<h2 style="background-color:azure; text-align:center; font-size:200%">LightGBM set2</h2>

In [None]:
ohe=OneHotEncoder()
col=['Sex','Embarked']
ohe.fit(train2[col])
print(ohe.get_feature_names(col))
df1=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(train2[col]).toarray())
df2=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(test2[col]).toarray())

train2=train2.join(df1)
test2=test2.join(df2)

train2['related_cat'] = train2['related_cat'].apply(lambda x:1 if x in ['low'] else 2)
test2['related_cat'] = test2['related_cat'].apply(lambda x:1 if x in ['low'] else 2)

train2.drop(columns=['Sex','Embarked'], inplace=True)
test2.drop(columns=['Sex','Embarked'], inplace=True)

In [None]:
train2.head()

In [None]:
test2.head()

In [None]:
folds=5
SEED=random.randint(937,8641)

kf=StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

features=train2.columns[1:]
X = train2[features]
y = train2['Survived']


imbalanced_ratio=(train2[train2['Survived']==0]['Survived'].count()/train2[train2['Survived']==1]['Survived'].count()).round(2)
print("Imbalnce ratio: {:}".format(imbalanced_ratio))

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED)
print("Distribution of train and test:", len(x_train), len(y_train), len(x_test), len(x_test))

In [None]:
Trial=0


def objective(trial, x=x_train, y=y_train):
    global Trial
    
    para = {
              'verbosity': 1,
              'random_state': SEED,
              'n_jobs': -1,
              'is_unbalance': True,
              'bagging_seed': SEED,
              'feature_fraction_seed': SEED,
              'objective': 'binary',
              'boosting': trial.suggest_categorical('boosting', ['gbdt','rf']),
              'n_estimators': trial.suggest_int('n_estimators',500, 20000),
              'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
              'max_depth': trial.suggest_int('max_depth', 6, 127),
              'num_leaves': trial.suggest_int('num_leaves', 31, 128),
              'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
              'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
              'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 0.9),
              'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
              'bagging_freq': trial.suggest_int('bagging_freq', 50, 15000),
              'bagging_fraction': trial.suggest_float('bagging_fraction', 0, 0.9),
              'max_bin': trial.suggest_int('max_bin', 128, 1024)
            }
    
    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial=Trial + 1
    lgbm_train_preds = np.zeros(len(y),)
    for fold, (train_ind, val_ind) in enumerate(kf.split(x, y)):
        print("--> Fold {}".format(fold + 1))
        xtrain, xval = x.iloc[train_ind], x.iloc[val_ind]
        ytrain, yval = y.iloc[train_ind], y.iloc[val_ind]
        lgbm = LGBMClassifier(**para)
        
        early_stopping_round = lightgbm.early_stopping(25, first_metric_only=True, verbose=True)
        model =  lgbm.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], categorical_feature=None,
                             eval_metric = ['binary_logloss', 'binary_error'], verbose=100, callbacks=[early_stopping_round])
        
        pred_train = model.predict(xtrain, num_iteration=model.best_iteration_)
        pred_val = model.predict(xval, num_iteration=model.best_iteration_)
        lgbm_train_preds[val_ind]=pred_val
        score1 = accuracy_score(ytrain, pred_train)
        score2 = accuracy_score(yval, pred_val)
        print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))
        
        results = model.evals_result_
        df=pd.DataFrame({
                        "train_ll":results["training"]["binary_logloss"],
                        "validation_ll":results["valid_1"]["binary_logloss"],
                        "train_acc":results["training"]["binary_error"],
                        "test_acc":results["valid_1"]["binary_error"],
                        
        })
        df['train_acc']=(1-df['train_acc'])*100.0
        df['test_acc']=(1-df['test_acc'])*100.0
#         print(df.head())
        
        fig, ax = plt.subplots(ncols=2, figsize=(12,6))
        sns.lineplot(data=df, x=df.index, y="train_ll", ax=ax[0], label="Train loss")
        sns.lineplot(data=df, x=df.index, y="validation_ll", ax=ax[0], label="Validation loss")
        sns.lineplot(data=df, x=df.index, y="train_acc", ax=ax[1], label="Train acc.")
        sns.lineplot(data=df, x=df.index, y="test_acc", ax=ax[1], label="Validation acc.")
        ax[0].set_title("Loss curve")
        ax[1].set_title("Accuracy curve")
        ax[0].set_ylabel("Loss")
        ax[0].set_xlabel("Itertation")
        ax[1].set_ylabel("Accuracy")
        ax[1].set_xlabel("Itertation")
        fig.suptitle("LightGBM Loss/Accuracy.")
        plt.show()
    acc=accuracy_score(y, lgbm_train_preds)
    print('OOF ACCURACY: {}'.format(acc))
    return acc

In [None]:
study=optuna.create_study(study_name="LightGBM set 2 Optimization", direction='maximize')
study.optimize(objective, n_trials=25)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

print("Best hyperparameters: {}".format(trial.params))

<h2 style="background-color:azure; text-align:center; font-size:200%">LightGBM set2 tuned model</h2>

In [None]:
para = {
        'verbosity': 1,
        'random_state': SEED,
        'n_jobs': -1,
        'is_unbalance': True,
        'bagging_seed': SEED,
        'feature_fraction_seed': SEED,
        'objective': 'binary',
        'boosting': 'gbdt', 
        'n_estimators': 10701, 
        'learning_rate': 0.06955447670117614, 
        'max_depth': 65, 'num_leaves': 110, 
        'reg_alpha': 8.557750007560998, 
        'reg_lambda': 0.016304042294640997, 
        'feature_fraction': 0.2779304475599476, 
        'min_child_samples': 233, 
        'bagging_freq': 14930, 
        'bagging_fraction': 0.8838910149186466, 
        'max_bin': 370
       }

In [None]:
lg_train_preds = np.zeros(len(y_train),)
lg_test_preds = np.zeros(len(y_test),)
lg_test = np.zeros(len(test2),)
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    
    early_stopping_round = lightgbm.early_stopping(25, first_metric_only=True, verbose=True)
    
    lgbm = LGBMClassifier(**para)
        
    model =  lgbm.fit(xtrain, ytrain, eval_set=[(xtrain,ytrain), (xval,yval)], categorical_feature=None,
                      eval_metric = ['binary_logloss', 'binary_error'], verbose=100, callbacks=[early_stopping_round])
    pred_train = model.predict_proba(xtrain, num_iteration=model.best_iteration_)[:,1]
    pred_val = model.predict_proba(xval, num_iteration=model.best_iteration_)[:,1]
    lg_train_preds[val_ind] = pred_val
    lg_test_preds += (model.predict_proba(x_test, num_iteration=model.best_iteration_)[:,1])/folds
    lg_test += (model.predict_proba(test2, num_iteration=model.best_iteration_)[:,1])/folds
    score1 = accuracy_score(ytrain, np.where(pred_train<=0.5, 0, 1))
    score2 = accuracy_score(yval, np.where(pred_val<=0.5, 0, 1))
    print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))

    results = model.evals_result_
    df=pd.DataFrame({
                    "train_ll":results["training"]["binary_logloss"],
                    "validation_ll":results["valid_1"]["binary_logloss"],
                    "train_acc":results["training"]["binary_error"],
                    "test_acc":results["valid_1"]["binary_error"],

    })
    df['train_acc']=(1-df['train_acc'])*100.0
    df['test_acc']=(1-df['test_acc'])*100.0
#         print(df.head())

    fig, ax = plt.subplots(ncols=2, figsize=(12,6))
    sns.lineplot(data=df, x=df.index, y="train_ll", ax=ax[0], label="Train loss")
    sns.lineplot(data=df, x=df.index, y="validation_ll", ax=ax[0], label="Validation loss")
    sns.lineplot(data=df, x=df.index, y="train_acc", ax=ax[1], label="Train acc.")
    sns.lineplot(data=df, x=df.index, y="test_acc", ax=ax[1], label="Validation acc.")
    ax[0].set_title("Loss curve")
    ax[1].set_title("Accuracy curve")
    ax[0].set_ylabel("Loss")
    ax[0].set_xlabel("Itertation")
    ax[1].set_ylabel("Accuracy")
    ax[1].set_xlabel("Itertation")
    fig.suptitle("LightGBM Loss/Accuracy.")
    plt.show()
    
acc1 = accuracy_score(y_train, np.where(lg_train_preds<=0.5, 0, 1))
acc2 = accuracy_score(y_test, np.where(lg_test_preds<=0.5, 0, 1))

print('OOF ACCURACY Train: {} Test: {}'.format(acc1, acc2))

# score_train=pd.read_csv('../input/score-apr21/score_train.csv')
# score_test=pd.read_csv('../input/score-apr21/score_test.csv')

# score_train['lg']=np.concatenate([lg_train_preds,lg_test_preds])
# score_test['lg']=lg_test

In [None]:
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
df=pd.DataFrame()
df['PassengerId']=test_['PassengerId'].values
df['Survived']=lg_test
df['Survived']=df['Survived'].apply(lambda x:0 if x<=0.5 else 1)
df.to_csv('./lg_tuned_set2.csv',index=False)

<span style="background-color:orange; font-size:150%">**Observation**</span>
* For LightGBM train set 1 gave better results in public leaderboard compared to train set 2.

<h2 style="background-color:azure; text-align:center; font-size:200%">Random Forest</h2>

In [2]:
train1=pd.read_csv('../input/training-apr/train.csv')
test1=pd.read_csv('../input/training-apr/test.csv')

train2=train1.copy()
test2=test1.copy()

train1.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_cat'], inplace=True)
test1.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_cat'], inplace=True)

train2.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_log'], inplace=True)
test2.drop(columns=['PassengerId','Name','Cabin','Ticket','SibSp','Parch','Age_log','Fare','Fare_cat','SibSp_log','Parch_log','related','related_log'], inplace=True)

print("set 1:", train1.columns, test1.columns)
print("set 2:", train2.columns, test2.columns)

set 1: Index(['Survived', 'Pclass', 'Sex', 'Age', 'Embarked', 'Fare_log',
       'related_log'],
      dtype='object') Index(['Pclass', 'Sex', 'Age', 'Embarked', 'Fare_log', 'related_log'], dtype='object')
set 2: Index(['Survived', 'Pclass', 'Sex', 'Age', 'Embarked', 'Fare_log',
       'related_cat'],
      dtype='object') Index(['Pclass', 'Sex', 'Age', 'Embarked', 'Fare_log', 'related_cat'], dtype='object')


<h2 style="background-color:azure; text-align:center; font-size:200%">Random Forest set1</h2>

In [3]:
ohe=OneHotEncoder()
col=['Sex','Embarked']
ohe.fit(train1[col])
print(ohe.get_feature_names(col))
df1=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(train1[col]).toarray())
df2=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(test1[col]).toarray())

train1=train1.join(df1)
test1=test1.join(df2)

train1.drop(columns=['Sex','Embarked'], inplace=True)
test1.drop(columns=['Sex','Embarked'], inplace=True)

['Sex_female' 'Sex_male' 'Embarked_C' 'Embarked_Q' 'Embarked_S'
 'Embarked_X']


In [4]:
train1.head()

Unnamed: 0,Survived,Pclass,Age,Fare_log,related_log,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_X
0,1,3,36.0,3.337192,1.098612,0.0,1.0,0.0,0.0,1.0,0.0
1,0,1,36.0,2.66375,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0,1,0.33,4.280686,1.386294,0.0,1.0,0.0,0.0,1.0,0.0
3,0,1,19.0,2.64191,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1,1,25.0,2.170196,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [5]:
test1.head()

Unnamed: 0,Pclass,Age,Fare_log,related_log,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_X
0,1,19.0,4.159039,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,53.0,1.918392,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,3,19.0,3.686627,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,2,25.0,2.634045,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,3,17.0,3.328268,1.098612,1.0,0.0,1.0,0.0,0.0,0.0


In [6]:
folds=5
SEED=random.randint(937,8641)

kf=StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

features=train1.columns[1:]
X = train1[features]
y = train1['Survived']


imbalanced_ratio=(train1[train1['Survived']==0]['Survived'].count()/train1[train1['Survived']==1]['Survived'].count()).round(2)
print("Imbalnce ratio: {:}".format(imbalanced_ratio))
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED)

print("Distribution of train and test:", len(x_train), len(y_train), len(x_test), len(x_test))

Imbalnce ratio: 1.34
Distribution of train and test: 80000 80000 20000 20000


In [None]:
Trial=0
def objective(trial, x=x_train, y=y_train):
    global Trial
    
    para={
        'bootstrap': True,
        'n_jobs': -1,
        'verbose': 0,
        'random_state': SEED,
        'criterion': 'entropy',
        'n_estimators': trial.suggest_int('n_estimators',10, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 2000),
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-4, 1e-1),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-4, 1e-1),
        'max_features': trial.suggest_categorical("max_features", ['sqrt', 'log2']),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 500, 100000),
        'warm_start': trial.suggest_categorical('warm_start', [True, False]),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.2),
        'class_weight':trial.suggest_categorical("class_weight", ['balanced', 'balanced_subsample']),
        'max_samples': trial.suggest_float('max_samples', 0.5, 0.8)
    }
    
    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial=Trial + 1
    rf_train_preds = np.zeros(len(y),)
    for fold, (train_ind, val_ind) in enumerate(kf.split(x, y)):
        print("--> Fold {}".format(fold + 1))
        xtrain, xval = x.iloc[train_ind], x.iloc[val_ind]
        ytrain, yval = y.iloc[train_ind], y.iloc[val_ind]
        
        rf = RandomForestClassifier(**para)

        model =  rf.fit(xtrain, ytrain)
        
        pred_train = model.predict(xtrain)
        pred_val = model.predict(xval)
        rf_train_preds[val_ind]=pred_val
        score1 = accuracy_score(ytrain, pred_train)
        score2 = accuracy_score(yval, pred_val)
        print('Fold {} AUC Train: {} Validation: {}'.format(fold+1, score1, score2))
    
    acc=accuracy_score(y, rf_train_preds)
    print('OOF ACCURACY: {}'.format(acc))
    return acc

In [None]:
study=optuna.create_study(study_name="Random Forest set 1 Optimization", direction='maximize')
study.optimize(objective, n_trials=25)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

print("Best hyperparameters: {}".format(trial.params))

In [None]:
Trial=0
def objective(trial, x=x_train, y=y_train):
    global Trial
    
    para={
        'bootstrap': True,
        'n_jobs': -1,
        'verbose': 0,
        'random_state': SEED,
        'criterion': 'entropy',
        'n_estimators': 330, 
        'max_depth': 1417, 
        'min_samples_split': 0.010377003772355852, 
        'min_samples_leaf': 0.01865186498148891, 
        'max_features': 'log2', 
        'max_leaf_nodes': 87127, 
        'warm_start': False, 
        'min_impurity_decrease': 0.01122129357981094, 
        'class_weight': 'balanced_subsample', 
        'max_samples': 0.5985586520207642,
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 2e-1)
    }
    
    print("--------------------> Trial {} <--------------------".format(Trial))
    Trial=Trial + 1
    rf_train_preds = np.zeros(len(y),)
    for fold, (train_ind, val_ind) in enumerate(kf.split(x, y)):
        print("--> Fold {}".format(fold + 1))
        xtrain, xval = x.iloc[train_ind], x.iloc[val_ind]
        ytrain, yval = y.iloc[train_ind], y.iloc[val_ind]
        
        rf = RandomForestClassifier(**para)

        model =  rf.fit(xtrain, ytrain)
        
        pred_train = model.predict(xtrain)
        pred_val = model.predict(xval)
        rf_train_preds[val_ind]=pred_val
        score1 = accuracy_score(ytrain, pred_train)
        score2 = accuracy_score(yval, pred_val)
        print('Fold {} AUC Train: {} Validation: {}'.format(fold+1, score1, score2))
    
    acc=accuracy_score(y, rf_train_preds)
    print('OOF ACCURACY: {}'.format(acc))
    return acc

In [None]:
study=optuna.create_study(study_name="Random Forest set 1 Pruning", direction='maximize')
study.optimize(objective, n_trials=25)

trial = study.best_trial
print('Accuracy: {}'.format(trial.value))

print("Best hyperparameters: {}".format(trial.params))

<h2 style="background-color:azure; text-align:center; font-size:200%">Random Forest set1 tuned model</h2>

In [17]:
para = {
        'bootstrap': True,
        'n_jobs': -1,
        'verbose': 0,
        'random_state': SEED,
        'criterion': 'entropy',
        'n_estimators': 330, 
        'max_depth': 1417, 
        'min_samples_split': 0.010377003772355852, 
        'min_samples_leaf': 0.01865186498148891, 
        'max_features': 'log2', 
        'max_leaf_nodes': 87127, 
        'warm_start': False, 
        'min_impurity_decrease': 0.01122129357981094, 
        'class_weight': 'balanced_subsample', 
        'max_samples': 0.5985586520207642,
        'ccp_alpha': 0.018500631807288694
       }

In [18]:
rf_train_preds = np.zeros(len(y_train),)
rf_test_preds = np.zeros(len(y_test),)
rf_test = np.zeros(len(test1),)
for fold, (train_ind, val_ind) in enumerate(kf.split(x_train, y_train)):
    print("--> Fold {}".format(fold + 1))
    
    xtrain, xval = x_train.iloc[train_ind], x_train.iloc[val_ind]
    ytrain, yval = y_train.iloc[train_ind], y_train.iloc[val_ind]
    
    rf = RandomForestClassifier(**para)

    model =  rf.fit(xtrain, ytrain)
    
    pred_train = model.predict_proba(xtrain)[:,1]
    pred_val = model.predict_proba(xval)[:,1]
    rf_train_preds[val_ind] = pred_val
    rf_test_preds += (model.predict_proba(x_test)[:,1])/folds
    rf_test += (model.predict_proba(test1)[:,1])/folds
    score1 = accuracy_score(ytrain, np.where(pred_train<=0.5, 0, 1))
    score2 = accuracy_score(yval, np.where(pred_val<=0.5, 0, 1))
    print('Fold {} ACCURACY Train: {} Validation: {}'.format(fold+1, score1, score2))
    
acc1 = accuracy_score(y_train, np.where(rf_train_preds<=0.5, 0, 1))
acc2 = accuracy_score(y_test, np.where(rf_test_preds<=0.5, 0, 1))

print('OOF ACCURACY Train: {} Test: {}'.format(acc1, acc2))

score_train=pd.read_csv('../input/score-tab-apr21/score_train.csv')
score_test=pd.read_csv('../input/score-tab-apr21/score_test.csv')

score_train['rf']=np.concatenate([rf_train_preds,rf_test_preds])
score_test['rf']=rf_test

--> Fold 1
Fold 1 ACCURACY Train: 0.764296875 Validation: 0.763375
--> Fold 2
Fold 2 ACCURACY Train: 0.763375 Validation: 0.7670625
--> Fold 3
Fold 3 ACCURACY Train: 0.76271875 Validation: 0.7696875
--> Fold 4
Fold 4 ACCURACY Train: 0.764640625 Validation: 0.762
--> Fold 5
Fold 5 ACCURACY Train: 0.76553125 Validation: 0.7584375
OOF ACCURACY Train: 0.7641125 Test: 0.7712


In [12]:
score_train.head()

Unnamed: 0,xg,rf,lg,cb
0,0.893169,0.228584,0.223467,
1,0.760047,0.780058,0.307224,
2,0.787606,0.206423,0.67333,
3,0.55569,0.762223,0.142897,
4,0.294982,0.784771,0.365371,


In [13]:
score_test.head()

Unnamed: 0,xg,rf,lg,cb
0,0.16531,0.237214,0.127184,
1,0.607289,0.667611,0.601769,
2,0.935854,0.781036,0.930494,
3,0.2561,0.282167,0.228358,
4,0.813994,0.781036,0.801692,


In [14]:
score_train.to_csv('./score_train.csv',index=False)
score_test.to_csv('./score_test.csv',index=False)

In [19]:
test_=pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
df=pd.DataFrame()
df['PassengerId']=test_['PassengerId'].values
df['Survived']=rf_test
df['Survived']=df['Survived'].apply(lambda x:0 if x<=0.5 else 1)
df.to_csv('./rf_tuned.csv',index=False)

<h2 style="background-color:azure; text-align:center; font-size:200%">Random Forest set2</h2>

In [None]:
ohe=OneHotEncoder()
col=['Sex','Embarked']
ohe.fit(train2[col])
print(ohe.get_feature_names(col))
df1=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(train2[col]).toarray())
df2=pd.DataFrame(columns=ohe.get_feature_names(col),data=ohe.transform(test2[col]).toarray())

train2=train2.join(df1)
test2=test2.join(df2)

train2['related_cat'] = train2['related_cat'].apply(lambda x:1 if x in ['low'] else 2)
test2['related_cat'] = test2['related_cat'].apply(lambda x:1 if x in ['low'] else 2)

train2.drop(columns=['Sex','Embarked'], inplace=True)
test2.drop(columns=['Sex','Embarked'], inplace=True)

In [None]:
train2.head()

In [None]:
test2.head()

In [None]:
folds=5
SEED=random.randint(937,8641)

kf=StratifiedKFold(n_splits=folds, random_state=SEED, shuffle=True)

features=train2.columns[1:]
X = train2[features]
y = train2['Survived']


imbalanced_ratio=(train2[train2['Survived']==0]['Survived'].count()/train2[train2['Survived']==1]['Survived'].count()).round(2)
print("Imbalnce ratio: {:}".format(imbalanced_ratio))

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED)
print("Distribution of train and test:", len(x_train), len(y_train), len(x_test), len(x_test))