In [1]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Keep the label as its own Series, then remove it (and the `id` column)
# from the frame that is used as model input.
target = train.loc[:, 'target']
train = train.drop(columns=['id', 'target'])

In [4]:
from sklearn.preprocessing import LabelEncoder
# Names of the categorical feature columns: 'cat0' ... 'cat18'.
cats = [f'cat{i}' for i in range(19)]

for cat in cats:
    # Fit the encoder on the training column only.
    le = LabelEncoder()
    le.fit(train[cat])
    # Replace test values that never occur in train with the sentinel -1.
    # `isin` is a vectorised membership test; the previous per-row lambda
    # scanned the `le.classes_` array once for every test row.
    test[cat] = test[cat].where(test[cat].isin(le.classes_), -1)
    # Register the sentinel as one extra class at the end, so unseen test
    # categories all receive the code len(<train classes>).
    # NOTE(review): assumes the columns hold strings, so the integer -1
    # cannot collide with a real category -- confirm against the data.
    le.classes_ = np.append(le.classes_, -1)
    train[cat] = le.transform(train[cat])
    test[cat] = le.transform(test[cat])

In [5]:
# Load the submission template and overwrite its `id` column with the test
# ids (aligned on the row index); afterwards `id` is removed from the test
# frame so only feature columns remain.
sub = pd.read_csv('sample_submission.csv').assign(id=test['id'])
test = test.drop(columns='id')

In [None]:
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
# Install a global "ignore" filter: every Python warning raised after this
# line is suppressed, including deprecation notices from the libraries used
# below.
warnings.filterwarnings("ignore")

def objective(trial, data=train, target=target):
    """Optuna objective: hold-out ROC AUC of a HistGradientBoostingClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `max_leaf_nodes`, `max_depth`,
        `min_samples_leaf` and `learning_rate`.
    data : DataFrame
        Feature table. Defaults to the notebook-level `train` object that
        exists when this function is defined (default arguments are bound
        at definition time).
    target : Series
        Labels aligned with `data`. Defaults to the notebook-level `target`.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on a stratified 25%
        hold-out split (fixed `random_state`, so every trial sees the same
        split).
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.25, stratify=target, shuffle=True,
        random_state=2021)

    param = {
        'max_iter': 2000,
        # The estimator rejects max_leaf_nodes < 2, so the search range
        # starts at 2 (a lower bound of 1 could make a trial raise).
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 300),
        'max_depth': trial.suggest_categorical('max_depth', [3, 9, 30, 80, 100, 200, 300]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1E-2, 1.0),
        'verbose': 0,
        'random_state': 2021,
        # `early_stopping` accepts 'auto' or a bool. The previous value 50
        # was only ever usable as a truthy flag, and is rejected by
        # scikit-learn releases that validate parameters.
        # NOTE(review): if 50 was meant as the patience, set
        # 'n_iter_no_change': 50 as well -- the default patience is 10.
        'early_stopping': True,
        # 'categorical_features' : np.array([0, 1, 2,3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18])
    }

    model = HistGradientBoostingClassifier(**param)
    model.fit(train_x, train_y)

    # Column 1 of predict_proba is the probability of the positive class.
    preds = model.predict_proba(valid_x)[:, 1]
    return roc_auc_score(valid_y, preds)

# Maximise the hold-out ROC AUC returned by `objective` over 50 trials.
# The sampler is seeded so that re-running the cell proposes the same
# sequence of hyper-parameters; without a seed each run explores a different
# sequence and the reported best trial cannot be reproduced.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=2021),
)
study.optimize(objective, n_trials=50)
print('Best trial:', study.best_params)

[32m[I 2021-03-04 23:20:25,311][0m A new study created in memory with name: no-name-888e18f0-cee7-49ca-8cd6-3100ea0c2a43[0m
[32m[I 2021-03-04 23:21:13,508][0m Trial 0 finished with value: 0.8919592439864159 and parameters: {'max_leaf_nodes': 287, 'max_depth': 100, 'min_samples_leaf': 20, 'learning_rate': 0.04403914176436581}. Best is trial 0 with value: 0.8919592439864159.[0m
[32m[I 2021-03-04 23:21:19,452][0m Trial 1 finished with value: 0.8888102582154764 and parameters: {'max_leaf_nodes': 117, 'max_depth': 9, 'min_samples_leaf': 10, 'learning_rate': 0.23750329830390532}. Best is trial 0 with value: 0.8919592439864159.[0m
[32m[I 2021-03-04 23:23:12,030][0m Trial 2 finished with value: 0.8901752139355632 and parameters: {'max_leaf_nodes': 36, 'max_depth': 100, 'min_samples_leaf': 8, 'learning_rate': 0.00528803458935933}. Best is trial 0 with value: 0.8919592439864159.[0m
[32m[I 2021-03-04 23:24:11,792][0m Trial 3 finished with value: 0.89196000088655 and parameters: {'ma

[32m[I 2021-03-05 00:40:21,734][0m Trial 32 finished with value: 0.892403860272857 and parameters: {'max_leaf_nodes': 88, 'max_depth': 100, 'min_samples_leaf': 30, 'learning_rate': 0.013328529360089575}. Best is trial 21 with value: 0.892728806904664.[0m
[32m[I 2021-03-05 00:44:12,109][0m Trial 33 finished with value: 0.8923470854585674 and parameters: {'max_leaf_nodes': 208, 'max_depth': 100, 'min_samples_leaf': 25, 'learning_rate': 0.009976878348901284}. Best is trial 21 with value: 0.892728806904664.[0m
[32m[I 2021-03-05 00:46:21,870][0m Trial 34 finished with value: 0.8924630062206189 and parameters: {'max_leaf_nodes': 162, 'max_depth': 100, 'min_samples_leaf': 28, 'learning_rate': 0.016568017546797348}. Best is trial 21 with value: 0.892728806904664.[0m
[32m[I 2021-03-05 00:52:38,662][0m Trial 35 finished with value: 0.8910622351548567 and parameters: {'max_leaf_nodes': 133, 'max_depth': 100, 'min_samples_leaf': 22, 'learning_rate': 0.0030791972052619005}. Best is trial

In [None]:
# Best Optuna trial (trial 21): hold-out ROC AUC 0.892728806904664 with parameters {'max_leaf_nodes': 222, 'max_depth': 200, 'min_samples_leaf': 23, 'learning_rate': 0.01904032560385671}; these values are hard-coded in the cross-validation cell below.

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# 10-fold stratified cross-validation of the tuned HistGradientBoostingClassifier.
num_folds = 10
train_oof_hgbc = np.zeros(train.shape[0])   # out-of-fold positive-class probability per training row
test_preds_hgbc = 0                         # running fold-average of the test probabilities
auc_hgbc = []                               # validation ROC AUC of each fold

# Hyper-parameters are identical for every fold, so the dict is built once
# instead of being recreated inside the loop.
param = {
    'max_iter': 2000,
    'verbose': 0,
    'random_state': 2021,
    # `early_stopping` accepts 'auto' or a bool. The previous value 50 was
    # only ever usable as a truthy flag, and is rejected by scikit-learn
    # releases that validate parameters.
    # NOTE(review): if 50 was meant as the patience, set
    # 'n_iter_no_change': 50 as well -- the default patience is 10.
    'early_stopping': True,
    'max_leaf_nodes': 222,
    'max_depth': 200,
    'min_samples_leaf': 23,
    'learning_rate': 0.01904032560385671,
}

kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2021)

for f, (train_idx, val_idx) in enumerate(kf.split(train, target)):
    train_df, val_df = train.iloc[train_idx], train.iloc[val_idx]
    train_t, val_t = target.iloc[train_idx], target.iloc[val_idx]

    model = HistGradientBoostingClassifier(**param)
    model.fit(train_df, train_t)

    # Column 1 of predict_proba is the probability of the positive class.
    temp_oof = model.predict_proba(val_df)[:, 1]
    temp_test = model.predict_proba(test)[:, 1]

    train_oof_hgbc[val_idx] = temp_oof
    test_preds_hgbc += temp_test / num_folds
    # Score the fold with the validation predictions already computed above
    # rather than calling predict_proba on the same rows a second time.
    auc_hgbc.append(roc_auc_score(val_t, temp_oof))


In [8]:
# Mean of the per-fold validation ROC AUC scores collected in `auc_hgbc`.
np.mean(auc_hgbc)

0.891835760714822

In [9]:
# ROC AUC of the pooled out-of-fold predictions against the full training target.
roc_auc_score(target, train_oof_hgbc)

0.8918073775842985