In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
cont_features = [
    "cont0", "cont1", "cont2", "cont3", "cont4", "cont5", "cont6", "cont7",
    "cont8", "cont9", "cont10",
]
cat_features = [
    "cat0", "cat1", "cat2", "cat3", "cat4", "cat5", "cat6", "cat7",
    "cat8", "cat9", "cat10", "cat11", "cat12", "cat13", "cat14", "cat15",
    "cat16", "cat17", "cat18"
]
target = train["target"]

In [4]:
from category_encoders import CatBoostEncoder, LeaveOneOutEncoder

xgb_cat_features = []
loo_features = []

def loo_encode(train, test, column):
    loo = LeaveOneOutEncoder()
    new_feature = "{}_loo".format(column)
    loo.fit(train[column], train['target']) 
    train[new_feature] = loo.transform(train[column])
    test[new_feature] = loo.transform(test[column])
    return new_feature

for feature in cat_features:
    loo_features.append(loo_encode(train, test, feature))
    
xgb_cat_features.extend(loo_features)
xgb_features = xgb_cat_features + cont_features



In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

def objective(trial, data=train[xgb_features], target=target):

    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25, stratify = target, shuffle = True, random_state=2021)
    param = {
        'n_estimators': 2000,
       # 'num_leaves': trial.suggest_int('num_leaves', 2, 500),
        'max_depth': trial.suggest_categorical('max_depth', [5,20, 30,50]),
        'alpha': trial.suggest_float('reg_alpha', 1E-5, 100),
        'lambda': trial.suggest_float('reg_lambda', 1E-5, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1E-3, 1.0),
        #'subsample': trial.suggest_float('subsample ', 1E-16, 0.9),
        #'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100),  
        'subsample': trial.suggest_categorical('subsample', [0.2,0.3,0.4,0.5,0.6,0.7,0.8,1.0]),
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1E-3, 10),
        #'subsample_freq': trial.suggest_int('subsample_freq', 0, 20),
        'eval_metric': 'auc', 
        'random_state': 2021,
        'learning_rate': trial.suggest_loguniform('learning_rate', 8E-3, 1.0), 
       # 'enable_categorical' : True
    }

    model = XGBClassifier(**param)  
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=50,verbose=False)

    preds = model.predict_proba(test_x)[:, 1]    
    roc = roc_auc_score(test_y, preds)
    
    return roc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print('Best trial:', study.best_params)

[32m[I 2021-03-06 14:25:42,564][0m A new study created in memory with name: no-name-71f2d5ee-7f5f-4260-936e-5ae5c45bb707[0m
[32m[I 2021-03-06 14:35:48,526][0m Trial 0 finished with value: 0.8925831209763573 and parameters: {'max_depth': 20, 'reg_alpha': 86.37140270350748, 'reg_lambda': 12.231624641752926, 'colsample_bytree': 0.918324972729146, 'subsample': 0.3, 'min_child_weight': 2.162685667872237, 'learning_rate': 0.03919517116001566}. Best is trial 0 with value: 0.8925831209763573.[0m
[32m[I 2021-03-06 14:42:18,038][0m Trial 1 finished with value: 0.8945380954160269 and parameters: {'max_depth': 20, 'reg_alpha': 27.452878774631323, 'reg_lambda': 46.899148187099314, 'colsample_bytree': 0.128403157794986, 'subsample': 0.5, 'min_child_weight': 0.009489090853903346, 'learning_rate': 0.030875684897131484}. Best is trial 1 with value: 0.8945380954160269.[0m
[32m[I 2021-03-06 14:46:06,153][0m Trial 2 finished with value: 0.8924388767191145 and parameters: {'max_depth': 5, 'reg_a

[32m[I 2021-03-06 17:41:17,526][0m Trial 22 finished with value: 0.8969294465217985 and parameters: {'max_depth': 30, 'reg_alpha': 10.38984401059398, 'reg_lambda': 62.44953086098899, 'colsample_bytree': 0.3731988268527646, 'subsample': 0.7, 'min_child_weight': 0.02299582594147899, 'learning_rate': 0.00955531402498414}. Best is trial 11 with value: 0.8972392807157024.[0m
[32m[I 2021-03-06 17:49:55,366][0m Trial 23 finished with value: 0.8961181591412662 and parameters: {'max_depth': 30, 'reg_alpha': 0.4196578614694131, 'reg_lambda': 76.60952165831857, 'colsample_bytree': 0.520917154518605, 'subsample': 0.8, 'min_child_weight': 0.0010689562345365913, 'learning_rate': 0.05625168095208454}. Best is trial 11 with value: 0.8972392807157024.[0m
[32m[I 2021-03-06 18:10:28,055][0m Trial 24 finished with value: 0.896920653606548 and parameters: {'max_depth': 30, 'reg_alpha': 20.297876795224468, 'reg_lambda': 78.14286391653151, 'colsample_bytree': 0.37230131039416736, 'subsample': 0.7, 'm