In [1]:
from models import *
from bayes_opt import BayesianOptimization

from sklearn import clone
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgbm
import catboost as cboost

from functools import partial
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
def evaluate_predictions(preds, best_file):
    return roc_auc_score(pd.read_csv(best_file)["risk_flag"].values, preds)

def eval_on_original(model, targets="risk_flag", data_params={}, fit_params={}, par_mod=None, n=1, **params):
    model = clone(model)
    
    if par_mod:
        params = par_mod(params)
    model.set_params(**params)
    
    df_train, df_test = data_preprocess(**data_params)
    
    res = []
    for _ in range(n):
        X, y = df_train.drop(["id", targets], axis=1), df_train[targets].values
        X, y = AllKNN(n_jobs=-1, n_neighbors=20).fit_resample(X, y)
        model.fit(X, y, **fit_params)
    
        preds = model.predict(df_test.drop("id", axis=1))
        res.append(evaluate_predictions(preds, "./BEST.csv"))
        
    res = np.array(res)
    return res.mean()


def tuning_w_original(model, pbounds, targets="risk_flag", data_params={}, fit_params={}, par_mod=None, **opt_kwargs):
    eval_fn = partial(eval_on_original, model, targets, data_params, fit_params, par_mod)
    BO = BayesianOptimization(eval_fn, pbounds)
    BO.maximize(**opt_kwargs)
    return BO

def tuning_super_learner(model, pbounds, fit_params={}, par_mod=None, n=5, **opt_kwargs):
    df_train = pd.read_csv("./ensemble_files/ensemble_train.csv")
    df_test = pd.read_csv("./ensemble_files/ensemble_test.csv")

    def eval_fn(model, df_train, df_test, fit_params, par_mod, **params):
        model = clone(model)
        params = par_mod(params)
        model.set_params(**params)

        res = []
        for _ in range(n):
            model.fit(df_train.drop(["id", "targets"], axis=1), df_train["targets"].values, **fit_params)
            preds = model.predict(df_test.drop("id", axis=1))
            res.append(evaluate_predictions(preds, "./BEST.csv"))
        
        
    
    eval_fn = partial(eval_fn, model, df_train, df_test, fit_params, par_mod)
    superBO = BayesianOptimization(eval_fn, pbounds)
    superBO.maximize(**opt_kwargs)
    return superBO

# Tuning on Original Dataset
---

## XGBoost

In [None]:
def xgb_tuning():
    model = XGBClassifier(
        tree_method="gpu_hist", 
        n_estimators=200,
        eval_metric="logloss",
        learning_rate=0.1,
        max_depth=23,
        min_child_weight=3.481
    )

    pbounds = {
        "gamma": (0, 1),
        "reg_alpha": (-5, 1)
    }
    
    def par_mod(params):
        for par in ["max_depth"]:
            if par in params:
                params[par] = int(round(params[par]))
        for par in ["reg_alpha", "learning_rate"]:
            if par in params:
                params[par] = 10 ** params[par]
        return params
    
    data_params = {"dtrain": pd.read_csv("./train.csv"), "dtest": pd.read_csv("./test.csv")}
    return tuning_w_original(model, pbounds, par_mod=par_mod, data_params=data_params, init_points=20, n_iter=40)

xgbBO = xgb_tuning()

In [None]:
xgbBO.max["params"]

# LightGBM

In [None]:
def lgbm_tuning(model=None, pbounds=None, init_params=None):
    if model is None:
        model = LGBMClassifier(
            learning_rate=0.1,
            n_estimators=200,
            device="gpu",
            num_leaves=256,
            boosting_type="goss",
            max_depth=23,
            max_bin=64,
            min_child_weight=1.899,
            verbose=-1
        )
    
    def par_mod(params):
        for par in ["max_depth", "max_bin", "num_leaves", "bagging_freq"]:
            if par in params:
                params[par] = int(round(params[par]))
        for par in ["lambda_l1", "lambda_l2", "learning_rate"]:
            if par in params:
                params[par] = 10 ** params[par]
        return params
    
    if init_params is not None:
        model.set_params(**par_mod(init_params))

    if pbounds is None:
        pbounds = {
            "lambda_l1": (-5, 1),
            "lambda_l2": (-5, 1)
        }
    
    data_params = {"dtrain": pd.read_csv("./train.csv"), "dtest": pd.read_csv("./test.csv")}
    return tuning_w_original(model, pbounds, par_mod=par_mod, data_params=data_params, init_points=20, n_iter=40)

lgbBO = lgbm_tuning()

## CatBoost

In [None]:
def catboost_tuning():
    cat_features = ["house_ownership", "car_ownership", "married", "city", "profession", "state", "age", "experience", "income"]
    model = CatBoostClassifier(
        learning_rate=0.1,
        n_estimators=200,
        loss_function="Logloss",
        boosting_type="Ordered",
        eval_metric="AUC",
        cat_features=cat_features,
        task_type="GPU",
        max_depth=5,
        auto_class_weights="Balanced",
        verbose=False
    )
    
    def par_mod(params):
        for par in ["max_depth", "max_bin"]:
            if par in params:
                params[par] = int(round(params[par]))
        for par in ["l2_leaf_reg"]:
            if par in params:
                params[par] = 10 ** params[par]
        return params
   
    pbounds = {
#         "random_strength": (0, 20),
#         "subsample": (0.1, 1),
#         "scale_pos_weight": (5, 20),
#         "l2_leaf_reg": (-5, 1),
#         "bagging_temperature": (0.1, 1)
    }
    
    
    data_params = {"drop_cols": {}, "transforms": [(catboost_dataset, {"cols": cat_features})], "no_encode": True}
    return tuning_w_original(model, pbounds, par_mod=par_mod, data_params=data_params, init_points=20, n_iter=40)
cboostBO = catboost_tuning()

In [None]:
# from imblearn.ensemble import BalancedRandomForestClassifier
model = XGBClassifier(
        tree_method="gpu_hist", 
        n_estimators=3000,
        eval_metric="logloss",
        learning_rate=0.1,
        max_depth=23,
        min_child_weight=3.481,
        gamma=0,
        reg_alpha=10 ** -3.4825
    )
print(eval_on_original(model, data_params={"dtrain": pd.read_csv("./train.csv"), "dtest": pd.read_csv("./test.csv")}))

In [3]:
params = {
    "tree_method": "gpu_hist", 
    "eval_metric": "logloss",
    "learning_rate": 0.001,
    "max_depth": 23,
    "min_child_weight": 3.481,
    "gamma": 0,
    "reg_alpha": 10 ** -3.4825
}

df_train = data_preprocess(pd.read_csv("./train.csv"))
X, y = df_train.drop(["id", "risk_flag"], axis=1), df_train["risk_flag"].values
X, y = AllKNN(n_jobs=-1, n_neighbors=20).fit_resample(X, y)
dtrain = xgb.DMatrix(data=X, label=y)
res = xgb.cv(params, dtrain, 10000, metrics="auc", nfold=5, stratified=True, verbose_eval=True)

UnboundLocalError: local variable 'df_test' referenced before assignment

# Super Learner

In [None]:
def super_tuning():
    model = XGBClassifier(
        tree_method="gpu_hist",
        n_estimators=200,
        learning_rate=0.1,
        scale_pos_weight=7.133,
        eval_metric="logloss", 
    )

    def par_mod(params):
        for par in ["max_depth"]:
            if par in params:
                params[par] = int(round(params[par]))
        for par in ["reg_alpha", "learning_rate"]:
            if par in params:
                params[par] = 10 ** params[par]
        return params
    
    pbounds = {
        "max_depth": (2, 25),
        "min_child_weight": (0, 20)
    }

    return tuning_super_learner(model, pbounds, par_mod=par_mod, n=5, init_points=20, n_iter=40)
superBO = super_tuning()

---
# Model Evaluation
---

In [None]:
from models import MODEL_LIST, TRANSFORM_LIST

# model_name = "xgb"
# model = MODEL_LIST[model_name]["model"]
# model.set_params(**MODEL_LIST[model_name]["params"])
# data_params = TRANSFORM_LIST[model_name]

# model.set_params(n_estimators=1000, learning_rate=0.1)

model = LGBMClassifier(
            learning_rate=0.05,
            n_estimators=2000,
            device="gpu",
            num_leaves=2 ** 16,
            boosting_type="goss",
            max_depth=23,
            max_bin=64,
            min_child_weight=1.899,
            lambda_l1=10**-3.565,
            lambda_l2=10**-2.953,
            verbose=-1
        )
data_params = {"dtrain": pd.read_csv("./train.csv"), "dtest": pd.read_csv("./test.csv")}
print(eval_on_original(model,  targets="risk_flag", data_params=data_params, fit_params={}, n=3))

- 

In [None]:
10 **-3.565

# Final Parameters
---
# XGBoost

```python
model = XGBClassifier(
    tree_method="gpu_hist", 
    learning_rate=0.00494, 
    n_estimators=10000, 
    eval_metric="logloss", 
    scale_pos_weight=19.35097,
    max_depth=8,
    min_child_weight=4.525,
    gamma=0.938358
)
```
---
# LightGBM

```python
model = LGBMClassifier(
    learning_rate=0.005,
    n_estimators=10000,
    device="gpu",
    boosting_type="goss",
    num_leaves=229,
    scale_pos_weight=13.56929,
    colsample_bytree=0.95781,
    subsample=0.74331,
    max_depth=7,
    min_child_weight=7.5546,
    max_bin=96,
    lambda_l1=0.0868,
    lambda_l2=0.01541,
    verbose=-1
)
```
---
# CatBoost

```python
cat_features = ["house_ownership", "car_ownership", "married", "city", "profession", "state", "age", "experience", "income"]
model = CatBoostClassifier(
    learning_rate=0.1,
    n_estimators=200,
    loss_function="Logloss",
    boosting_type="Ordered",
    eval_metric="AUC",
    cat_features=cat_features,
    max_depth=5,
    task_type="GPU",
    auto_class_weights="Balanced",
    verbose=False
)

data_params = {"drop_cols": {}, "transforms": [(catboost_dataset, {"cols": cat_features})], "no_encode": True}
```

In [None]:
from models import *
from imblearn.under_sampling import TomekLinks

model =  XGBClassifier(
                    n_estimators=3000,
                    learning_rate=0.01,
                    eval_metric="logloss",
                    tree_method="gpu_hist",
                    # gamma=0.9384,
                    # max_depth=8,
                    # min_child_weight=4.525
                )
def lgbv1_transform_fn(df, target_name="risk_flag"):
    sep = TomekLinks(sampling_strategy="not minority", n_jobs=-1)
    # under = InstanceHardnessThreshold(sampling_strategy=1, n_jobs=-1)
    sampler = Pipeline(steps=[('u1', sep)])
    X, y = df.drop(["id", target_name], axis=1), df[target_name].values
    # X, y = sampler.fit_resample(X, y)
    return X, y

res = []
for _ in range(1):
    dtrain, dtest = data_preprocess(pd.read_csv("./train.csv"), pd.read_csv("./test.csv"))
    Xtrain, ytrain = lgbv1_transform_fn(dtrain)
    model.fit(Xtrain, ytrain)
    preds = model.predict(dtest.drop("id", axis=1))
    res.append(evaluate_predictions(preds, "./BEST.csv"))
res = np.array(res)
print([res.mean(), res.std()])

- 0.6753484121544437