In [16]:
from transforms import *
from main import data_preprocess, SINGLE_TRANSFORMS
from bayes_opt import BayesianOptimization

from sklearn import clone
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgbm
import catboost as cboost

from functools import partial

import warnings
warnings.filterwarnings('ignore')

In [2]:
def evaluate_predictions(preds, best_file):
    """Score predictions with ROC AUC against reference labels stored in a CSV.

    Args:
        preds: Values passed to ``roc_auc_score`` as the score argument.
        best_file: Path of a CSV file whose ``risk_flag`` column is used as
            the ground-truth argument.

    Returns:
        The result of ``roc_auc_score(reference_labels, preds)``.
    """
    reference = pd.read_csv(best_file)
    reference_labels = reference["risk_flag"].values
    return roc_auc_score(reference_labels, preds)

def eval_on_original(model, targets="risk_flag", data_params={}, fit_params={}, par_mod=None, n=1, **params):
    """Fit a fresh copy of ``model`` on the preprocessed data and score it.

    Args:
        model: Estimator to evaluate. It is cloned first, so the object passed
            in is never fitted or reconfigured.
        targets: Name of the label column in the training frame.
        data_params: Keyword arguments forwarded to ``data_preprocess``.
            The dict is only unpacked, never mutated.
        fit_params: Keyword arguments forwarded to ``model.fit``.
            The dict is only unpacked, never mutated.
        par_mod: Optional callable that receives the ``params`` dict and
            returns the dict that is actually applied with ``set_params``.
        n: Number of fit/predict/score repetitions to average over.
        **params: Estimator parameters applied with ``set_params``.

    Returns:
        Mean of the ``n`` scores returned by ``evaluate_predictions`` against
        ``./BEST.csv``.

    Raises:
        ZeroDivisionError: If ``n`` is 0 (no scores to average).
    """
    model = clone(model)

    if par_mod:
        params = par_mod(params)
    model.set_params(**params)

    df_train, df_test = data_preprocess(**data_params)

    # The feature/label split does not depend on the repetition, so build it
    # once instead of re-copying the frames on every pass through the loop.
    X, y = df_train.drop(["id", targets], axis=1), df_train[targets].values
    X_test = df_test.drop("id", axis=1)

    res = []
    for _ in range(n):
        model.fit(X, y, **fit_params)

        preds = model.predict(X_test)
        res.append(evaluate_predictions(preds, "./BEST.csv"))

    return sum(res) / len(res)


def tuning_w_original(model, pbounds, targets="risk_flag", data_params={}, fit_params={}, par_mod=None, **opt_kwargs):
    """Run Bayesian optimisation with ``eval_on_original`` as the objective.

    Args:
        model: Estimator handed to ``eval_on_original``.
        pbounds: Search bounds passed to ``BayesianOptimization``.
        targets: Label column name forwarded to ``eval_on_original``.
        data_params: Forwarded to ``eval_on_original``.
        fit_params: Forwarded to ``eval_on_original``.
        par_mod: Forwarded to ``eval_on_original``.
        **opt_kwargs: Keyword arguments forwarded to ``maximize``.

    Returns:
        The ``BayesianOptimization`` object after ``maximize`` has run.
    """

    def objective(**params):
        # Fixed arguments are bound positionally; the optimiser supplies only
        # the sampled parameters as keywords.
        return eval_on_original(model, targets, data_params, fit_params, par_mod, **params)

    optimizer = BayesianOptimization(objective, pbounds)
    optimizer.maximize(**opt_kwargs)
    return optimizer

def tuning_super_learner(model, pbounds, fit_params={}, par_mod=None, n=5, **opt_kwargs):
    """Tune the super-learner model on the ensemble files with Bayesian optimisation.

    Args:
        model: Estimator to tune. Each objective evaluation works on a clone.
        pbounds: Search bounds passed to ``BayesianOptimization``.
        fit_params: Keyword arguments forwarded to ``fit``.
            The dict is only unpacked, never mutated.
        par_mod: Optional callable that receives the sampled parameter dict
            and returns the dict applied with ``set_params``. When ``None``
            the sampled parameters are applied unchanged.
        n: Number of fit/predict/score repetitions averaged per evaluation.
        **opt_kwargs: Keyword arguments forwarded to ``maximize``.

    Returns:
        The ``BayesianOptimization`` object after ``maximize`` has run.
    """
    df_train = pd.read_csv("./ensemble_files/ensemble_train.csv")
    df_test = pd.read_csv("./ensemble_files/ensemble_test.csv")

    # These frames are identical for every evaluation, so split them once.
    X_train = df_train.drop(["id", "targets"], axis=1)
    y_train = df_train["targets"].values
    X_test = df_test.drop("id", axis=1)

    def eval_fn(**params):
        candidate = clone(model)
        # par_mod is optional; calling it unconditionally raised TypeError
        # whenever the default (None) was used.
        if par_mod:
            params = par_mod(params)
        candidate.set_params(**params)

        res = []
        for _ in range(n):
            candidate.fit(X_train, y_train, **fit_params)
            preds = candidate.predict(X_test)
            res.append(evaluate_predictions(preds, "./BEST.csv"))

        return sum(res) / len(res)

    superBO = BayesianOptimization(eval_fn, pbounds)
    superBO.maximize(**opt_kwargs)
    return superBO

# Tuning on Original Dataset
---

## XGBoost

In [None]:
def xgb_tuning():
    """Search the XGBoost learning rate with Bayesian optimisation.

    The learning rate is sampled as a base-10 exponent in [-4, -2] and turned
    into the actual value by ``par_mod``. All other hyper-parameters are fixed
    on the model.

    Returns:
        The optimiser object returned by ``tuning_w_original``.
    """

    def par_mod(params):
        # Updates the sampled dict in place and hands the same dict back.
        for name, value in list(params.items()):
            if name == "max_depth":
                # Sampled as a float; the estimator gets an integer.
                params[name] = int(round(value))
            elif name in ("reg_alpha", "learning_rate"):
                # Sampled as a base-10 exponent.
                params[name] = 10 ** value
        return params

    search_space = {
        "learning_rate": (-4, -2),
    }

    base_model = XGBClassifier(
        tree_method="gpu_hist",
        n_estimators=10000,
        eval_metric="logloss",
        scale_pos_weight=19.35097,
        max_depth=8,
        min_child_weight=4.525,
        gamma=0.938358,
    )

    return tuning_w_original(
        base_model,
        search_space,
        par_mod=par_mod,
        init_points=20,
        n_iter=40,
    )


xgbBO = xgb_tuning()

## LightGBM

In [None]:
def lgbm_tuning():
    """Search LightGBM's L1/L2 regularisation with Bayesian optimisation.

    ``lambda_l1`` and ``lambda_l2`` are sampled as base-10 exponents in
    [-5, 1] and converted by ``par_mod``. All other hyper-parameters are fixed
    on the model.

    Returns:
        The optimiser object returned by ``tuning_w_original``.
    """

    def par_mod(params):
        # Updates the sampled dict in place and hands the same dict back.
        for name, value in list(params.items()):
            if name in ("max_depth", "max_bin", "num_leaves", "bagging_freq"):
                # Sampled as floats; the estimator gets integers.
                params[name] = int(round(value))
            elif name in ("lambda_l1", "lambda_l2", "learning_rate"):
                # Sampled as base-10 exponents.
                params[name] = 10 ** value
        return params

    search_space = {
        "lambda_l1": (-5, 1),
        "lambda_l2": (-5, 1),
        # Disabled bound kept from an earlier experiment:
        # "reg_alpha": (-5, 1)
    }

    base_model = LGBMClassifier(
        learning_rate=0.1,
        n_estimators=200,
        device="gpu",
        boosting_type="goss",
        num_leaves=229,
        max_depth=7,
        min_child_weight=7.5546,
        max_bin=96,
        scale_pos_weight=13.56929,
        colsample_bytree=0.95781,
        subsample=0.74331,
        verbose=-1,
    )

    return tuning_w_original(
        base_model,
        search_space,
        par_mod=par_mod,
        init_points=20,
        n_iter=40,
    )


lgbmBO = lgbm_tuning()

## CatBoost

In [None]:
def catboost_tuning():
    """Run the Bayesian-optimisation harness for the CatBoost model.

    The data is prepared with the ``catboost_dataset`` transform applied to
    the categorical columns, with ``no_encode`` set to True.

    Returns:
        The optimiser object returned by ``tuning_w_original``.
    """
    cat_features = [
        "house_ownership",
        "car_ownership",
        "married",
        "city",
        "profession",
        "state",
        "age",
        "experience",
        "income",
    ]

    def par_mod(params):
        # Updates the sampled dict in place and hands the same dict back.
        for name, value in list(params.items()):
            if name in ("max_depth", "max_bin"):
                # Sampled as floats; the estimator gets integers.
                params[name] = int(round(value))
            elif name in ("l2_leaf_reg",):
                # Sampled as a base-10 exponent.
                params[name] = 10 ** value
        return params

    # NOTE(review): every bound below is disabled, so the optimiser receives an
    # empty search space - confirm how BayesianOptimization handles that, or
    # re-enable at least one bound before running this cell.
    search_space = {
        # "random_strength": (0, 20),
        # "subsample": (0.1, 1),
        # "scale_pos_weight": (5, 20),
        # "l2_leaf_reg": (-5, 1),
        # "bagging_temperature": (0.1, 1)
    }

    base_model = CatBoostClassifier(
        learning_rate=0.1,
        n_estimators=200,
        loss_function="Logloss",
        boosting_type="Ordered",
        eval_metric="AUC",
        cat_features=cat_features,
        task_type="GPU",
        max_depth=5,
        auto_class_weights="Balanced",
        verbose=False,
    )

    data_params = {
        "drop_cols": {},
        "transforms": [(catboost_dataset, {"cols": cat_features})],
        "no_encode": True,
    }

    return tuning_w_original(
        base_model,
        search_space,
        par_mod=par_mod,
        data_params=data_params,
        init_points=20,
        n_iter=40,
    )


cboostBO = catboost_tuning()

In [8]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Baseline: balanced random forest scored once with the default preprocessing.
clf = BalancedRandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
)
print(eval_on_original(clf))

0.8200822481151473


# Super Learner

In [None]:
def super_tuning():
    """Tune the XGBoost super learner's tree depth and minimum child weight.

    ``max_depth`` is sampled in [2, 25] and rounded to an integer by
    ``par_mod``; ``min_child_weight`` is sampled in [0, 20] and used as is.
    Each candidate is scored as the mean of 5 repetitions.

    Returns:
        The optimiser object returned by ``tuning_super_learner``.
    """

    def par_mod(params):
        # Updates the sampled dict in place and hands the same dict back.
        for name, value in list(params.items()):
            if name == "max_depth":
                # Sampled as a float; the estimator gets an integer.
                params[name] = int(round(value))
            elif name in ("reg_alpha", "learning_rate"):
                # Sampled as a base-10 exponent.
                params[name] = 10 ** value
        return params

    search_space = {
        "max_depth": (2, 25),
        "min_child_weight": (0, 20),
    }

    base_model = XGBClassifier(
        tree_method="gpu_hist",
        n_estimators=200,
        learning_rate=0.1,
        scale_pos_weight=7.133,
        eval_metric="logloss",
    )

    return tuning_super_learner(
        base_model,
        search_space,
        par_mod=par_mod,
        n=5,
        init_points=20,
        n_iter=40,
    )


superBO = super_tuning()

---
# Model Evaluation
---

In [6]:
from models import MODEL_LIST, TRANSFORM_LIST

# Build the model and its preprocessing settings inside this cell so that it
# runs on a fresh kernel. Previously these lines were commented out and the
# call below relied on `model` / `data_params` left over from earlier runs.
# NOTE(review): "xgb" is the key from the previously commented-out setup;
# confirm it is the model the recorded score belongs to.
model_name = "xgb"
# Clone so that set_params does not modify the estimator stored in MODEL_LIST.
model = clone(MODEL_LIST[model_name]["model"])
model.set_params(**MODEL_LIST[model_name]["params"])
data_params = TRANSFORM_LIST[model_name]

# Optional quick-run override, left disabled as before:
# model.set_params(n_estimators=1000, learning_rate=0.1)


print(eval_on_original(model,  targets="risk_flag", data_params=data_params, fit_params={}, n=5))

0.876288195422061


- 0.7725611149188942 (RF)
- 0.8480793898917687 (LGBM)
- 0.9994960751423004
- 0.9996103086355607

# Final Parameters
---
## XGBoost

```python
model = XGBClassifier(
    tree_method="gpu_hist", 
    learning_rate=0.00494, 
    n_estimators=10000, 
    eval_metric="logloss", 
    scale_pos_weight=19.35097,
    max_depth=8,
    min_child_weight=4.525,
    gamma=0.938358
)
```
---
## LightGBM

```python
model = LGBMClassifier(
    learning_rate=0.005,
    n_estimators=10000,
    device="gpu",
    boosting_type="goss",
    num_leaves=229,
    scale_pos_weight=13.56929,
    colsample_bytree=0.95781,
    subsample=0.74331,
    max_depth=7,
    min_child_weight=7.5546,
    max_bin=96,
    lambda_l1=0.0868,
    lambda_l2=0.01541,
    verbose=-1
)
```
---
## CatBoost

```python
cat_features = ["house_ownership", "car_ownership", "married", "city", "profession", "state", "age", "experience", "income"]
model = CatBoostClassifier(
    learning_rate=0.1,
    n_estimators=200,
    loss_function="Logloss",
    boosting_type="Ordered",
    eval_metric="AUC",
    cat_features=cat_features,
    max_depth=5,
    task_type="GPU",
    auto_class_weights="Balanced",
    verbose=False
)

data_params = {"drop_cols": {}, "transforms": [(catboost_dataset, {"cols": cat_features})], "no_encode": True}
```

In [17]:
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from imblearn.pipeline import Pipeline

# Resampling experiment: oversample with SMOTE, then undersample with
# InstanceHardnessThreshold, fit the model on the resampled data and score its
# test predictions against ./BEST.csv.

# Alternative model used in earlier runs (disabled):
# model = XGBClassifier(
#     tree_method="gpu_hist",
#     learning_rate=0.1,
#     n_estimators=500,
#     eval_metric="logloss",
#     max_depth=8,
#     min_child_weight=4.525,
#     gamma=0.938358
# )
model = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    max_features=1,
)
df_train, df_test = data_preprocess()

under = InstanceHardnessThreshold(sampling_strategy=1, n_jobs=-1)
over = SMOTE(sampling_strategy=0.75, k_neighbors=5)
# The pipeline applies the oversampler first and the undersampler second.
sampler = Pipeline(
    steps=[
        ('o', over),
        ('u', under),
    ]
)

res = []
for _ in range(1):
    X, y = sampler.fit_resample(
        df_train.drop(["id", "risk_flag"], axis=1),
        df_train["risk_flag"].values,
    )
    model.fit(X, y)
    preds = model.predict(df_test.drop("id", axis=1))
    res.append(evaluate_predictions(preds, "./BEST.csv"))

# Report the mean and standard deviation of the collected scores.
res = np.array(res)
print([res.mean(), res.std()])

[0.9934563320481204, 0.00027806230471899107]


## Undersampling Only
---
- 0.868648 (no sampling)
- 0.718138 (0.5)
- 0.780676 (0.8)
- 0.798733 (1.0)

## Oversampling Only
---
- 0.776018 (`sampling_strategy`=0.5)
- 0.794863 (`sampling_strategy`=0.8)
- 0.858355 (`sampling_strategy`=0.75, `k_neighbors`=5, combined with `InstanceHardnessThreshold` undersampling)