In [2]:
import numpy as np
import pandas as pd
import gc
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import xgboost as xgb
from pytorch_tabnet.tab_model import TabNetClassifier
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

from functools import partial

from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
from skopt.plots import plot_convergence

from bo_parameters import *

In [3]:
lgbm_tuned = {
    "learning_rate" : 0.10284216487315759, 
    "max_depth" : 4, 
    "n_estimators" : 955
}
# lgbm_tuned += LIGHTGBM_PARAMS

xgb_tuned = {
    "learning_rate" : 0.3317433223693123, 
    "max_depth" : 4, 
    "n_estimators" : 772
}
# xgb_tuned += XGBOOST_PARAMS

tabnet_tuned = {
    "gamma" : 1.0131059206061017, 
    "lambda_sparse" : 0.0051151725754103195, 
    "n_steps" : 10,
    "n_a" : 54,
}
tabnet_tuned["n_d"] = tabnet_tuned["n_a"]

In [4]:
data = pd.read_csv("data/census_income/train.csv")

In [5]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,TARGET
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


In [26]:
categorical_idx = [1, 3, 5, 6, 7 ,8 , 9, 13]
categorical_dims = [data[str(idx)].nunique() for idx in categorical_idx]

In [7]:
X = data.drop(["TARGET"], axis=1)
y = data["TARGET"]

In [8]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, shuffle=True)

In [20]:
model_lgbm = LGBMClassifier(**{**lgbm_tuned, **LIGHTGBM_PARAMS})
model_lgbm.fit(train_X, train_y, verbose=1, categorical_feature=categorical)



LGBMClassifier(learning_rate=0.10284216487315759, max_depth=4, metric='auc',
               n_estimators=955, objective='binary', random_state=42)

In [21]:
model_xgb = XGBClassifier(**{**xgb_tuned, **XGBOOST_PARAMS})
model_xgb.fit(train_X, train_y, verbose=1)



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3317433223693123, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=772, n_jobs=-1, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [12]:
TABNET_PARAMS["verbose"] = 1

In [28]:
model_tabnet = TabNetClassifier(**{**tabnet_tuned, **TABNET_PARAMS}, cat_idxs=categorical_idx, cat_dims=categorical_dims)
model_tabnet.fit(train_X.values, train_y.values, max_epochs=30)

Device used : cpu
No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 0.6656  |  0:00:12s
epoch 1  | loss: 0.42369 |  0:00:25s
epoch 2  | loss: 0.38812 |  0:00:38s
epoch 3  | loss: 0.35644 |  0:00:51s
epoch 4  | loss: 0.35112 |  0:01:04s
epoch 5  | loss: 0.33862 |  0:01:16s
epoch 6  | loss: 0.3417  |  0:01:29s
epoch 7  | loss: 0.33436 |  0:01:42s
epoch 8  | loss: 0.33178 |  0:01:55s
epoch 9  | loss: 0.3315  |  0:02:08s
epoch 10 | loss: 0.32561 |  0:02:21s
epoch 11 | loss: 0.33353 |  0:02:33s
epoch 12 | loss: 0.32694 |  0:02:46s
epoch 13 | loss: 0.32329 |  0:02:59s
epoch 14 | loss: 0.32514 |  0:03:11s
epoch 15 | loss: 0.32218 |  0:03:24s
epoch 16 | loss: 0.32521 |  0:03:37s
epoch 17 | loss: 0.32119 |  0:03:50s
epoch 18 | loss: 0.32064 |  0:04:07s
epoch 19 | loss: 0.31872 |  0:04:19s
epoch 20 | loss: 0.3185  |  0:04:32s
epoch 21 | loss: 0.31767 |  0:04:45s
epoch 22 | loss: 0.31821 |  0:04:57s
epoch 23 | loss: 0.32556 |  0:05:10s
epoch 24 | loss: 0.31

In [60]:
print("LightGBM auc: ", round(roc_auc_score(test_y, model_lgbm.predict_proba(test_X)[:, 1]), 6))
print("XGBoost auc: ", round(roc_auc_score(test_y, model_xgb.predict_proba(test_X)[:, 1]), 6))
print("TabNet auc: ", round(roc_auc_score(test_y, model_tabnet.predict_proba(test_X.values)[:, 1]), 6))

LightGBM auc:  0.928408
XGBoost auc:  0.920312
TabNet auc:  0.914119


In [34]:
model_lgbm.feature_importances_

array([2468,  327, 3060,  497,  969,  232,  911,  220,   78,  206,  970,
        787, 1575,  453], dtype=int32)

In [31]:
model_xgb.get_booster().get_score(importance_type="gain")

{'7': 28.159757582345208,
 '4': 10.709002934632903,
 '10': 11.213742910984683,
 '0': 3.048914618211433,
 '11': 5.690265756699691,
 '2': 1.488682435981519,
 '12': 2.523064781128803,
 '5': 8.726319784934647,
 '6': 2.5358263473938125,
 '9': 3.2637663456734938,
 '1': 1.8492987296848824,
 '3': 1.4335334825310921,
 '8': 1.6980304298327267,
 '13': 1.138773708076229}

In [33]:
model_xgb.feature_importances_

array([0.03652314, 0.02215287, 0.01783302, 0.01717239, 0.12828383,
       0.10453314, 0.03037683, 0.33732754, 0.02034082, 0.03909687,
       0.13433015, 0.06816406, 0.03022396, 0.01364144], dtype=float32)

In [35]:
model_tabnet.feature_importances_

array([0.11711631, 0.04215368, 0.03694978, 0.04146911, 0.13953264,
       0.07028268, 0.07225879, 0.10123563, 0.02899957, 0.08616419,
       0.07761269, 0.054379  , 0.10391032, 0.02793562])

In [37]:
importance_lgbm = list(enumerate(model_lgbm.feature_importances_))
importance_xgb = list(enumerate(model_xgb.feature_importances_))
importance_tabnet = list(enumerate(model_tabnet.feature_importances_))

In [39]:
importance_lgbm = sorted(importance_lgbm, key=lambda x: x[1], reverse=True)
importance_xgb = sorted(importance_xgb, key=lambda x: x[1], reverse=True)
importance_tabnet = sorted(importance_tabnet, key=lambda x: x[1], reverse=True)

In [56]:
df_compare = pd.DataFrame([x[0] for x in importance_lgbm], columns=["LightGBM"])
df_compare["XGBoost"] = [x[0] for x in importance_xgb]
df_compare["TabNet"] = [x[0] for x in importance_tabnet]

In [57]:
df_compare

Unnamed: 0,LightGBM,XGBoost,TabNet
0,2,7,4
1,0,10,0
2,12,4,12
3,10,5,7
4,4,11,9
5,6,9,10
6,11,0,6
7,3,6,5
8,13,12,11
9,1,1,1


In [51]:
[x[0] for x in importance_xgb]

[7, 10, 4, 5, 11, 9, 0, 6, 12, 1, 8, 2, 3, 13]

In [52]:
[x[0] for x in importance_lgbm]

[2, 0, 12, 10, 4, 6, 11, 3, 13, 1, 5, 7, 9, 8]

In [45]:
print("LGBM      XGB      TABNET")
for i in range(len(importance_lgbm)):
    print(importance_lgbm[i][0], "    ",  importance_xgb[i][0], "    ", importance_tabnet[i][0], "    ")

LGBM      XGB      TABNET
2      7      4     
0      10      0     
12      4      12     
10      5      7     
4      11      9     
6      9      10     
11      0      6     
3      6      5     
13      12      11     
1      1      1     
5      8      3     
7      2      2     
9      3      8     
8      13      13     
