In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import time

import catboost as cb
import xgboost as xgb
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
from skopt import BayesSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

In [10]:
os.listdir("data")

['sample_submission.csv', 'test.csv', 'test_n.csv', 'train.csv', 'train_n.csv']

In [11]:
data = pd.read_csv('data/train_n.csv')
test = pd.read_csv('data/test_n.csv')
# sub = pd.read_csv('data/sample_submission.csv')

data = data.drop('Unnamed: 0', axis=1)
test = test.drop('Unnamed: 0', axis=1)

In [12]:
# data = pd.read_csv("data/train_best.csv")
# test = pd.read_csv("data/test_best.csv")


In [13]:
data.columns

Index(['id', 'customerid', 'creditscore', 'age', 'tenure', 'balance',
       'numofproducts', 'hascrcard', 'isactivemember', 'estimatedsalary',
       'exited', '0', '1', '2', '3', '4', 'le_geography', 'le_gender'],
      dtype='object')

In [14]:
data.shape, test.shape

((165034, 18), (110023, 17))

In [15]:
x_train, y_train = data.drop('exited',axis=1), data['exited']

In [16]:
def bayes_optim(x, y, model, dict_params, scoring='roc_auc', random_state=42, cv=5):

    bayes_cv_tuner = BayesSearchCV(
        estimator = model(
        silent=True
        ),
        search_spaces = dict_params,
        cv=cv,
        scoring = scoring,
        n_jobs = -1,
    #     n_iter = 100,
        verbose = 1,
        refit = True,
        random_state = random_state
        )
    bayes_cv_tuner.fit(x, y)
    print(bayes_cv_tuner.best_params_)


In [17]:
bayes_cv_tuner_xgb = BayesSearchCV(
    estimator = xgb.XGBClassifier(
    # silent=True
    ),
    search_spaces = {
        # "booster": ("gbtree", "gblinear"),
        "n_estimators": (800, 3000),
    "learning_rate": (0.0001, 0.01),
    "gamma" : (0, 30),
    "max_depth": (2, 20),
    "subsample": (0.3, 0.9),
    "min_child_weight": (1, 20),
    "colsample_bytree": (0.1, 0.7),
    "max_delta_step": (1, 20),
#     "colsample_bynode": (0.1, 0.7),
    "lambda": (1, 20),
    "alpha" : (1, 20)
    },
    cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    refit = True,
    random_state = 42
    )
np.int = int
resultXGB = bayes_cv_tuner_xgb.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [18]:
bayes_cv_tuner_xgb.best_params_

OrderedDict([('alpha', 15),
             ('colsample_bytree', 0.663621842561663),
             ('gamma', 5),
             ('lambda', 5),
             ('learning_rate', 0.008024981991421775),
             ('max_delta_step', 8),
             ('max_depth', 10),
             ('min_child_weight', 11),
             ('n_estimators', 2528),
             ('subsample', 0.6344563221826109)])

In [19]:
dict_params = {
    "n_estimators": (500, 2000),
    "max_depth": (3, 15),
    "subsample" : (0.3, 0.7),
#     "leaf_estimation_iterations": (5, 150),
    "colsample_bylevel": (0.3, 0.9),
    "l2_leaf_reg": (2, 30),
    "learning_rate": (0.0001, 0.01),
#     "iterations": (100, 500)
    }
r = bayes_optim(x_train, y_train, cb.CatBoostClassifier, dict_params)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [20]:
bayes_cv_tuner_cb = BayesSearchCV(
    estimator = cb.CatBoostClassifier(
    silent=True
    ),
    search_spaces = {
    "n_estimators": (500, 2000),
    "max_depth": (3, 15),
    "subsample" : (0.3, 0.7),
#     "leaf_estimation_iterations": (5, 150),
    "colsample_bylevel": (0.3, 0.9),
    "l2_leaf_reg": (2, 30),
    "learning_rate": (0.0001, 0.01),
#     "iterations": (100, 500)
    },
    cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    refit = True,
    random_state = 72
    )
# resultCAT = bayes_cv_tuner_cb.fit(x_train, y_train)

In [21]:
resultCAT = bayes_cv_tuner_cb.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 

In [None]:
bayes_cv_tuner_cb.best_params_

In [22]:



bayes_cv_tuner_rf = BayesSearchCV(
    estimator = RandomForestClassifier(
#     silent=True
    ),
    search_spaces = {
    "n_estimators": (100, 2000),
    "max_depth" : (2, 20),
    "min_samples_split": (2, 100),
    "min_samples_leaf": (2, 20),
#     "min_sample_split ": (2, 50),
#     "max_features ": ("sqrt", "log2"),
    },
#     cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    # refit = True,
    random_state = 42
    )

In [23]:
np.int = int
resulrf = bayes_cv_tuner_rf.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 

In [None]:
bayes_cv_tuner_rf.best_params_

In [24]:
    lgbm_params = {
    'boosting_type': 'gbdt',
    'n_estimators': 750,
    'learning_rate': 0.005134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    'subsample': 0.7,
    'is_unbalance': False,
    'random_state': 42,
    'silent': -1,
    'verbose': -1
}


bayes_cv_tuner_lgbm = BayesSearchCV(
    estimator = LGBMClassifier(
#     silent=True
    ),
    search_spaces = {
    "learning_rate": (0.0001, 0.1),
    "max_depth" : (2, 20),
    "subsample": (0.4, 0.8),
    "colsample_bytree": (0.001, 0.1),
#     "min_sample_split ": (2, 50),
#     "max_features ": ("sqrt", "log2"),
    },
#     cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    refit = True,
    random_state = 72
    )

In [25]:
resulrf = bayes_cv_tuner_lgbm.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] Number of positive: 27937, number of negative: 104090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1379
[LightGBM] [Info] Number of data points in the train set: 132027, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211601 -> initscore=-1.315304
[LightGBM] [Info] Start training from score -1.315304
[LightGBM] [Info] Number of positive: 27937, number of negative: 104090
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000403 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1377
[LightGBM] [Info] Number of data points in the tr

KeyboardInterrupt: 

In [None]:
bayes_cv_tuner_lgbm.best_params_