In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import time

import catboost as cb
import xgboost as xgb
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
from skopt import BayesSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

In [2]:
os.listdir("data")

['data_kmeans.csv',
 'sample_submission.csv',
 'test.csv',
 'test_best.csv',
 'test_dr_f.csv',
 'test_kmeans.csv',
 'test_max_sign.csv',
 'test_new.csv',
 'train.csv',
 'train_best.csv',
 'train_dr_f.csv',
 'train_kmeans.csv',
 'train_max_sign.csv',
 'train_new.csv']

In [5]:
data = pd.read_csv('data/train_oof.csv')
test = pd.read_csv('data/test_oof.csv')
sub = pd.read_csv('data/sample_submission.csv')

data = data.drop('Unnamed: 0', axis=1)
test = test.drop('Unnamed: 0', axis=1)

In [3]:
# data = pd.read_csv("data/train_best.csv")
# test = pd.read_csv("data/test_best.csv")


In [6]:
data.shape, test.shape

((7500, 10), (2500, 9))

In [7]:
x_train, y_train = data.drop("target",axis=1), data['target']

In [6]:
def bayes_optim(x, y, model, dict_params, scoring='roc_auc', random_state=42, cv=5):

    bayes_cv_tuner = BayesSearchCV(
        estimator = model(
        silent=True
        ),
        search_spaces = dict_params,
        cv=cv,
        scoring = scoring,
        n_jobs = -1,
    #     n_iter = 100,
        verbose = 1,
        refit = True,
        random_state = random_state
        )
    bayes_cv_tuner.fit(x, y)
    print(bayes_cv_tuner.best_params_)


In [7]:
bayes_cv_tuner_xgb = BayesSearchCV(
    estimator = xgb.XGBClassifier(
    # silent=True
    ),
    search_spaces = {
        # "booster": ("gbtree", "gblinear"),
        "n_estimators": (800, 3000),
    "learning_rate": (0.0001, 0.01),
    "gamma" : (0, 30),
    "max_depth": (2, 20),
    "subsample": (0.3, 0.9),
    "min_child_weight": (1, 20),
    "colsample_bytree": (0.1, 0.7),
    "max_delta_step": (1, 20),
#     "colsample_bynode": (0.1, 0.7),
    "lambda": (1, 20),
    "alpha" : (1, 20)
    },
    cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    refit = True,
    random_state = 42
    )
np.int = int
resultXGB = bayes_cv_tuner_xgb.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [8]:
bayes_cv_tuner_xgb.best_params_

OrderedDict([('alpha', 1),
             ('colsample_bytree', 0.7),
             ('gamma', 5),
             ('lambda', 1),
             ('learning_rate', 0.003419782719339618),
             ('max_delta_step', 3),
             ('max_depth', 8),
             ('min_child_weight', 2),
             ('n_estimators', 2466),
             ('subsample', 0.31098721316506717)])

In [16]:
dict_params = {
    "n_estimators": (500, 2000),
    "max_depth": (3, 15),
    "subsample" : (0.3, 0.7),
#     "leaf_estimation_iterations": (5, 150),
    "colsample_bylevel": (0.3, 0.9),
    "l2_leaf_reg": (2, 30),
    "learning_rate": (0.0001, 0.01),
#     "iterations": (100, 500)
    }
r = bayes_optim(x_train, y_train, cb.CatBoostClassifier, dict_params)

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [7]:
bayes_cv_tuner_cb = BayesSearchCV(
    estimator = cb.CatBoostClassifier(
    silent=True
    ),
    search_spaces = {
    "n_estimators": (500, 2000),
    "max_depth": (3, 15),
    "subsample" : (0.3, 0.7),
#     "leaf_estimation_iterations": (5, 150),
    "colsample_bylevel": (0.3, 0.9),
    "l2_leaf_reg": (2, 30),
    "learning_rate": (0.0001, 0.01),
#     "iterations": (100, 500)
    },
    cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    refit = True,
    random_state = 72
    )
# resultCAT = bayes_cv_tuner_cb.fit(x_train, y_train)

In [8]:
resultCAT = bayes_cv_tuner_cb.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [9]:
bayes_cv_tuner_cb.best_params_

OrderedDict([('colsample_bylevel', 0.5853177112754828),
             ('l2_leaf_reg', 22),
             ('learning_rate', 0.009906548674016816),
             ('max_depth', 3),
             ('n_estimators', 1993),
             ('subsample', 0.5000411100811685)])

OrderedDict([('alpha', 2),
             ('colsample_bytree', 0.5528885130826658),
             ('eta', 0.009432219749289407),
             ('gamma', 2),
             ('lambda', 4),
             ('max_depth', 19),
             ('subsample', 0.8217302497223249)])

In [10]:



bayes_cv_tuner_rf = BayesSearchCV(
    estimator = RandomForestClassifier(
#     silent=True
    ),
    search_spaces = {
    "n_estimators": (100, 2000),
    "max_depth" : (2, 20),
    "min_samples_split": (2, 100),
    "min_samples_leaf": (2, 20),
#     "min_sample_split ": (2, 50),
#     "max_features ": ("sqrt", "log2"),
    },
#     cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    # refit = True,
    random_state = 42
    )

In [None]:
np.int = int
resulrf = bayes_cv_tuner_rf.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
bayes_cv_tuner_rf.best_params_

In [18]:
    lgbm_params = {
    'boosting_type': 'gbdt',
    'n_estimators': 750,
    'learning_rate': 0.005134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    'subsample': 0.7,
    'is_unbalance': False,
    'random_state': 42,
    'silent': -1,
    'verbose': -1
}


bayes_cv_tuner_lgbm = BayesSearchCV(
    estimator = LGBMClassifier(
#     silent=True
    ),
    search_spaces = {
    "learning_rate": (0.0001, 0.1),
    "max_depth" : (2, 20),
    "subsample": (0.4, 0.8),
    "colsample_bytree": (0.001, 0.1),
#     "min_sample_split ": (2, 50),
#     "max_features ": ("sqrt", "log2"),
    },
#     cv=5,
    scoring = 'roc_auc',
    n_jobs = 1,
#     n_iter = 100,
    verbose = 1,
    refit = True,
    random_state = 72
    )

In [19]:
resulrf = bayes_cv_tuner_lgbm.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [20]:
bayes_cv_tuner_lgbm.best_params_

OrderedDict([('colsample_bytree', 0.0626668954124053),
             ('learning_rate', 0.1),
             ('max_depth', 2),
             ('subsample', 0.7789842765159666)])