In [7]:
#  Libraries
import tqdm
import numpy as np 
import pandas as pd 
# Data processing, metrics and modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold,KFold
from bayes_opt import BayesianOptimization
from datetime import datetime
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc,precision_recall_curve
from sklearn import metrics
from sklearn import preprocessing
# Lgbm
import lightgbm as lgb
# Suppr warning
import warnings
warnings.filterwarnings("ignore")

import itertools
from scipy import interp

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams

In [8]:
def load_data(file):
    return pd.read_csv(file)

test = load_data('test_prepared.csv')
train = load_data('train_prepared.csv')

def nan2mean(df):
    for x in list(df.columns.values):
        df[x] = df[x].fillna(df[x].mean())
    return df

In [9]:
for col in tqdm.tqdm(train.columns): 
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))  
        
features = list(train)
features.remove('isFraud')
target = 'isFraud'

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 355/355 [00:29<00:00, 11.89it/s]


In [10]:
train=nan2mean(train)
test=nan2mean(test)

In [11]:
#black box LGBM 
def LGB_bayesian(
    #learning_rate,
    num_leaves, 
    bagging_fraction,
    feature_fraction,
    min_child_weight, 
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda
     ):
    
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    

    param = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'min_child_weight': min_child_weight,
              'bagging_fraction' : bagging_fraction,
              'feature_fraction' : feature_fraction,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'objective': 'binary',
              'save_binary': True,
              'seed': 4041,
              'feature_fraction_seed': 4041,
              'bagging_seed': 4041,
              'drop_seed': 4041,
              'data_random_seed': 4041,
              'boosting_type': 'gbdt',
              'verbose': 1,
              'is_unbalance': False,
              'boost_from_average': True,
              'metric':'auc'}    
    
    oof = np.zeros(len(train))
    trn_data= lgb.Dataset(train.iloc[bayesian_tr_idx][features].values, label=train.iloc[bayesian_tr_idx][target].values)
    val_data= lgb.Dataset(train.iloc[bayesian_val_idx][features].values, label=train.iloc[bayesian_val_idx][target].values)

    clf = lgb.train(param, trn_data,  num_boost_round=50, valid_sets = [trn_data, val_data], verbose_eval=0, early_stopping_rounds = 50)
    
    oof[bayesian_val_idx]  = clf.predict(train.iloc[bayesian_val_idx][features].values, num_iteration=clf.best_iteration)  
    
    score = roc_auc_score(train.iloc[bayesian_val_idx][target].values, oof[bayesian_val_idx])

    return score

In [12]:
bayesian_tr_idx, bayesian_val_idx = train_test_split(train, test_size = 0.3, random_state = 42, stratify = train[target])
bayesian_tr_idx = bayesian_tr_idx.index
bayesian_val_idx = bayesian_val_idx.index



# Bounded region of parameter space
bounds_LGB = {
    'num_leaves': (31, 500), 
    'min_data_in_leaf': (20, 500),
    'bagging_fraction' : (0.1, 0.9),
    'feature_fraction' : (0.1, 0.9),
    'min_child_weight': (0.001, 0.1),   
    'reg_alpha': (0, 2), 
    'reg_lambda': (0, 2),
    'max_depth':(-1,50),
}

LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=42)

init_points = 10
n_iter = 15

print('-' * 130)

with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)


LGB_BO.max["target"]



LGB_BO.max['params']



----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_da... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------
|  1        |  0.9369   |  0.3996   |  0.8606   |  36.33    |  0.06027  |  94.89    |  104.2    |  0.1162   |  1.732    |
|  2        |  0.9383   |  0.5809   |  0.6665   |  0.04981  |  0.09702  |  419.6    |  130.6    |  0.3636   |  0.3668   |
|  3        |  0.9306   |  0.3434   |  0.5198   |  21.03    |  0.02983  |  313.7    |  96.42    |  0.5843   |  0.7327   |
|  4        |  0.9157   |  0.4649   |  0.7281   |  9.183    |  0.05191  |  304.4    |  52.79    |  1.215    |  0.341    |
|  5        |  0.9304   |  0.152    |  0.8591   |  48.25    |  0.08103  |  166.2    |  76.81    |  1.368    |  0.8803   |
|  6        |  

{'bagging_fraction': 0.9,
 'feature_fraction': 0.9,
 'max_depth': 50.0,
 'min_child_weight': 0.09999999999449685,
 'min_data_in_leaf': 264.51564163645276,
 'num_leaves': 500.0,
 'reg_alpha': 0.0,
 'reg_lambda': 2.0}