In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.preprocessing   import OneHotEncoder

from sklearn.metrics import roc_curve, roc_auc_score
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
import pandas as pd
import numpy as np
import IPython
import timeit


import warnings
#warnings.filterwarnings("ignore")

## Model

In [2]:
def report_1(model, data, y):
    """Print and return the ROC AUC of the model's hard class predictions.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    data : feature rows handed to ``model.predict``.
    y : true labels scored against the predictions.

    Returns
    -------
    The value returned by ``roc_auc_score(y, model.predict(data))``.
    """
    # AUC is computed on predicted labels here; report_2 is the
    # probability-based counterpart.
    score = roc_auc_score(y, model.predict(data))
    print('Binary: ROC AUC=%.3f' % score)
    return score
    
def report_2(model, data, y):
    """Print and return the ROC AUC of the model's predicted probabilities.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    data : feature rows handed to ``model.predict_proba``.
    y : true labels scored against the probabilities.

    Returns
    -------
    The value returned by ``roc_auc_score`` for the last column of
    ``model.predict_proba(data)``.
    """
    # Only the last probability column is scored.
    last_class_proba = model.predict_proba(data)[:, -1]
    score = roc_auc_score(y, last_class_proba)
    print('Binary: ROC AUC=%.3f' % score)
    return score

In [3]:
def optimize_catboost(file, cat_feat):
    """Tune CatBoost hyper-parameters for one dataset with Bayesian optimisation.

    Reads ``./Data/<file>.csv``, casts the ``cat_feat`` columns to integers,
    drops every column whose name matches ``"Unname"``, and makes an unseeded
    75/25 train/test split with ``label`` as the target.

    Parameters
    ----------
    file : str
        Dataset file name without the ``.csv`` extension.
    cat_feat : list
        Column names to cast to int and hand to CatBoost as ``cat_features``.

    Returns
    -------
    tuple
        ``(params, X_train, X_test, y_train, y_test)`` where ``params`` is the
        best parameter dict found by the optimiser (``max_depth`` cast to int)
        and the remaining items are the split used during tuning.
    """
    df = pd.read_csv('./Data/'+file+'.csv')
    for col in cat_feat:
        # Builtin int instead of np.int: the alias was deprecated in NumPy 1.20
        # and removed in 1.24, where it raises AttributeError.
        df[col] = df[col].astype(int)
    df.drop(df.filter(regex="Unname"), axis=1, inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(df.drop('label', axis=1),
                                                    df['label'], test_size=0.25)

    def cat_evaluate(max_depth, colsample_bytree, reg_lambda, learning_rate, iterations):
        """Objective: fit one CatBoost model and return its AUC on X_test."""
        # The optimiser proposes floats, so the integer-valued parameters are cast.
        model = CatBoostClassifier(learning_rate = learning_rate,
                                   reg_lambda = reg_lambda,
                                   max_depth = int(max_depth),
                                   colsample_bylevel = colsample_bytree,
                                   thread_count=4, task_type='CPU',
                                   cat_features=cat_feat,
                                   verbose=False,
                                   iterations=int(iterations))

        # early_stopping_rounds (2000) is larger than the upper bound on
        # iterations (1000), so training always runs the full iteration count.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)],
                  verbose_eval=1,
                  early_stopping_rounds=2000,
                  use_best_model=False,
                  plot=False)

        # NOTE(review): the objective is scored on X_test, the same split that
        # train_cat later reports its final AUC on, and it uses hard labels
        # from predict() rather than probabilities.
        preds = model.predict(X_test)
        lr_auc = roc_auc_score(y_test, preds)

        return lr_auc

    cat_bo = BayesianOptimization(cat_evaluate, {'max_depth': (2, 14),
                                                 'colsample_bytree': (0.01, 1),
                                                 'reg_lambda': (0, 100),
                                                 'learning_rate': (0.01, 1),
                                                 'iterations':(16, 1000)})

    cat_bo.maximize(init_points=8, n_iter=10, acq='ei')

    params = cat_bo.max['params']
    params['max_depth'] = int(params['max_depth'])

    return params, X_train, X_test, y_train, y_test

In [4]:
def _stringify_categoricals(rows, cat_idx):
    """Return ``rows`` as a list with the values at positions ``cat_idx`` turned into str."""
    if not cat_idx:
        # Nothing to convert: hand the rows over unchanged.
        return list(rows)
    converted = []
    for row in rows:
        values = list(row)
        for idx in cat_idx:
            values[idx] = str(values[idx])
        converted.append(values)
    return converted


def train_cat(file, cat_feat):
    """Tune, train and evaluate CatBoost on one dataset file.

    Runs ``optimize_catboost`` to get the best parameters and the train/test
    split, fits one model per fold of a 10-fold stratified split of the
    training part, and reports the AUC of the last fitted model on the test
    part via ``report_1``.

    Parameters
    ----------
    file : str
        Dataset file name without extension, forwarded to ``optimize_catboost``.
    cat_feat : list
        Categorical column identifiers. They are converted with ``int`` and used
        as positional feature indices.
        NOTE(review): assumes the column names equal their positions in the
        feature matrix - confirm against the data files.

    Returns
    -------
    tuple
        ``(auc, elapsed_seconds, params)``.
    """
    start = timeit.default_timer()
    print("Model Bayesian Optimization")
    params, X_train, X_test, y_train, y_test = optimize_catboost(file, cat_feat)

    kfold = 10
    # random_state is only accepted together with shuffle=True; current
    # scikit-learn raises ValueError otherwise. The folds were never shuffled,
    # so the argument is dropped rather than enabling shuffling.
    skf = StratifiedKFold(n_splits=kfold)

    y = y_train.values
    X = X_train.values

    cat2 = list(map(int, cat_feat))

    # NOTE(review): every fold trains a fresh model, but only the one from the
    # last fold survives the loop and is evaluated below.
    for fold_index, (train_index, valid_index) in enumerate(skf.split(X, y)):
        print('[Fold %d/%d]' % (fold_index + 1, kfold))
        X_fold_train, X_fold_valid = X[train_index], X[valid_index]
        y_fold_train, y_fold_valid = y[train_index], y[valid_index]

        model = CatBoostClassifier(learning_rate = params['learning_rate'],
                                   reg_lambda = params['reg_lambda'],
                                   max_depth = int(params['max_depth']),
                                   colsample_bylevel = params['colsample_bytree'],
                                   iterations=int(params['iterations']),
                                   thread_count=4, task_type='CPU',
                                   verbose=False,
                                   cat_features=cat2)

        # Categorical positions are passed to CatBoost as strings.
        fold_train_rows = _stringify_categoricals(X_fold_train, cat2)
        fold_valid_rows = _stringify_categoricals(X_fold_valid, cat2)

        model.fit(fold_train_rows, y_fold_train,
                  eval_set=(fold_valid_rows, y_fold_valid),
                  verbose_eval=False,
                  early_stopping_rounds=2000,
                  use_best_model=True,
                  plot=False)

    # Same string conversion for the held-out rows before scoring.
    test_rows = _stringify_categoricals(X_test.values.tolist(), cat2)

    print("Model Train and Test AUC Report")
    auc = report_1(model, test_rows, y_test)

    stop = timeit.default_timer()
    elapsed = stop - start
    print(f'Model Training Time: {elapsed} s')
    return auc, elapsed, params

In [5]:
def optimize_lgb(file, cat_feat):
    """Tune LightGBM hyper-parameters for one dataset with Bayesian optimisation.

    Reads ``./Data/<file>.csv``, casts the ``cat_feat`` columns to integers,
    drops every column whose name matches ``"Unname"``, and makes an unseeded
    75/25 train/test split with ``label`` as the target. Each candidate is
    scored by 10-fold stratified ``lgb.cv`` on the training part.

    Parameters
    ----------
    file : str
        Dataset file name without the ``.csv`` extension.
    cat_feat : list
        Column names to cast to int and declare as categorical features.

    Returns
    -------
    tuple
        ``(params, X_train, X_test, y_train, y_test)`` where ``params`` is the
        best parameter dict found by the optimiser (``max_depth`` cast to int).
    """
    df = pd.read_csv('./Data/'+file+'.csv')
    for col in cat_feat:
        # Builtin int instead of np.int: the alias was deprecated in NumPy 1.20
        # and removed in 1.24, where it raises AttributeError.
        df[col] = df[col].astype(int)
    df.drop(df.filter(regex="Unname"), axis=1, inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(df.drop('label', axis=1),
                                                    df['label'], test_size=0.25)
    train_data = lgb.Dataset(data=X_train, label=y_train, free_raw_data=False)

    # lgb.cv expects a flat list of column names/indices (or 'auto'); the list
    # used to be wrapped in a second list, which is not a valid value.
    categorical = list(cat_feat) if cat_feat else 'auto'

    def lgb_evaluate(max_depth, feature_fraction, min_split_gain, learning_rate, num_iterations):
        """Objective: best mean cross-validated AUC for one parameter set."""
        # The optimiser proposes floats, so the integer-valued parameters are cast.
        params = {'metric': 'auc',
                  'max_depth': int(max_depth),
                  'feature_fraction': feature_fraction,
                  'min_split_gain':min_split_gain,
                  'learning_rate':learning_rate,
                  'num_iterations':int(num_iterations),
                  'boosting':'goss'}

        cv_result = lgb.cv(params, train_data, nfold=10, stratified=True, verbose_eval =False, metrics=['auc'],
                           categorical_feature=categorical)
        return max(cv_result['auc-mean'])

    lgb_bo = BayesianOptimization(lgb_evaluate, {'max_depth': (2, 14),
                                                 'feature_fraction': (0.01, 1),
                                                 'min_split_gain': (0, 10),
                                                 'learning_rate': (0.01, 1),
                                                 'num_iterations': (16, 1000)})

    lgb_bo.maximize(init_points=8, n_iter=10, acq='ei')

    params = lgb_bo.max['params']
    params['max_depth'] = int(params['max_depth'])

    return params, X_train, X_test, y_train, y_test

In [6]:
def train_lgb(file, cat_feat):
    """Tune, train and evaluate LightGBM on one dataset file.

    Runs ``optimize_lgb`` to get the best parameters and the train/test split,
    fits a single ``LGBMClassifier`` on the training part, and reports its AUC
    on the test part via ``report_1``.

    Parameters
    ----------
    file : str
        Dataset file name without extension, forwarded to ``optimize_lgb``.
    cat_feat : list
        Names of the columns LightGBM should treat as categorical.

    Returns
    -------
    tuple
        ``(auc, elapsed_seconds, params)``.
    """
    start = timeit.default_timer()
    print("Model Bayesian Optimization")
    params, X_train, X_test, y_train, y_test = optimize_lgb(file, cat_feat)

    # NOTE(review): optimize_lgb tunes with 'boosting': 'goss', while this model
    # uses LightGBM's default boosting type - confirm which one is intended.
    model = LGBMClassifier(
        learning_rate = params['learning_rate'],
        min_split_gain = params['min_split_gain'],
        max_depth = int(params['max_depth']),
        feature_fraction = params['feature_fraction'],
        num_iterations = int(params['num_iterations']))

    # Categorical columns go to fit() as `categorical_feature`. The previous
    # `cat_features=` constructor keyword is CatBoost's name and is not a
    # LightGBM parameter, so the columns were never treated as categorical.
    categorical = list(cat_feat) if cat_feat else 'auto'

    model.fit(X_train, y_train, eval_metric='auc',
              verbose=1,
              categorical_feature=categorical)

    print("Model Train and Test AUC Report")
    auc = report_1(model, X_test, y_test)

    stop = timeit.default_timer()
    elapsed = stop - start
    print(f'Model Training Time: {elapsed} s')

    return auc, elapsed, params

In [7]:
def train(versions, dataset, model, cat_feat):
    """Run one model family over every version of a dataset and save a summary CSV.

    For ``i`` in ``1..versions`` the dataset ``<dataset>_<i>`` is tuned and
    trained with the selected model. Per-run AUC, wall time and best parameters
    are collected, column-wise means (and the AUC standard deviation) are
    appended, and the table is written to
    ``./Results/<dataset>_<model>R1test.csv``.

    Parameters
    ----------
    versions : int
        Number of dataset versions to process (must be >= 1).
    dataset : str
        Dataset base name; the version number is appended as ``_<i>``.
    model : str
        ``'xg'``, ``'cat'`` or ``'lgb'``.
    cat_feat : list
        Categorical column names forwarded to ``train_cat`` / ``train_lgb``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported keys or ``versions`` < 1.
    """
    import os  # local import: only needed to create the output directory

    # Fail before any training starts. An unknown model key used to surface as
    # an unbound-variable error, or silently reuse the previous run's values.
    if model not in ('xg', 'cat', 'lgb'):
        raise ValueError("model must be one of 'xg', 'cat', 'lgb', got %r" % (model,))
    if versions < 1:
        raise ValueError("versions must be >= 1, got %r" % (versions,))

    # Create the output directory up front so the final to_csv cannot fail
    # after all the training has already been done.
    os.makedirs('./Results', exist_ok=True)

    results = []
    for version in range(1, versions + 1):
        name = dataset + '_' + str(version)

        # Plain if/elif keeps the trainer names lazily resolved.
        if model == 'xg':
            # NOTE(review): train_xgboost is not defined in the visible part of
            # this notebook - confirm it exists before using 'xg'.
            auc, time, params = train_xgboost(name)
        elif model == 'cat':
            auc, time, params = train_cat(name, cat_feat)
        else:
            auc, time, params = train_lgb(name, cat_feat)

        row = {'AUC': auc, 'Dataset': name, 'Time in sec': time}
        for key, value in params.items():
            row['params ' + key] = value
        results.append(row)

        # Clear the per-run training output before the next version starts.
        IPython.display.clear_output()

    print(results)
    results = pd.DataFrame(results)
    print(results)

    # Summary statistics are broadcast into every row of the table.
    results['auc_mean'] = results['AUC'].mean()
    results['auc_std' ] = results['AUC']. std()
    for column in list(results):
        if 'params' in column:
            results[column + ' mean'] = results[column].mean()

    results['time_mean'] = results['Time in sec'].mean()
    results.to_csv('./Results/'+dataset+'_'+model+'R1test.csv')

## A1_100 Train

In [8]:
#train(20, 'A1_100', 'xg',[])

In [9]:
#train(20, 'A1_100', 'cat', [])

In [10]:
#train(20, 'A1_100', 'lgb', [])

## A1_1000 Train

In [11]:
#train(20, 'A1_1000', 'xg',[])

In [12]:
#train(20, 'A1_1000', 'cat', [])

In [13]:
#train(20, 'A1_1000', 'lgb', [])

## A1_10000 Train

In [14]:
#train(20, 'A1_10000', 'xg',[])

In [15]:
#train(20, 'A1_10000', 'cat', [])

In [16]:
#train(20, 'A1_10000', 'lgb', [])

## A2_100 Train

In [17]:
#train(20, 'A2_100', 'xg',[])

In [18]:
#train(20, 'A2_100', 'cat',[])

In [19]:
#train(20, 'A2_100', 'lgb',[])

## A2_1000 Train

In [20]:
#train(20, 'A2_1000', 'xg',[])

In [21]:
#train(20, 'A2_1000', 'cat',[])

In [22]:
#train(20, 'A2_1000', 'lgb',[])

## A2_10000 Train

In [23]:
#train(20, 'A2_10000', 'xg',[])

In [24]:
#train(3, 'A2_10000', 'cat',[])

In [25]:
#train(3, 'A2_10000', 'lgb',[])

## A3_100 Train

In [26]:
#train(3, 'A3_100', 'xg',[])

In [27]:
#train(3, 'A3_100', 'cat',[])

In [28]:
#train(3, 'A3_100', 'lgb',[])

## A3_1000 Train

In [29]:
#train(3, 'A3_1000', 'xg',[])

In [30]:
#train(3, 'A3_1000', 'cat',[])

In [31]:
#train(3, 'A3_1000', 'lgb',[])

## A3_10000 Train

In [32]:
#train(3, 'A3_10000', 'xg',[])

In [33]:
#train(3, 'A3_10000', 'cat',[])

In [34]:
#train(3, 'A3_10000', 'lgb',[])

## B1_100 Train

In [35]:
#train(20, 'B1_100', 'xg',['6', '7'])

In [36]:
#train(20, 'B1_100', 'cat',['6', '7'])

In [37]:
#train(20, 'B1_100', 'lgb',['6', '7'])

## B1_1000 Train

In [38]:
#train(20, 'B1_1000', 'xg',['6', '7'])

In [39]:
#train(20, 'B1_1000', 'cat',['6', '7'])

In [40]:
#train(20, 'B1_1000', 'lgb',['6', '7'])

## B1_10000 Train

In [41]:
#train(20, 'B1_10000', 'xg',['6', '7'])

In [42]:
#train(20, 'B1_10000', 'cat',['6', '7'])

In [43]:
#train(20, 'B1_10000', 'lgb',['6', '7'])

## B1S_100

In [44]:
#train(20, 'B1S_100', 'xg',['6', '7'])

In [45]:
#train(20, 'B1S_100', 'cat',['6', '7'])

In [46]:
#train(20, 'B1S_100', 'lgb',['6', '7'])

## B1S_1000

In [47]:
#train(20, 'B1S_1000', 'xg',['6', '7'])

In [48]:
#train(20, 'B1S_1000', 'cat',['6', '7'])

In [49]:
#train(20, 'B1S_1000', 'lgb',['6', '7'])

## B1S_10000

In [50]:
#train(20, 'B1S_10000', 'xg',['6', '7'])

In [51]:
#train(20, 'B1S_10000', 'cat',['6', '7'])

In [52]:
#train(20, 'B1S_10000', 'lgb',['6', '7'])

## B2_100 Train

In [53]:
#train(3, 'B2_100', 'xg',['6', '7'])

In [54]:
#train(3, 'B2_100', 'cat',['6', '7'])

In [55]:
#train(3, 'B2_100', 'lgb',['6', '7'])

## B2_1000 Train

In [56]:
#train(3, 'B2_1000', 'xg',['6', '7'])

In [57]:
#train(3, 'B2_1000', 'cat',['6', '7'])

In [58]:
#train(3, 'B2_1000', 'lgb',['6', '7'])

## B2_10000 Train

In [59]:
#train(3, 'B2_10000', 'xg',['6', '7'])

In [60]:
#train(3, 'B2_10000', 'cat',['6', '7'])

In [61]:
#train(3, 'B2_10000', 'lgb',['6', '7'])

## B3_100 Train

In [62]:
#train(3, 'B3_100', 'xg',['6', '7'])

In [63]:
#train(3, 'B3_100', 'cat',['6', '7'])

In [64]:
#train(3, 'B3_100', 'lgb',['6', '7'])

## B3_1000 Train

In [65]:
#train(3, 'B3_1000', 'xg',['6', '7'])

In [66]:
#train(3, 'B3_1000', 'cat',['6', '7'])

In [67]:
#train(3, 'B3_1000', 'lgb',['6', '7'])

## B3_10000 Train

In [68]:
#train(3, 'B3_10000', 'xg',['6', '7'])

In [69]:
#train(3, 'B3_10000', 'cat',['6', '7'])

In [70]:
#train(3, 'B3_10000', 'lgb',['6', '7'])

## C1_100 Train

In [71]:
#train(20, 'C1_100', 'xg',['4', '5', '6', '7'])

In [72]:
#train(20, 'C1_100', 'cat',['4', '5', '6', '7'])

In [73]:
#train(20, 'C1_100', 'lgb',['4', '5', '6', '7'])

## C1_1000 Train

In [74]:
#train(20, 'C1_1000', 'xg',['4', '5', '6', '7'])

In [75]:
#train(20, 'C1_1000', 'cat',['4', '5', '6', '7'])

In [76]:
#train(20, 'C1_1000', 'lgb',['4', '5', '6', '7'])

## C1_10000 Train

In [77]:
#train(20, 'C1_10000', 'xg',['4', '5', '6', '7'])

In [78]:
#train(20, 'C1_10000', 'cat',['4', '5', '6', '7'])

In [79]:
#train(20, 'C1_10000', 'lgb',['4', '5', '6', '7'])

## C1S_100

In [80]:
#train(20, 'C1S_100', 'xg',['4', '5', '6', '7'])

In [81]:
#train(20, 'C1S_100', 'cat',['4', '5', '6', '7'])

In [82]:
#train(20, 'C1S_100', 'lgb',['4', '5', '6', '7'])

## C1S_1000

In [83]:
#train(20, 'C1S_1000', 'xg',['4', '5', '6', '7'])

In [84]:
#train(20, 'C1S_1000', 'cat',['4', '5', '6', '7'])

In [85]:
#train(20, 'C1S_1000', 'lgb',['4', '5', '6', '7'])

## C1S_10000

In [86]:
#train(20, 'C1S_10000', 'xg',['4', '5', '6', '7'])

In [87]:
#train(20, 'C1S_10000', 'cat',['4', '5', '6', '7'])

In [88]:
#train(20, 'C1S_10000', 'lgb',['4', '5', '6', '7'])

## C2_100 Train


In [89]:
#train(3, 'C2_100', 'xg',['4', '5', '6', '7'])

In [90]:
#train(3, 'C2_100', 'cat',['4', '5', '6', '7'])

In [91]:
#train(3, 'C2_100', 'lgb',['4', '5', '6', '7'])

## C2_1000 Train

In [92]:
#train(3, 'C2_1000', 'xg',['4', '5', '6', '7'])

In [93]:
#train(3, 'C2_1000', 'cat',['4', '5', '6', '7'])

In [94]:
#train(3, 'C2_1000', 'lgb',['4', '5', '6', '7'])

## C2_10000 Train

In [95]:
#train(3, 'C2_10000', 'xg',['4', '5', '6', '7'])

In [96]:
#train(3, 'C2_10000', 'cat',['4', '5', '6', '7'])

In [97]:
#train(3, 'C2_10000', 'lgb',['4', '5', '6', '7'])

## C3_100 Train

In [98]:
#train(3, 'C3_100', 'xg',['4', '5', '6', '7'])

In [99]:
#train(3, 'C3_100', 'cat',['4', '5', '6', '7'])

In [100]:
#train(3, 'C3_100', 'lgb',['4', '5', '6', '7'])

## C3_1000 Train

In [101]:
#train(3, 'C3_1000', 'xg',['4', '5', '6', '7'])

In [102]:
#train(3, 'C3_1000', 'cat',['4', '5', '6', '7'])

In [103]:
#train(3, 'C3_1000', 'lgb',['4', '5', '6', '7'])

## C3_10000 Train

In [104]:
#train(3, 'C3_10000', 'xg',['4', '5', '6', '7'])

In [105]:
#train(3, 'C3_10000', 'cat',['4', '5', '6', '7'])

In [106]:
#train(3, 'C3_10000', 'lgb',['4', '5', '6', '7'])