In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier
from catboost import Pool

from bayes_opt import BayesianOptimization

In [None]:
# Load the train and test tables from ./data, using the `id` column as the row index.
train = pd.read_csv('./data/train.csv', index_col='id')
test = pd.read_csv('./data/test.csv', index_col='id')

In [None]:
def trainCatBoost(depth = 7, rate = 0.03, l2 = 3):
    """Cross-validate a CatBoost classifier and return its mean validation ROC AUC.

    The function reads the notebook-level globals ``X_train`` (feature frame)
    and ``y_train`` (target series); both must exist before it is called.
    Only the hyper-parameters are arguments, so the function can be passed
    directly to ``BayesianOptimization`` as the objective to maximize.

    Parameters
    ----------
    depth : int or float
        Tree depth. Rounded and cast to ``int`` before being given to
        CatBoost, because the optimizer proposes real-valued candidates.
    rate : float
        Learning rate (``learning_rate``).
    l2 : float
        L2 leaf regularization coefficient (``l2_leaf_reg``).

    Returns
    -------
    float
        Mean ROC AUC over the 6 stratified validation folds.

    Notes
    -----
    Each fold's validation part is used both as the ``eval_set`` (for
    best-iteration selection and the overfitting detector) and for scoring,
    so the returned value is an optimistic estimate.
    """
    nFolds = 6

    skf = StratifiedKFold(n_splits= nFolds, shuffle=True, random_state=42)

    # One working directory per hyper-parameter combination.
    dirName = '/tmp/porto/catboost/'+str(depth)+'_'+str(rate)+\
               '_'+str(l2)+'_'+str(1)+'_'+str(nFolds)
    try:
        # makedirs (not mkdir) because the parent directories may not exist yet.
        os.makedirs(dirName)
    except OSError as err:
        # An already existing directory is fine; anything else is reported
        # with the real error but stays best-effort, as before.
        if not os.path.isdir(dirName):
            print('Could not create %s: %s' % (dirName, err))

    featureNames = X_train.columns.tolist()
    # CatBoost's depth is an integer setting; np.round alone yields a float.
    treeDepth = int(np.round(depth))
    scoreList = list()

    # Folds are numbered from 1; the fold number doubles as the random seed.
    for seed, (train_index, val_index) in enumerate(skf.split(X_train, y_train), 1):

        # A fresh model per fold, so no state is shared between folds.
        model = CatBoostClassifier(verbose= False, iterations=1000, thread_count=4, eval_metric= "AUC",
                                   depth= treeDepth, learning_rate= rate,
                                   l2_leaf_reg= l2, od_type = 'Iter', od_wait = 20,
                                   # per-fold sub-directory inside dirName
                                   train_dir = os.path.join(dirName, str(seed)),
                                   random_seed= seed)

        trainPool = Pool(X_train.iloc[train_index], y_train.iloc[train_index], feature_names=featureNames)
        valPool = Pool(X_train.iloc[val_index], y_train.iloc[val_index], feature_names=featureNames)
        model.fit(trainPool, eval_set=valPool, use_best_model = True)

        # Probability of the positive class (second column).
        prob = model.predict_proba(valPool)[:,1]

        score = roc_auc_score(y_train.iloc[val_index], prob)
        scoreList.append(score)

        # Single-argument print works under both Python 2 and Python 3.
        print('Tree amount is  %s  ROC_AUC is %s' % (model.tree_count_, score))

    return np.array(scoreList).mean()

In [None]:
def dropCalcFeatures(df):
    """Return a new frame without the columns whose name contains '_calc_'.

    ``df`` itself is left untouched; ``DataFrame.drop`` builds the result.
    """
    calcColumns = [column for column in df.columns.tolist()
                   if column.find('_calc_') != -1]
    return df.drop(calcColumns, axis = 1)

In [None]:
# Target vector, and the feature matrix: every column except `target`,
# with the `_calc_` columns removed.
y_train = train['target']
X_train = dropCalcFeatures(train.drop(['target'], axis=1))

In [None]:
# Optimizer over the CatBoost hyper-parameters; each entry is a
# (lower, upper) bound for the matching trainCatBoost argument.
xgbBO = BayesianOptimization(
    f=trainCatBoost,
    pbounds={
        'depth': (6, 10),
        'rate': (0.01, 0.1),
        'l2': (1, 10),
    },
)

In [None]:
# Run the search: 2 initial points, then 3 optimization iterations,
# with the "poi" acquisition function and xi=0.1.
xgbBO.maximize(
    init_points=2,
    n_iter=3,
    acq="poi",
    xi=0.1,
)