In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from datetime import datetime
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from scipy.stats import randint as sp_randint
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [5]:
def average(df, column='f1', missing=-1):
    """Return the mean of ``df[column]`` over entries that are not ``missing``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str, default 'f1'
        Name of the column to average.
    missing : scalar, default -1
        Placeholder value that is excluded from the mean.

    Returns
    -------
    float
        Mean of the retained values. NaN entries are skipped as well. If no
        usable value remains (empty column, or every entry equals
        ``missing``) the result is NaN rather than a ZeroDivisionError.
    """
    values = df[column]
    # Boolean mask instead of a Python-level loop; also avoids shadowing the
    # builtin ``sum`` with a local accumulator.
    return values[values != missing].mean()


In [6]:
def results(model, X_train, X_valid, y_train, y_valid):
    """Print evaluation metrics for a fitted classifier.

    Printed, in order:
      1. ROC-AUC on the validation set, computed from column 1 of
         ``model.predict_proba(X_valid)``.
      2. Accuracy on the training set.
      3. Accuracy on the validation set.
      4. ``classification_report`` for the validation set.

    Nothing is returned.
    """
    # Column 1 of predict_proba is used as the score fed to roc_auc_score.
    valid_scores = model.predict_proba(X_valid)[:, 1]
    print(roc_auc_score(y_valid, valid_scores))

    # Training accuracy, printed next to the validation figure for comparison.
    print(accuracy_score(y_train, model.predict(X_train)))

    valid_labels = model.predict(X_valid)
    print(accuracy_score(y_valid, valid_labels))
    print(classification_report(y_valid, valid_labels))

In [7]:
# Load the training data and replace the -1 placeholder in 'f1' with the mean
# of the remaining 'f1' values.
df = pd.read_csv("train_final.csv")
f1_avg = average(df)
# Assign the result back rather than calling replace(..., inplace=True) on the
# column selection: in-place mutation of a selected column is chained
# assignment and may leave `df` unchanged under pandas Copy-on-Write.
df['f1'] = df['f1'].replace(-1, f1_avg)
df.describe()

Unnamed: 0,Id,Y,f1,f2,f3,f4,f5,f6,f7,f8,...,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24
count,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,...,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0,16383.0
mean,8192.0,0.942135,43031.41572,1.044375,11.770938,118323.581456,1.044436,0.050052,117089.674113,169730.1786,...,25894.316914,119045.099005,184622.040835,1.047305,125959.667765,1.044558,1.045718,1.041934,32718.9,1.043948
std,4729.509065,0.233495,33596.053696,0.264806,353.187115,4518.059755,0.265601,0.293892,10261.29297,69396.677853,...,36086.993946,18321.987129,100590.811845,0.306239,31091.344158,0.262576,0.266874,0.246597,3184929.0,0.25964
min,1.0,0.0,37.0,1.0,1.77,23779.0,1.0,0.0,4292.0,4673.0,...,25.0,4674.0,3130.0,1.0,117879.0,1.0,1.0,1.0,1.0,1.0
25%,4096.5,1.0,20331.0,1.0,1.77,118096.0,1.0,0.0,117961.0,117906.0,...,4554.0,118395.0,118398.0,1.0,118274.0,1.0,1.0,1.0,1.0,1.0
50%,8192.0,1.0,35530.0,1.0,1.77,118300.0,1.0,0.0,117961.0,128130.0,...,13234.0,118929.0,119095.0,1.0,118568.0,1.0,1.0,1.0,2.0,1.0
75%,12287.5,1.0,74240.5,1.0,3.54,118386.0,1.0,0.0,117961.0,234498.5,...,38902.0,120539.0,290919.0,1.0,120006.0,1.0,1.0,1.0,9.0,1.0
max,16383.0,1.0,312152.0,7.0,43910.16,286791.0,9.0,10.0,311178.0,311867.0,...,311696.0,286792.0,308574.0,18.0,311867.0,8.0,8.0,7.0,404288600.0,8.0


In [8]:
# Target column and feature columns f1..f24.
y = df.loc[:, 'Y']
# Take an explicit copy so that rewriting 'f3' below modifies an independent
# frame instead of a slice of `df` (avoids SettingWithCopy ambiguity).
X = df.loc[:, 'f1':'f24'].copy()
# Store f3 as an integer number of hundredths. Round before casting:
# astype('int') truncates, and float products such as 0.29 * 100 evaluate to
# 28.999999999999996, which would otherwise become 28 instead of 29.
X['f3'] = (100 * X['f3']).round().astype('int')
# 75/25 shuffled hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=42, shuffle=True
)

In [9]:
from catboost import CatBoostClassifier, Pool

In [12]:
# Positional indices of the feature columns passed to CatBoost as categorical:
# every index in 0..23 except 0, 2 and 13.
# An earlier variant of this list additionally left out index 11.
cat_indices = [idx for idx in range(24) if idx not in (0, 2, 13)]

# NOTE(review): fit() below receives no eval_set, so early_stopping_rounds
# presumably never triggers (the recorded log runs all 680 iterations) — confirm.
# scale_pos_weight is deliberately left unset; 0.05785 was the value previously
# listed for it.
model_cat = CatBoostClassifier(
    cat_features=cat_indices,
    one_hot_max_size=10,
    n_estimators=680,
    learning_rate=0.0169,
    max_depth=8,
    subsample=0.66322,
    bagging_temperature=1.0,
    early_stopping_rounds=30,
    eval_metric='AUC',
    random_state=42,
    verbose=100,
)
model_cat.fit(X_train, y_train)
results(model_cat, X_train, X_valid, y_train, y_valid)

0:	total: 74.7ms	remaining: 50.7s
100:	total: 5.3s	remaining: 30.4s
200:	total: 13.2s	remaining: 31.5s
300:	total: 20.8s	remaining: 26.2s
400:	total: 30.8s	remaining: 21.4s
500:	total: 38.5s	remaining: 13.8s
600:	total: 45.1s	remaining: 5.93s
679:	total: 49.4s	remaining: 0us
0.9005485813902007
0.9724912509156018
0.96484375
              precision    recall  f1-score   support

           0       0.90      0.40      0.56       224
           1       0.97      1.00      0.98      3872

    accuracy                           0.96      4096
   macro avg       0.93      0.70      0.77      4096
weighted avg       0.96      0.96      0.96      4096



## Run Bayesian search CV

In [34]:
# Search space handed to BayesSearchCV: one skopt dimension per CatBoost
# hyperparameter. learning_rate and random_strength are sampled log-uniformly,
# the remaining dimensions use the skopt default prior.
params = dict(
    n_estimators=Integer(700, 1500),
    learning_rate=Real(0.01, 1.0, prior='log-uniform'),
    subsample=Real(0.4, 0.8),
    max_depth=Integer(5, 9),
    colsample_bylevel=Real(0.1, 0.8),
    bagging_temperature=Real(0.0, 1.0),
    l2_leaf_reg=Integer(2, 30),
    random_strength=Real(1e-9, 10, prior='log-uniform'),
)

In [35]:
# 3-fold stratified, shuffled CV splitter with a fixed seed; passed as `cv`
# to the Bayesian search.
skf = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)

In [36]:
# Positional indices of the feature columns passed to CatBoost as categorical:
# every index in 0..23 except 0, 2 and 13.
cat_indices = [idx for idx in range(24) if idx not in (0, 2, 13)]

# Base estimator for the hyperparameter search. n_estimators, learning_rate,
# subsample, max_depth and bagging_temperature are not fixed here because the
# search space in `params` supplies them; scale_pos_weight stays unset.
# NOTE(review): no eval_set is supplied anywhere in this notebook, so
# early_stopping_rounds presumably has no effect during the search — confirm.
model_cat_search = CatBoostClassifier(
    cat_features=cat_indices,
    one_hot_max_size=10,
    early_stopping_rounds=30,
    eval_metric='AUC',
    random_state=42,
    verbose=100,
)

In [37]:
# Bayesian hyperparameter search over `params`, scored by ROC-AUC on the
# folds produced by `skf`; refit=True refits the best configuration afterwards.
bayes_cv_tuner = BayesSearchCV(
    estimator=model_cat_search,
    search_spaces=params,
    scoring='roc_auc',
    cv=skf,
    n_iter=100,
    n_jobs=-1,
    refit=True,
    random_state=42,
)

In [38]:
def status_print(optim_result):
    """Status callback during Bayesian hyperparameter search.

    Prints how many candidates have been evaluated so far, together with the
    best score (rounded to 4 decimals) and the best parameters. All values are
    read from the module-level ``bayes_cv_tuner``.

    Parameters
    ----------
    optim_result : object
        Value supplied by the search on each callback invocation; unused.

    Returns
    -------
    None
    """
    # cv_results_['params'] holds one entry per evaluated candidate, so its
    # length is the model count; no need to build a DataFrame just to count.
    n_evaluated = len(bayes_cv_tuner.cv_results_['params'])

    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        n_evaluated,
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

In [39]:
# Run the search on the training split; status_print is passed as the
# callback so progress is printed while fitting.
results_bayes = bayes_cv_tuner.fit(
    X_train,
    y_train,
    callback=status_print,
)

Model #1
Best ROC-AUC: 0.8983
Best params: OrderedDict([('bagging_temperature', 0.41010395885331385), ('colsample_bylevel', 0.6094080202241275), ('l2_leaf_reg', 28), ('learning_rate', 0.042815319280763466), ('max_depth', 8), ('n_estimators', 1031), ('random_strength', 3.230824361824754e-06), ('subsample', 0.6958016936761682)])

Model #2
Best ROC-AUC: 0.8983
Best params: OrderedDict([('bagging_temperature', 0.41010395885331385), ('colsample_bylevel', 0.6094080202241275), ('l2_leaf_reg', 28), ('learning_rate', 0.042815319280763466), ('max_depth', 8), ('n_estimators', 1031), ('random_strength', 3.230824361824754e-06), ('subsample', 0.6958016936761682)])

Model #3
Best ROC-AUC: 0.8983
Best params: OrderedDict([('bagging_temperature', 0.41010395885331385), ('colsample_bylevel', 0.6094080202241275), ('l2_leaf_reg', 28), ('learning_rate', 0.042815319280763466), ('max_depth', 8), ('n_estimators', 1031), ('random_strength', 3.230824361824754e-06), ('subsample', 0.6958016936761682)])

Model #4
B

KeyboardInterrupt: 

Based on the performance of 40 iterations trained on a Colab GPU, Bayesian search CV isn't much help. Moving on to Optuna/Hyperopt.