In [1]:
# Reference - https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a

import os
import re
import json
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
import itertools
import csv
import pickle
import gc

from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score, classification_report
from tqdm import tqdm
from hyperopt import STATUS_OK, hp, tpe, Trials, fmin
from timeit import default_timer as timer

tqdm.pandas()

In [2]:
features = pd.read_csv('../input/grab-feature-stats/feature_stats.csv')
labels = pd.read_csv('../input/grab-safety/safety/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv')

labels.drop_duplicates(subset='bookingID', keep=False, inplace=True)
labels.reset_index(inplace=True, drop=True)

features.set_index('bookingID', inplace=True)
labels.set_index('bookingID', inplace=True)

data = pd.concat([features, labels], axis=1, join='inner')
col_names = list(data.columns[:-1])

X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

X[np.isnan(X)] = 0

del features, labels, data

gc.collect()

59

In [3]:
d_train = lgb.Dataset(X, label= y, feature_name=col_names)
d_train.save_binary('d_train.bin')
d_train = lgb.Dataset('../working/d_train.bin', feature_name=col_names)

In [4]:
def objective(params, n_folds = 5):
    """Objective function for Gradient Boosting Machine Hyperparameter Tuning"""
    
    global ITERATION
    
    ITERATION += 1
    start = timer()
    
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evalute based on ROC AUC
    cv_results = lgb.cv(params, 
                        d_train,                        # Training data
                        nfold = n_folds, 
                        num_boost_round = 2500, 
                        early_stopping_rounds = 50, 
                        metrics = 'auc', 
                        seed = 123, 
                        verbose_eval=100)
    
    run_time = timer() - start

    
    # Extract the best score
    best_score = max(cv_results['auc-mean'])     # error should be minimised
    
    # Loss must be minimized
    loss = - best_score
    
    # Boosting rounds that returned the highest cv score
    n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)    # error should be minimised

    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, ITERATION, n_estimators, run_time])

    pickle.dump(bayes_trials, open("grab_safety_trials.p", "wb"))
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'estimators': n_estimators, 
            'train_time': run_time, 'status': STATUS_OK}


In [5]:
space = {
    'boosting_type': ['gbdt'],
    'objective':['binary'],
    'subsample' : hp.uniform('subsample', 0.5, 1),
    'num_leaves': hp.choice('num_leaves', np.arange(2, 10, dtype=int)),
    'max_depth' : hp.choice('max_depth', np.arange(2, 8, dtype=int)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
    'subsample_for_bin': hp.choice('subsample_for_bin', np.arange(2000, 14000, 1000, dtype=int)),
    'min_child_samples': hp.choice('min_child_samples',np.arange(20, 500, 5, dtype=int)),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'is_unbalance': hp.choice('is_unbalance', [True, False])
}

In [6]:
global  ITERATION
ITERATION = 0

tpe_algorithm = tpe.suggest
bayes_trials = Trials()
MAX_EVALS = 888

out_file = 'grab_safety_lgbm_trials.csv'

In [7]:
best = fmin(fn = objective, space = space, algo = tpe.suggest, 
            max_evals = MAX_EVALS, trials = bayes_trials)

[100]	cv_agg's auc: 0.72592 + 0.00769454
[200]	cv_agg's auc: 0.730236 + 0.00855206
[300]	cv_agg's auc: 0.730486 + 0.00839582
[100]	cv_agg's auc: 0.700065 + 0.00609282
[200]	cv_agg's auc: 0.708848 + 0.00527962
[300]	cv_agg's auc: 0.714082 + 0.0054398
[400]	cv_agg's auc: 0.717242 + 0.00556945
[500]	cv_agg's auc: 0.719106 + 0.00600683
[600]	cv_agg's auc: 0.721024 + 0.00650475
[700]	cv_agg's auc: 0.722505 + 0.00695102
[800]	cv_agg's auc: 0.723678 + 0.0072471
[900]	cv_agg's auc: 0.724624 + 0.00743015
[1000]	cv_agg's auc: 0.725411 + 0.00768188
[1100]	cv_agg's auc: 0.726122 + 0.00787251
[1200]	cv_agg's auc: 0.726695 + 0.00793476
[1300]	cv_agg's auc: 0.727239 + 0.00799905
[1400]	cv_agg's auc: 0.72782 + 0.00797493
[1500]	cv_agg's auc: 0.728184 + 0.00809522
[1600]	cv_agg's auc: 0.728476 + 0.00813611
[1700]	cv_agg's auc: 0.728717 + 0.00819684
[1800]	cv_agg's auc: 0.729003 + 0.00821088
[1900]	cv_agg's auc: 0.729259 + 0.00815638
[2000]	cv_agg's auc: 0.729441 + 0.00825268
[2100]	cv_agg's auc: 0.7296