In [1]:
%%time
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.size'] = 18
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Number of cross-validation folds; passed to lgb.cv as `nfold`.
n_folds = 5
# Evaluation budget for the hyperparameter search; passed to fmin as `max_evals`.
MAX_EVALS= 100

Wall time: 2.18 s


In [2]:
# Load the raw table, keep a reproducible 100,000-row sample, and retain only
# the numeric columns.
# NOTE(review): the CSV path below is an absolute, machine-specific location;
# the notebook will not run elsewhere until it is pointed at a local copy.
features = (
    pd.read_csv('C:/Users/Richard Cheung/Desktop/Personal Project/training_data2.csv')
    .sample(n=100000, random_state=42)
    .select_dtypes('number')
)

# Separate the target (as a flat int32 array) from the predictor columns.
labels = np.array(features['target'].astype(np.int32)).reshape((-1,))
features = features.drop(columns=['target'])

# Hold out 30,000 rows for testing; the remaining rows are used for training.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=30000,
    random_state=50,
)

print('Training features shape: ', train_features.shape)
print('Testing features shape: ', test_features.shape)

Training features shape:  (70000, 127)
Testing features shape:  (30000, 127)


In [3]:
# LightGBM regressor with library defaults (only the random state is fixed);
# used below as the untuned baseline.
model = lgb.LGBMRegressor(random_state = 50)

# LightGBM Dataset wrappers around the split data. `train_set` is the object
# handed to lgb.cv in the cells below.
train_set = lgb.Dataset(data = train_features, label = train_labels)
# NOTE(review): `test_set` is not referenced by any other visible cell.
test_set = lgb.Dataset(data = test_features, label = test_labels)

In [4]:
# Baseline: cross-validate the regressor's default parameters.
hyperparameters = model.get_params()

# The number of boosting rounds is chosen by early stopping inside lgb.cv,
# so the estimator's own n_estimators setting is dropped.
del hyperparameters['n_estimators']

cv_results = lgb.cv(hyperparameters,
                    train_set,
                    num_boost_round = 10000,
                    early_stopping_rounds = 100,
                    metrics = 'rmse',
                    verbose_eval = False,
                    nfold = n_folds,
                    seed = 42)

# The length of the returned history is used as the selected number of rounds,
# so its last entry is the cross-validated score at that round.
best = cv_results['rmse-mean'][-1]

best_std = cv_results['rmse-stdv'][-1]

# RMSE is an error (lower is better): the reported value is the minimum CV
# error, not a maximum.
print('The minimum validation RMSE was: {:.5f} with a standard deviation of {:.5f}.'.format(best, best_std))
print('The optimal number of boosting rounds (estimators) was {}.'.format(len(cv_results['rmse-mean'])))

The maximum validation RMSE was: 2601.02069 with a standard deviation of 204.35216.
The optimal number of boosting rounds (estimators) was 519.


In [5]:
# Baseline before any hyperparameter tuning: refit the default model with the
# number of boosting rounds selected by cross-validation.
model.n_estimators = len(cv_results['rmse-mean'])

# fit() returns the estimator itself, so training and prediction can be chained.
preds = model.fit(train_features, train_labels).predict(test_features)

# Test-set RMSE of the untuned model; the tuned models are compared against it.
baseline_rmse = np.sqrt(mean_squared_error(y_true=test_labels, y_pred=preds))

print('The baseline model scores {:.5f} RMSE on the test set.'.format(baseline_rmse))

The baseline model scores 2599.90545 RMSE on the test set.


In [6]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(hyperparameters):
    """Hyperopt objective: cross-validated RMSE of LightGBM for one configuration.

    Parameters
    ----------
    hyperparameters : dict
        One sample from the search space. ``hyperparameters['boosting_type']``
        is itself a dict holding the actual ``'boosting_type'`` string and,
        optionally, a ``'subsample'`` value. The dict is not modified.

    Returns
    -------
    dict
        ``'loss'`` (last entry of the CV ``rmse-mean`` history),
        ``'hyperparameters'`` (the flattened parameters actually used, plus
        ``'n_estimators'``), ``'iteration'``, ``'train_time'`` (seconds spent
        in ``lgb.cv``) and ``'status'``.

    Side effects
    ------------
    Increments the module-level ``ITERATION`` counter and appends one row
    (loss, hyperparameters, iteration, run time) to the CSV at ``OUT_FILE``.
    """
    global ITERATION

    ITERATION += 1

    # Work on a shallow copy so the caller's sample is left untouched and the
    # same sample can be evaluated more than once.
    params = dict(hyperparameters)

    # The number of boosting rounds is chosen by early stopping below.
    params.pop('n_estimators', None)

    # Flatten the nested choice: lift the real boosting type and its subsample
    # ratio (1.0 when the chosen branch does not define one) to the top level.
    boosting_choice = params['boosting_type']
    subsample = boosting_choice.get('subsample', 1.0)
    params['boosting_type'] = boosting_choice['boosting_type']
    params['subsample'] = subsample
    # NOTE(review): LightGBM documents that bagging (`subsample`) is only
    # applied when `subsample_freq` is non-zero; no frequency is set here, so
    # confirm that the sampled `subsample` value actually has an effect.

    # These values arrive as floats from the search space but must be integers.
    for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
        params[parameter_name] = int(params[parameter_name])

    start = timer()

    cv_results = lgb.cv(params,
                        train_set,
                        num_boost_round = 10000,
                        nfold = n_folds,
                        early_stopping_rounds = 100,
                        metrics = 'rmse',
                        seed = 50)

    run_time = timer() - start

    # The history length is taken as the selected number of rounds, and its
    # last entry as the score for this configuration.
    best_score = cv_results['rmse-mean'][-1]
    n_estimators = len(cv_results['rmse-mean'])

    params['n_estimators'] = n_estimators

    # Append this trial to the log. The context manager closes the file even
    # if writing fails, and newline='' lets the csv module control line
    # endings (avoids doubled line breaks on Windows).
    with open(OUT_FILE, 'a', newline = '') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([best_score, params, ITERATION, run_time])

    return {'loss': best_score,
            'hyperparameters': params,
            'iteration': ITERATION,
            'train_time': run_time,
            'status': STATUS_OK}

In [7]:
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

In [8]:
# Toy search space illustrating the nested (conditional) structure: the
# choice of boosting type carries its own `subsample` setting, and the 'goss'
# branch pins it to 1.0.
# Both sampled branches reuse the label 'subsample' here; the full space
# defined in the next cell gives each branch its own label instead.
boosting_type = {'boosting_type': hp.choice('boosting_type',
                                            [{'boosting_type': 'gbdt', 'subsample': hp.uniform('subsample', 0.5, 1)},
                                             {'boosting_type': 'dart', 'subsample': hp.uniform('subsample', 0.5, 1)},
                                             {'boosting_type': 'goss', 'subsample': 1.0}])}

# Draw one random configuration to inspect the shape that objective() receives.
hyperparams = sample(boosting_type)
hyperparams

{'boosting_type': {'boosting_type': 'goss', 'subsample': 1.0}}

In [9]:
# Search space handed to hyperopt (used by sample() and fmin() below).
# 'boosting_type' is a nested choice: each branch is a dict with the real
# boosting type plus its subsample ratio, which objective() flattens.
# NOTE(review): the label 'gdbt_subsample' looks like a typo for 'gbdt'; it is
# only a label, but it will show up under that spelling in hyperopt's records.
space = {'boosting_type': hp.choice('boosting_type',
                                    [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
                                     {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                     {'boosting_type': 'goss', 'subsample': 1.0}]),
         # Quantized draws; objective() casts num_leaves, subsample_for_bin and
         # min_child_samples to int before training.
         'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
         # Log-uniform between 0.01 and 0.5.
         'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
         'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
         'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
         'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
         'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
         'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
         # NOTE(review): LightGBM documents is_unbalance as a classification
         # option; this notebook trains a regressor, so this dimension
         # presumably has no effect on the loss - confirm and consider removing.
         'is_unbalance': hp.choice('is_unbalance', [True, False])}

In [10]:
%%time

# Smoke test: start a fresh trial log, then evaluate the objective once on a
# single random draw from the search space.
OUT_FILE = 'bayes_test2.csv'

# Trial counter incremented by objective().
ITERATION = 0

headers = ['loss', 'hyperparameters', 'iteration', 'runtime']

# Truncate the log and write the header row. The context manager closes the
# file even if writing fails; newline='' lets the csv module control line
# endings (avoids doubled line breaks on Windows).
with open(OUT_FILE, 'w', newline = '') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(headers)

results = objective(sample(space))
print('The cross validation score = {:.5f}.'.format(results['loss']))
print('The optimal number of estimators was {}.'.format(results['hyperparameters']['n_estimators']))

The cross validation score = 2582.59087.
The optimal number of estimators was 2253.
Wall time: 9min 37s


In [11]:
from hyperopt import tpe
from hyperopt import Trials

# Handle to hyperopt's TPE suggestion function. The fmin call further down
# passes `tpe.suggest` directly rather than this name.
tpe_algorithm = tpe.suggest
# Trial history object; passed to fmin and read back afterwards via
# `trials.results`.
trials = Trials()

In [12]:
# Start a fresh trial log for the full search (this discards the row written
# by the single-sample smoke test above).
OUT_FILE = 'bayes_test2.csv'

# Trial counter incremented by objective().
ITERATION = 0

headers = ['loss', 'hyperparameters', 'iteration', 'runtime']

# Truncate the log and write the header row. The context manager closes the
# file even if writing fails; newline='' lets the csv module control line
# endings (avoids doubled line breaks on Windows).
with open(OUT_FILE, 'w', newline = '') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(headers)

In [15]:
%%time

from hyperopt import fmin

# Reset the trial counter that objective() increments. This is already
# module scope, so a plain assignment is enough (a `global` declaration at
# this level has no effect).
ITERATION = 0

# Run the TPE-driven search for MAX_EVALS evaluations of the objective. Every
# evaluation is recorded in `trials` and appended to the CSV log.
# NOTE(review): no random state is passed to fmin, so the sequence of sampled
# configurations is presumably not reproducible between runs - confirm.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=MAX_EVALS,
)

best

Wall time: 5h 39min 12s


In [16]:
# Rank the per-trial result dicts returned by objective() by ascending loss.
# Despite its name, `trials_dict` is a list of dicts.
trials_dict = sorted(trials.results, key = lambda x: x['loss'])
# Show only the first entry, i.e. the trial with the lowest loss.
trials_dict[:1]

[{'loss': 2577.860143720671,
  'hyperparameters': {'boosting_type': 'gbdt',
   'colsample_bytree': 0.6780349373400474,
   'is_unbalance': True,
   'learning_rate': 0.010388490944056066,
   'min_child_samples': 55,
   'num_leaves': 114,
   'reg_alpha': 0.05611822193404954,
   'reg_lambda': 0.9931142101637381,
   'subsample_for_bin': 200000,
   'subsample': 0.5006771377323579,
   'metric': 'rmse',
   'verbose': 1,
   'n_estimators': 2422},
  'iteration': 73,
  'train_time': 484.2325275866169,
  'status': 'ok'}]

In [17]:
# Reload the trial log written by objective(). This rebinds `results`, which
# earlier held the dict returned by the single smoke-test call.
results = pd.read_csv(OUT_FILE)

In [18]:
import ast

def evaluate(results, name):
    """Refit the best configuration from a search log and tabulate every trial.

    Parameters
    ----------
    results : pandas.DataFrame
        Search log with 'loss', 'hyperparameters' and 'iteration' columns.
        Entries of 'hyperparameters' may be dict literals stored as strings
        (as read back from the CSV log) or already-parsed dicts. The frame is
        not modified.
    name : str
        Label for the search method; used only in the printed messages.

    Returns
    -------
    pandas.DataFrame
        One row per trial, ordered by ascending loss: one column per
        hyperparameter plus 'iteration' and 'score' (the trial's loss).

    Raises
    ------
    ValueError
        If `results` has no rows.

    Notes
    -----
    Prints the lowest logged loss and the test-set RMSE of an LGBMRegressor
    refit with those hyperparameters. Reads the notebook-level
    `train_features`, `train_labels`, `test_features` and `test_labels`.
    """
    if len(results) == 0:
        raise ValueError('No search results to evaluate.')

    new_results = results.copy()

    # The CSV log stores each hyperparameter dict as its string repr; parse it
    # back, leaving values that are already dicts as they are.
    new_results['hyperparameters'] = new_results['hyperparameters'].map(
        lambda hyp: ast.literal_eval(hyp) if isinstance(hyp, str) else hyp)

    # Best (lowest-loss) trial first; the fresh 0..n-1 index is relied on below.
    new_results = new_results.sort_values('loss', ascending = True).reset_index(drop = True)

    print('The lowest RMSE value from {} was {:.5f} found on iteration {}.'.format(name, new_results.loc[0, 'loss'], new_results.loc[0, 'iteration']))

    # Retrain on the full training split with the best configuration and score
    # it on the held-out test split.
    hyperparameters = new_results.loc[0, 'hyperparameters']
    model = lgb.LGBMRegressor(**hyperparameters)

    model.fit(train_features, train_labels)

    preds = model.predict(test_features)

    print('RMSE score from {} on test data = {:.5f}.'.format(name, np.sqrt(mean_squared_error(test_labels, preds))))

    # Build the per-trial table in one step from the list of dicts instead of
    # appending row by row (quadratic, and DataFrame.append no longer exists
    # in pandas 2.0). Columns follow the order in which keys first appear, and
    # the frame gets the same 0..n-1 index as new_results.
    hyp_df = pd.DataFrame(new_results['hyperparameters'].tolist())

    hyp_df['iteration'] = new_results['iteration']
    hyp_df['score'] = new_results['loss']

    return hyp_df

In [19]:
%%time

# Refit the best logged configuration, report its test-set RMSE, and collect
# the per-trial hyperparameter table returned by evaluate().
bayes_results = evaluate(results, name = 'Bayesian')
# Display the returned frame (one row per logged trial).
bayes_results

The lowest RMSE value from Bayesian was 2577.86014 found on iteration 73.
RMSE score from Bayesian on test data = 2573.11412.
Wall time: 2min 14s
