In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

n_folds = 5
MAX_EVALS = 10

In [2]:
features = pd.read_csv('C:/Users/Richard Cheung/Desktop/Personal Project/training_data2.csv')

features = features.sample(n = 30000, random_state = 42)

features = features.select_dtypes('number')

labels = np.array(features['target'].astype(np.int32)).reshape((-1, ))
features = features.drop(columns = ['target'])

train_features, test_features, train_labels, test_labels = train_test_split(features,
                                                                            labels,
                                                                            test_size = 10000,
                                                                            random_state = 50)

print('Training features shape: ', train_features.shape)
print('Testing features shape: ', test_features.shape)

Training features shape:  (20000, 127)
Testing features shape:  (10000, 127)


In [3]:
train_set = lgb.Dataset(data = train_features, label = train_labels)
test_set = lgb.Dataset(data = test_features, label = test_labels)

In [5]:
model = lgb.LGBMRegressor()
default_params = model.get_params()

del default_params['n_estimators']

cv_results = lgb.cv(default_params,
                    train_set, 
                    num_boost_round = 10000,
                    early_stopping_rounds = 100,
                    metrics = 'rmse',
                    nfold = n_folds,
                    seed = 42)

print('The maximum validation RMSE was: {:.5f} with a standard deviation of {:.5f}.'.format(cv_results['rmse-mean'][-1],
                                                                                           cv_results['rmse-stdv'][-1]))
print('The optimal number of boosting rounds (estimators) was {}.'.format(len(cv_results['rmse-mean'])))

The maximum validation RMSE was: 2481.05874 with a standard deviation of 525.89965.
The optimal number of boosting rounds (estimators) was 85.


In [6]:
from sklearn.metrics import mean_squared_error

In [8]:
model.n_estimators = len(cv_results['rmse-mean'])

model.fit(train_features, train_labels)
preds = model.predict(test_features)
baseline_rmse = np.sqrt(mean_squared_error(test_labels, preds))

print('The baseline model scores {:.5f} RMSE on the test set.'.format(baseline_rmse))

# This is the baseline score before hyperparameter tuning

The baseline model scores 2731.27568 RMSE on the test set.


In [9]:
def objective(hyperparameters, iteration):
    
    if 'n_estimators' in hyperparameters.keys():
        del hyperparameters['n_estimators']
        
    cv_results = lgb.cv(hyperparameters,
                        train_set,
                        num_boost_round = 10000, 
                        nfold = n_folds,
                        early_stopping_rounds = 100,
                        metrics = 'rmse',
                        seed = 42)
    
    score = cv_results['rmse-mean'][-1]
    estimators = len(cv_results['rmse-mean'])
    hyperparameters['n_estimators'] = estimators
    
    return [score, hyperparameters, iteration]

In [10]:
score, params, iteration = objective(default_params, 1)

print('The cross-validation RMSE was {:.5f}.'.format(score))

The cross-validation RMSE was 2481.05874.


In [11]:
model = lgb.LGBMModel()
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [12]:
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base= 10, num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10)),
    'subsample': list(np.linspace(0.5, 1, 100)),
    'is_unbalance': [True, False]
}

In [13]:
import random

random.seed(50)

boosting_type = random.sample(param_grid['boosting_type'], 1)[0]

subsample = 1.0 if boosting_type =='goss' else random.sample(param_grid['subsample'], 1)[0]

print('boosting type: ', boosting_type)
print('Subsample ratio: ', subsample)

boosting type:  goss
Subsample ratio:  1.0


In [14]:
random_results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                              index = list(range(MAX_EVALS)))

grid_results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                            index = list(range(MAX_EVALS)))

In [15]:
import itertools

def grid_search(param_grid, max_evals = MAX_EVALS):
    
    results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                           index = list(range(max_evals)))
    
    keys, values = zip(*param_grid.items())
    
    i = 0
    
    for v in itertools.product(*values):
        
        hyperparameters = dict(zip(keys, v))
        
        hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']
        
        eval_results = objective(hyperparameters, i)
        
        results.loc[i, :] = eval_results
        
        i += 1
        
        if i > max_evals:
            break
            
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    
    return results

In [16]:
grid_results = grid_search(param_grid)

print('The best validation score was {:.5f}'.format(grid_results.loc[0, 'score']))
print('\nThe best hyperparameters were:')

import pprint
pprint.pprint(grid_results.loc[0, 'params'])

The best validation score was 2466.78647

The best hyperparameters were:
{'boosting_type': 'gbdt',
 'colsample_bytree': 0.6,
 'is_unbalance': True,
 'learning_rate': 0.004999999999999999,
 'metric': 'rmse',
 'min_child_samples': 20,
 'n_estimators': 4599,
 'num_leaves': 20,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 0.5,
 'subsample_for_bin': 20000,
 'verbose': 1}


In [18]:
grid_search_params = grid_results.loc[0, 'params']

model = lgb.LGBMRegressor(**grid_search_params, random_state = 42)
model.fit(train_features, train_labels)

preds = model.predict(test_features)

print('The best model from grid search scores {:.5f} MAE on the test set.'.format(np.sqrt(mean_squared_error(test_labels, preds))))

The best model from grid search scores 2711.55669 MAE on the test set.


In [19]:
random.seed(50)

random_params = {k: random.sample(v, 1) for k, v in param_grid.items()}

random_params['subsmaple'] = 1.0 if random_params['boosting_type'] == 'goss' else random_params['subsample']

random_params

{'boosting_type': ['goss'],
 'num_leaves': [88],
 'learning_rate': [0.027778881111994384],
 'subsample_for_bin': [220000],
 'min_child_samples': [175],
 'reg_alpha': [0.8979591836734693],
 'reg_lambda': [0.6122448979591836],
 'colsample_bytree': [0.8222222222222222],
 'subsample': [0.5505050505050505],
 'is_unbalance': [False],
 'subsmaple': [0.5505050505050505]}

In [21]:
def random_search(param_grid, max_evals = MAX_EVALS):
    
    results = pd.DataFrame(columns = ['score', 'params', 'iteration'],
                           index = list(range(max_evals)))
    
    for i in range(max_evals):
        
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']
        
        eval_results = objective(hyperparameters, i)
        
        results.loc[i, :] = eval_results
        
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    return results

In [22]:
random_results = random_search(param_grid)

print('The best validation score was {:.5f}'.format(random_results.loc[0, 'score']))
print('\nThe best hyperparameters were:')

import pprint
pprint.pprint(random_results.loc[0, 'params'])

The best validation score was 3978.54230

The best hyperparameters were:
{'boosting_type': 'dart',
 'colsample_bytree': 0.6444444444444444,
 'is_unbalance': False,
 'learning_rate': 0.005116582891651224,
 'metric': 'rmse',
 'min_child_samples': 235,
 'n_estimators': 57,
 'num_leaves': 130,
 'reg_alpha': 0.3469387755102041,
 'reg_lambda': 0.5510204081632653,
 'subsample': 0.6262626262626263,
 'subsample_for_bin': 80000,
 'verbose': 1}


In [23]:
random_search_params = random_results.loc[0, 'params']

model = lgb.LGBMRegressor(**random_search_params, random_state = 42)
model.fit(train_features, train_labels)

preds = model.predict(test_features)

print('The best model from random search scores {:.5f} MAE on the test set.'.format(np.sqrt(mean_squared_error(test_labels, preds))))

The best model from random search scores 6110.27897 MAE on the test set.
