In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna, joblib

In [3]:
sc = StandardScaler()

In [4]:
### create an objective function 
def objective(
    trial #this is the ONLY arguement is default and a must
    ):
    
    #get data
    df_train = pd.read_csv('train_data.csv')
    df_test = pd.read_csv('test_data.csv')
              
    #separate X and y
    X_train, y_train = df_train.iloc[:,:-1], df_train.iloc[:,-1]
    X_test, y_test = df_test.iloc[:,:-1], df_test.iloc[:,-1]
    
    X_train_sc = sc.fit_transform(X_train)
    X_test_sc = sc.transform(X_test)
    
    dtrain = lgb.Dataset(X_train_sc, label = y_train)
    dvalid = lgb.Dataset(X_test_sc, label = y_test)
    
    # in optuna, setting of parameters is as follows:
    '''
    dictionary = {
    hyperparameter_alias: trial.suggest_distribution(
                           'hyperparameter_alias',
                           ['distribution values]'
                           )
    }    
    
    distribution to choose from:
    
    uniform — float values
    loguniform — float values
    discrete_uniform — float values with intervals
    int — integer values
    categorical — categorical values from a list
    '''
    
    #example:
    params = {
        'task': 'train',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'rf']),
        'metric': 'rmse',
        'objective': trial.suggest_categorical('objective', ['regression_l1', 'regression_l2']),
        'verbosity': -1,
        "seed": 42,
        "learning_rate": trial.suggest_loguniform('learning_rate', 0.05, 1),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }        

    model = lgb.train(
            params, 
            dtrain, 
            valid_names = ['eval', 'train'], 
            valid_sets = [dvalid, dtrain],
            early_stopping_rounds = 500,
            num_boost_round  = 10000,
            )

    joblib.dump(model, 'OPTIMIZED_MODEL.sav') #save model for future use

    prediction = model.predict(sc.transform(X_test))
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    return rmse

In [5]:
#optimize hyper param
study = optuna.create_study(direction = 'minimize') # minimize for regression, maximize for classification
study.optimize(objective, n_trials = 5)

[32m[I 2021-03-03 13:43:03,578][0m A new study created in memory with name: no-name-8b731ed8-d452-4bbe-8a3e-1bfcf5056f26[0m


[1]	train's rmse: 4.30492	eval's rmse: 4.25264
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 3.79221	eval's rmse: 3.7
[3]	train's rmse: 3.7318	eval's rmse: 3.69399
[4]	train's rmse: 3.65663	eval's rmse: 3.53859
[5]	train's rmse: 3.63552	eval's rmse: 3.55547
[6]	train's rmse: 3.55685	eval's rmse: 3.42931
[7]	train's rmse: 3.51073	eval's rmse: 3.44739
[8]	train's rmse: 3.48488	eval's rmse: 3.47028
[9]	train's rmse: 3.47018	eval's rmse: 3.46382
[10]	train's rmse: 3.38719	eval's rmse: 3.38054
[11]	train's rmse: 3.36519	eval's rmse: 3.37208
[12]	train's rmse: 3.3464	eval's rmse: 3.3619
[13]	train's rmse: 3.35035	eval's rmse: 3.38747
[14]	train's rmse: 3.36689	eval's rmse: 3.38354
[15]	train's rmse: 3.3328	eval's rmse: 3.37873
[16]	train's rmse: 3.33676	eval's rmse: 3.41183
[17]	train's rmse: 3.30101	eval's rmse: 3.38945
[18]	train's rmse: 3.27781	eval's rmse: 3.35588
[19]	train's rmse: 3.27855	eval's rmse: 3.36561
[20]	train's rmse: 3.2913	eval's rmse: 3.37

[32m[I 2021-03-03 13:43:05,774][0m Trial 0 finished with value: 3.11394714926405 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l1', 'learning_rate': 0.3608007518402737, 'lambda_l1': 4.573331878241756e-06, 'lambda_l2': 0.05043955135391725, 'num_leaves': 182, 'feature_fraction': 0.20608213089466954, 'bagging_fraction': 0.20571599131246973, 'bagging_freq': 1, 'min_child_samples': 15}. Best is trial 0 with value: 3.11394714926405.[0m



[509]	train's rmse: 2.27375	eval's rmse: 3.29917
[510]	train's rmse: 2.27679	eval's rmse: 3.30273
[511]	train's rmse: 2.27432	eval's rmse: 3.27531
[512]	train's rmse: 2.27456	eval's rmse: 3.27779
[513]	train's rmse: 2.28156	eval's rmse: 3.27844
[514]	train's rmse: 2.2881	eval's rmse: 3.28523
[515]	train's rmse: 2.29171	eval's rmse: 3.30481
[516]	train's rmse: 2.29758	eval's rmse: 3.3085
[517]	train's rmse: 2.2943	eval's rmse: 3.29518
[518]	train's rmse: 2.28362	eval's rmse: 3.28439
[519]	train's rmse: 2.28108	eval's rmse: 3.27806
[520]	train's rmse: 2.28115	eval's rmse: 3.29097
[521]	train's rmse: 2.26853	eval's rmse: 3.30302
[522]	train's rmse: 2.26523	eval's rmse: 3.30486
[523]	train's rmse: 2.26115	eval's rmse: 3.29785
[524]	train's rmse: 2.25089	eval's rmse: 3.2805
[525]	train's rmse: 2.2491	eval's rmse: 3.27443
[526]	train's rmse: 2.2495	eval's rmse: 3.27329
[527]	train's rmse: 2.25073	eval's rmse: 3.27326
[528]	train's rmse: 2.24801	eval's rmse: 3.2771
[529]	train's rmse: 2.2483

[32m[I 2021-03-03 13:43:07,859][0m Trial 1 finished with value: 3.000848352306681 and parameters: {'boosting_type': 'rf', 'objective': 'regression_l2', 'learning_rate': 0.4935403497816631, 'lambda_l1': 1.4086862519593023e-08, 'lambda_l2': 0.00199419640664809, 'num_leaves': 233, 'feature_fraction': 0.45231156757587954, 'bagging_fraction': 0.8465408718524889, 'bagging_freq': 8, 'min_child_samples': 99}. Best is trial 1 with value: 3.000848352306681.[0m


[1]	train's rmse: 4.40007	eval's rmse: 4.33013
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 3.95065	eval's rmse: 3.90706
[3]	train's rmse: 3.69232	eval's rmse: 3.66591
[4]	train's rmse: 3.4726	eval's rmse: 3.4858
[5]	train's rmse: 3.26768	eval's rmse: 3.28706
[6]	train's rmse: 3.14059	eval's rmse: 3.17001
[7]	train's rmse: 3.0696	eval's rmse: 3.11792
[8]	train's rmse: 2.99295	eval's rmse: 3.04926
[9]	train's rmse: 2.92463	eval's rmse: 2.98873
[10]	train's rmse: 2.86345	eval's rmse: 2.9265
[11]	train's rmse: 2.83968	eval's rmse: 2.90442
[12]	train's rmse: 2.79307	eval's rmse: 2.85525
[13]	train's rmse: 2.76971	eval's rmse: 2.83487
[14]	train's rmse: 2.74666	eval's rmse: 2.82185
[15]	train's rmse: 2.72948	eval's rmse: 2.81206
[16]	train's rmse: 2.70103	eval's rmse: 2.77836
[17]	train's rmse: 2.68277	eval's rmse: 2.7653
[18]	train's rmse: 2.65217	eval's rmse: 2.73099
[19]	train's rmse: 2.63052	eval's rmse: 2.71798
[20]	train's rmse: 2.6044	eval's rmse: 2

[32m[I 2021-03-03 13:43:09,127][0m Trial 2 finished with value: 2.162389997711112 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l2', 'learning_rate': 0.21460539214075433, 'lambda_l1': 0.0005323187413301932, 'lambda_l2': 0.0007250248525641071, 'num_leaves': 201, 'feature_fraction': 0.4798157817124993, 'bagging_fraction': 0.718636691749577, 'bagging_freq': 16, 'min_child_samples': 93}. Best is trial 2 with value: 2.162389997711112.[0m


[1005]	train's rmse: 1.20315	eval's rmse: 2.19684
[1006]	train's rmse: 1.20296	eval's rmse: 2.19733
[1007]	train's rmse: 1.20281	eval's rmse: 2.19687
[1008]	train's rmse: 1.20211	eval's rmse: 2.19734
[1009]	train's rmse: 1.20017	eval's rmse: 2.20006
[1010]	train's rmse: 1.19882	eval's rmse: 2.20209
[1011]	train's rmse: 1.19813	eval's rmse: 2.20315
[1012]	train's rmse: 1.19807	eval's rmse: 2.20425
[1013]	train's rmse: 1.19738	eval's rmse: 2.20603
[1014]	train's rmse: 1.19706	eval's rmse: 2.2078
[1015]	train's rmse: 1.19707	eval's rmse: 2.20897
[1016]	train's rmse: 1.19716	eval's rmse: 2.20992
[1017]	train's rmse: 1.19676	eval's rmse: 2.21052
[1018]	train's rmse: 1.19632	eval's rmse: 2.21156
[1019]	train's rmse: 1.19629	eval's rmse: 2.21306
[1020]	train's rmse: 1.19597	eval's rmse: 2.21409
[1021]	train's rmse: 1.19577	eval's rmse: 2.2147
[1022]	train's rmse: 1.19551	eval's rmse: 2.21586
[1023]	train's rmse: 1.19574	eval's rmse: 2.21611
[1024]	train's rmse: 1.19543	eval's rmse: 2.21682
[1

[32m[I 2021-03-03 13:43:10,486][0m Trial 3 finished with value: 2.7066110962834014 and parameters: {'boosting_type': 'rf', 'objective': 'regression_l1', 'learning_rate': 0.0533014032754706, 'lambda_l1': 0.0002463248716692248, 'lambda_l2': 0.3753937520568709, 'num_leaves': 32, 'feature_fraction': 0.610420768799961, 'bagging_fraction': 0.4228186601366216, 'bagging_freq': 20, 'min_child_samples': 30}. Best is trial 2 with value: 2.162389997711112.[0m


[872]	train's rmse: 2.72737	eval's rmse: 2.71504
[873]	train's rmse: 2.72735	eval's rmse: 2.71515
[874]	train's rmse: 2.72719	eval's rmse: 2.71493
[875]	train's rmse: 2.72667	eval's rmse: 2.71452
[876]	train's rmse: 2.72648	eval's rmse: 2.71449
[877]	train's rmse: 2.72657	eval's rmse: 2.71465
[878]	train's rmse: 2.72664	eval's rmse: 2.71479
[879]	train's rmse: 2.72645	eval's rmse: 2.71456
[880]	train's rmse: 2.72627	eval's rmse: 2.71454
[881]	train's rmse: 2.72646	eval's rmse: 2.7147
[882]	train's rmse: 2.72623	eval's rmse: 2.71474
[883]	train's rmse: 2.72636	eval's rmse: 2.71486
[884]	train's rmse: 2.72653	eval's rmse: 2.71495
[885]	train's rmse: 2.72662	eval's rmse: 2.7151
[886]	train's rmse: 2.7268	eval's rmse: 2.71527
[887]	train's rmse: 2.7266	eval's rmse: 2.71539
[888]	train's rmse: 2.72679	eval's rmse: 2.71557
[889]	train's rmse: 2.72659	eval's rmse: 2.71569
[890]	train's rmse: 2.72685	eval's rmse: 2.71601
[891]	train's rmse: 2.72694	eval's rmse: 2.71614
[892]	train's rmse: 2.72

[32m[I 2021-03-03 13:43:11,546][0m Trial 4 finished with value: 2.709447620367779 and parameters: {'boosting_type': 'rf', 'objective': 'regression_l2', 'learning_rate': 0.43185654174160837, 'lambda_l1': 0.08854477672343464, 'lambda_l2': 0.4379822702287925, 'num_leaves': 25, 'feature_fraction': 0.8678117552962211, 'bagging_fraction': 0.8789645639159706, 'bagging_freq': 3, 'min_child_samples': 51}. Best is trial 2 with value: 2.162389997711112.[0m


[541]	train's rmse: 2.62761	eval's rmse: 2.7188
[542]	train's rmse: 2.62778	eval's rmse: 2.7189
[543]	train's rmse: 2.62779	eval's rmse: 2.71884
[544]	train's rmse: 2.62797	eval's rmse: 2.7189
[545]	train's rmse: 2.62789	eval's rmse: 2.71878
[546]	train's rmse: 2.62785	eval's rmse: 2.71884
[547]	train's rmse: 2.62762	eval's rmse: 2.71889
[548]	train's rmse: 2.62736	eval's rmse: 2.71875
[549]	train's rmse: 2.62722	eval's rmse: 2.71864
[550]	train's rmse: 2.62697	eval's rmse: 2.71848
[551]	train's rmse: 2.62678	eval's rmse: 2.71837
[552]	train's rmse: 2.62659	eval's rmse: 2.71827
[553]	train's rmse: 2.62662	eval's rmse: 2.71832
[554]	train's rmse: 2.6267	eval's rmse: 2.71844
[555]	train's rmse: 2.62673	eval's rmse: 2.71849
[556]	train's rmse: 2.62685	eval's rmse: 2.71856
[557]	train's rmse: 2.62697	eval's rmse: 2.71864
[558]	train's rmse: 2.62693	eval's rmse: 2.71865
[559]	train's rmse: 2.62677	eval's rmse: 2.71854
[560]	train's rmse: 2.62667	eval's rmse: 2.71852
[561]	train's rmse: 2.62

In [6]:
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))

print('  Params: ')
for key, value in trial.params.items():
    print('    "{}": {},'.format(key, value))

Best trial:
  Value: 2.162389997711112
  Params: 
    "boosting_type": gbdt,
    "objective": regression_l2,
    "learning_rate": 0.21460539214075433,
    "lambda_l1": 0.0005323187413301932,
    "lambda_l2": 0.0007250248525641071,
    "num_leaves": 201,
    "feature_fraction": 0.4798157817124993,
    "bagging_fraction": 0.718636691749577,
    "bagging_freq": 16,
    "min_child_samples": 93,


In [7]:
model_tuned = joblib.load('OPTIMIZED_MODEL.sav') #call saved model

In [8]:
df_test = pd.read_csv('test_data.csv')
X_test, y_test = StandardScaler().fit_transform(df_test.iloc[:,:-1]), df_test.iloc[:,-1]

In [9]:
pred_test = model_tuned.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred_test))
rmse

2.803532282459713

In [10]:
r2_score(y_test, pred_test)

0.6169359185359566