In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna, joblib

In [42]:
sc = StandardScaler()

In [43]:
### create an objective function 
def objective(
    trial #this is the ONLY arguement is default and a must
    ):
    
    #get data
    df_train = pd.read_csv('train_data.csv')
    df_test = pd.read_csv('test_data.csv')
              
    #separate X and y
    X_train, y_train = df_train.iloc[:,:-1], df_train.iloc[:,-1]
    X_test, y_test = df_test.iloc[:,:-1], df_test.iloc[:,-1]
    
    X_train_sc = sc.fit_transform(X_train)
    X_test_sc = sc.transform(X_test)
    
    dtrain = lgb.Dataset(X_train_sc, label = y_train)
    dvalid = lgb.Dataset(X_test_sc, label = y_test)
    
    # in optuna, setting of parameters is as follows:
    '''
    dictionary = {
    hyperparameter_alias: trial.suggest_distribution(
                           'hyperparameter_alias',
                           ['distribution values]'
                           )
    }    
    
    distribution to choose from:
    
    uniform — float values
    loguniform — float values
    discrete_uniform — float values with intervals
    int — integer values
    categorical — categorical values from a list
    '''
    
    #example:
    params = {
        'task': 'train',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'rf']),
        'metric': 'rmse',
        'objective': trial.suggest_categorical('objective', ['regression_l1', 'regression_l2']),
        'verbosity': -1,
        "seed": 42,
        "learning_rate": trial.suggest_loguniform('learning_rate', 0.05, 1),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }        

    model = lgb.train(
            params, 
            dtrain, 
            valid_names = ['eval', 'train'], 
            valid_sets = [dvalid, dtrain],
            early_stopping_rounds = 500,
            num_boost_round  = 10000,
            )

    joblib.dump(model, 'OPTIMIZED_MODEL.sav') #save model for future use

    prediction = model.predict(sc.transform(X_test))
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    return rmse

In [44]:
#optimize hyper param
study = optuna.create_study(direction = 'minimize')
study.optimize(objective, n_trials = 5)

[32m[I 2021-03-01 18:24:08,139][0m A new study created in memory with name: no-name-5d46321b-9b5f-462d-9f6a-db72f0d71a9f[0m


[1]	train's rmse: 3.546	eval's rmse: 3.44352
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 3.546	eval's rmse: 3.44352
[3]	train's rmse: 3.546	eval's rmse: 3.44352
[4]	train's rmse: 3.546	eval's rmse: 3.44352
[5]	train's rmse: 3.546	eval's rmse: 3.44352
[6]	train's rmse: 3.37954	eval's rmse: 3.33883
[7]	train's rmse: 3.29595	eval's rmse: 3.30141
[8]	train's rmse: 3.31775	eval's rmse: 3.30926
[9]	train's rmse: 3.33667	eval's rmse: 3.31738
[10]	train's rmse: 3.35302	eval's rmse: 3.32513
[11]	train's rmse: 3.34978	eval's rmse: 3.32143
[12]	train's rmse: 3.36291	eval's rmse: 3.32829
[13]	train's rmse: 3.37449	eval's rmse: 3.33458
[14]	train's rmse: 3.37299	eval's rmse: 3.33313
[15]	train's rmse: 3.38239	eval's rmse: 3.3384
[16]	train's rmse: 3.36519	eval's rmse: 3.32631
[17]	train's rmse: 3.35732	eval's rmse: 3.32023
[18]	train's rmse: 3.34737	eval's rmse: 3.31321
[19]	train's rmse: 3.31742	eval's rmse: 3.29641
[20]	train's rmse: 3.31129	eval's rmse: 3.2921

[32m[I 2021-03-01 18:24:08,887][0m Trial 0 finished with value: 3.2005640874029524 and parameters: {'boosting_type': 'rf', 'objective': 'regression_l1', 'learning_rate': 0.06983581417697698, 'lambda_l1': 1.0196583464215431e-05, 'lambda_l2': 5.057069422650611e-05, 'num_leaves': 112, 'feature_fraction': 0.792022578654222, 'bagging_fraction': 0.4987087125616879, 'bagging_freq': 15, 'min_child_samples': 95}. Best is trial 0 with value: 3.2005640874029524.[0m



[650]	train's rmse: 3.26612	eval's rmse: 3.21458
[651]	train's rmse: 3.26582	eval's rmse: 3.21459
[652]	train's rmse: 3.26599	eval's rmse: 3.21476
[653]	train's rmse: 3.2661	eval's rmse: 3.21499
[654]	train's rmse: 3.2662	eval's rmse: 3.21522
[655]	train's rmse: 3.26631	eval's rmse: 3.21545
[656]	train's rmse: 3.26601	eval's rmse: 3.21546
[657]	train's rmse: 3.26618	eval's rmse: 3.21563
[658]	train's rmse: 3.26588	eval's rmse: 3.21564
[659]	train's rmse: 3.26599	eval's rmse: 3.21587
[660]	train's rmse: 3.26608	eval's rmse: 3.21606
[661]	train's rmse: 3.26628	eval's rmse: 3.2164
[662]	train's rmse: 3.26649	eval's rmse: 3.21673
[663]	train's rmse: 3.2664	eval's rmse: 3.2168
[664]	train's rmse: 3.2666	eval's rmse: 3.21713
[665]	train's rmse: 3.2668	eval's rmse: 3.21746
[666]	train's rmse: 3.26701	eval's rmse: 3.2178
[667]	train's rmse: 3.26681	eval's rmse: 3.21771
[668]	train's rmse: 3.26673	eval's rmse: 3.21778
[669]	train's rmse: 3.26654	eval's rmse: 3.2177
[670]	train's rmse: 3.26646	

[32m[I 2021-03-01 18:24:10,796][0m Trial 1 finished with value: 2.871052365596402 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l1', 'learning_rate': 0.2324097512681303, 'lambda_l1': 1.121365957702875, 'lambda_l2': 0.13377361048733025, 'num_leaves': 42, 'feature_fraction': 0.1403047576256134, 'bagging_fraction': 0.26037024774245543, 'bagging_freq': 2, 'min_child_samples': 65}. Best is trial 1 with value: 2.871052365596402.[0m


[1]	train's rmse: 4.49336	eval's rmse: 4.39817
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 4.18172	eval's rmse: 4.10337
[3]	train's rmse: 3.96956	eval's rmse: 3.89368
[4]	train's rmse: 3.7894	eval's rmse: 3.74807
[5]	train's rmse: 3.60011	eval's rmse: 3.56981
[6]	train's rmse: 3.45843	eval's rmse: 3.43815
[7]	train's rmse: 3.35836	eval's rmse: 3.355
[8]	train's rmse: 3.25764	eval's rmse: 3.26638
[9]	train's rmse: 3.17322	eval's rmse: 3.19135
[10]	train's rmse: 3.08825	eval's rmse: 3.10943
[11]	train's rmse: 3.05312	eval's rmse: 3.08916
[12]	train's rmse: 2.98343	eval's rmse: 3.02282
[13]	train's rmse: 2.95009	eval's rmse: 2.99602
[14]	train's rmse: 2.91041	eval's rmse: 2.96415
[15]	train's rmse: 2.8831	eval's rmse: 2.94704
[16]	train's rmse: 2.83647	eval's rmse: 2.91056
[17]	train's rmse: 2.81331	eval's rmse: 2.89872
[18]	train's rmse: 2.77154	eval's rmse: 2.86014
[19]	train's rmse: 2.74635	eval's rmse: 2.84063
[20]	train's rmse: 2.7222	eval's rmse: 

[32m[I 2021-03-01 18:24:12,300][0m Trial 2 finished with value: 2.1534902491216235 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l2', 'learning_rate': 0.13380461506159247, 'lambda_l1': 6.269424088777498e-05, 'lambda_l2': 0.00024443697423417006, 'num_leaves': 35, 'feature_fraction': 0.3641785013506469, 'bagging_fraction': 0.8492112179731501, 'bagging_freq': 18, 'min_child_samples': 93}. Best is trial 2 with value: 2.1534902491216235.[0m


[1]	train's rmse: 4.1691	eval's rmse: 4.20184
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 3.54641	eval's rmse: 3.52549
[3]	train's rmse: 3.70063	eval's rmse: 3.67258
[4]	train's rmse: 3.76581	eval's rmse: 3.71064
[5]	train's rmse: 3.84225	eval's rmse: 3.78416
[6]	train's rmse: 3.68364	eval's rmse: 3.62078
[7]	train's rmse: 3.75272	eval's rmse: 3.69876
[8]	train's rmse: 3.77548	eval's rmse: 3.725
[9]	train's rmse: 3.81089	eval's rmse: 3.76121
[10]	train's rmse: 3.71982	eval's rmse: 3.66926
[11]	train's rmse: 3.75863	eval's rmse: 3.70905
[12]	train's rmse: 3.77334	eval's rmse: 3.71881
[13]	train's rmse: 3.80371	eval's rmse: 3.74697
[14]	train's rmse: 3.81932	eval's rmse: 3.77246
[15]	train's rmse: 3.84453	eval's rmse: 3.79514
[16]	train's rmse: 3.84986	eval's rmse: 3.79773
[17]	train's rmse: 3.85087	eval's rmse: 3.80379
[18]	train's rmse: 3.79691	eval's rmse: 3.74936
[19]	train's rmse: 3.81604	eval's rmse: 3.76939
[20]	train's rmse: 3.82198	eval's rmse

[32m[I 2021-03-01 18:24:12,909][0m Trial 3 finished with value: 3.5254855087207218 and parameters: {'boosting_type': 'rf', 'objective': 'regression_l2', 'learning_rate': 0.05303466597478119, 'lambda_l1': 0.006427576696388673, 'lambda_l2': 0.09824413579498853, 'num_leaves': 204, 'feature_fraction': 0.15031930980713837, 'bagging_fraction': 0.4409216088908864, 'bagging_freq': 4, 'min_child_samples': 57}. Best is trial 2 with value: 2.1534902491216235.[0m


[1]	train's rmse: 3.9166	eval's rmse: 4.00679
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 3.09915	eval's rmse: 3.18575
[3]	train's rmse: 2.80521	eval's rmse: 2.93755
[4]	train's rmse: 2.6257	eval's rmse: 2.78254
[5]	train's rmse: 2.53033	eval's rmse: 2.69284
[6]	train's rmse: 2.43482	eval's rmse: 2.64177
[7]	train's rmse: 2.37833	eval's rmse: 2.59208
[8]	train's rmse: 2.35639	eval's rmse: 2.58561
[9]	train's rmse: 2.33033	eval's rmse: 2.58188
[10]	train's rmse: 2.31841	eval's rmse: 2.58329
[11]	train's rmse: 2.28776	eval's rmse: 2.56368
[12]	train's rmse: 2.27084	eval's rmse: 2.56315
[13]	train's rmse: 2.24877	eval's rmse: 2.55405
[14]	train's rmse: 2.23853	eval's rmse: 2.5407
[15]	train's rmse: 2.22592	eval's rmse: 2.54104
[16]	train's rmse: 2.16619	eval's rmse: 2.52294
[17]	train's rmse: 2.13557	eval's rmse: 2.51777
[18]	train's rmse: 2.09738	eval's rmse: 2.50269
[19]	train's rmse: 2.06041	eval's rmse: 2.50686
[20]	train's rmse: 2.04207	eval's rmse

[32m[I 2021-03-01 18:24:13,825][0m Trial 4 finished with value: 2.439538156165597 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l2', 'learning_rate': 0.5600098332030682, 'lambda_l1': 0.004830994973960632, 'lambda_l2': 0.6086982529508307, 'num_leaves': 203, 'feature_fraction': 0.47712597923724565, 'bagging_fraction': 0.4378699368599106, 'bagging_freq': 15, 'min_child_samples': 20}. Best is trial 2 with value: 2.1534902491216235.[0m


[511]	train's rmse: 0.507772	eval's rmse: 2.54973
[512]	train's rmse: 0.509799	eval's rmse: 2.54877
[513]	train's rmse: 0.513249	eval's rmse: 2.55019
[514]	train's rmse: 0.514279	eval's rmse: 2.54647
[515]	train's rmse: 0.518865	eval's rmse: 2.54937
[516]	train's rmse: 0.520861	eval's rmse: 2.54787
[517]	train's rmse: 0.520434	eval's rmse: 2.54619
[518]	train's rmse: 0.520821	eval's rmse: 2.5445
[519]	train's rmse: 0.525233	eval's rmse: 2.5445
[520]	train's rmse: 0.525333	eval's rmse: 2.54593
[521]	train's rmse: 0.525996	eval's rmse: 2.5475
[522]	train's rmse: 0.527029	eval's rmse: 2.55047
[523]	train's rmse: 0.525433	eval's rmse: 2.55053
[524]	train's rmse: 0.526672	eval's rmse: 2.55131
[525]	train's rmse: 0.525505	eval's rmse: 2.55176
[526]	train's rmse: 0.509456	eval's rmse: 2.54634
[527]	train's rmse: 0.504148	eval's rmse: 2.54659
[528]	train's rmse: 0.502016	eval's rmse: 2.54903
[529]	train's rmse: 0.502684	eval's rmse: 2.55192
[530]	train's rmse: 0.50085	eval's rmse: 2.55143
[531

In [45]:
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))

print('  Params: ')
for key, value in trial.params.items():
    print('    "{}": {},'.format(key, value))

Best trial:
  Value: 2.1534902491216235
  Params: 
    "boosting_type": gbdt,
    "objective": regression_l2,
    "learning_rate": 0.13380461506159247,
    "lambda_l1": 6.269424088777498e-05,
    "lambda_l2": 0.00024443697423417006,
    "num_leaves": 35,
    "feature_fraction": 0.3641785013506469,
    "bagging_fraction": 0.8492112179731501,
    "bagging_freq": 18,
    "min_child_samples": 93,


In [47]:
model_tuned = joblib.load('OPTIMIZED_MODEL.sav') #call saved model

In [48]:
df_test = pd.read_csv('test_data.csv')
X_test, y_test = StandardScaler().fit_transform(df_test.iloc[:,:-1]), df_test.iloc[:,-1]

In [49]:
pred_test = model_tuned.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred_test))
rmse

2.6578897939956976

In [50]:
r2_score(y_test, pred_test)

0.6557021996973864