In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('new_data.csv')

In [3]:
# Selected features:
# Index(['datetime', 'prediction_unit_id', 'dewpoint', 'direct_solar_radiation',
#        'surface_solar_radiation_downwards', 'temperature',
#        'highest_price_per_mwh', 'avg_price', 'hour', 'euros_per_mwh',
#        'eic_count', 'installed_capacity', 'product_type_3', 'is_consumption_0',
#        'is_consumption_1'],
#       dtype='object')

In [4]:
data.columns

Index(['target', 'datetime', 'prediction_unit_id', 'year',
       '10_metre_u_wind_component', '10_metre_v_wind_component',
       'cloudcover_high', 'cloudcover_low', 'cloudcover_mid',
       'cloudcover_total', 'dewpoint', 'direct_solar_radiation', 'snowfall',
       'surface_solar_radiation_downwards', 'temperature',
       'total_precipitation', 'lowest_price_per_mwh', 'highest_price_per_mwh',
       'avg_price', 'hour', 'day', 'month', 'day_of_week', 'day_of_year',
       'euros_per_mwh', 'eic_count', 'installed_capacity', 'county_7',
       'county_8', 'county_10', 'county_11', 'county_13', 'county_14',
       'county_15', 'is_business_0', 'is_business_1', 'product_type_0',
       'product_type_1', 'product_type_2', 'product_type_3',
       'is_consumption_0', 'is_consumption_1'],
      dtype='object')

In [5]:
final_data = data[['datetime', 'prediction_unit_id', 'dewpoint', 'direct_solar_radiation',
       'surface_solar_radiation_downwards', 'temperature',
       'highest_price_per_mwh', 'avg_price', 'hour', 'euros_per_mwh',
       'eic_count', 'installed_capacity', 'product_type_3', 'is_consumption_0',
       'is_consumption_1','county_7',
       'county_8', 'county_10', 'county_11', 'county_13', 'county_14',
       'county_15', 'is_business_0', 'is_business_1', 'product_type_0',
       'product_type_1', 'product_type_2','target']]

In [6]:
cols = final_data.columns.difference(['target'])

## model building with selected features

In [7]:
from sklearn.model_selection import train_test_split

x_train , x_test , y_train , y_test = train_test_split(final_data[cols] ,final_data['target'] , random_state = 42 , test_size = 0.20)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(790036, 27)
(197510, 27)
(790036,)
(197510,)


In [14]:
import lightgbm as lgb 
callbacks = [
    lgb.early_stopping(stopping_rounds= 1000,  verbose=True),
    lgb.callback.log_evaluation(period=1000)
]

lgb_model = lgb.LGBMRegressor(  n_estimators = 19337 , max_depth = 22  , subsample = 0.8010925204759234 , learning_rate = 0.07972332158676076 , num_leaves = 77 )
lgb_model.fit(x_train, y_train , eval_set = [(x_test , y_test)] , callbacks = callbacks )

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 790036, number of used features: 27
[LightGBM] [Info] Start training from score 238.885044
Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's l2: 7981.99
[2000]	valid_0's l2: 6192.9
[3000]	valid_0's l2: 5588.09
[4000]	valid_0's l2: 5250.19
[5000]	valid_0's l2: 5030.81
[6000]	valid_0's l2: 4893.18
[7000]	valid_0's l2: 4804.05
[8000]	valid_0's l2: 4725.46
[9000]	valid_0's l2: 4676.04
[10000]	valid_0's l2: 4614.73
[11000]	valid_0's l2: 4575.43
[12000]	valid_0's l2: 4543.72
[13000]	valid_0's l2: 4515.93
[14000]	valid_0's l2: 4496.21
[15000]	valid_0's l2: 4470.19
[16000]	valid_0's l2: 4449.21
[17000]	valid_0's l2: 4432.73
[18000]	valid_0's l2: 4417.66
[19000]	valid_0's l2: 4405.49
Did not meet early stopping. Bes

In [15]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(final_data[cols] ,final_data['target'], test_size=0.2, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 19000, 21000),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
        'num_leaves': trial.suggest_int('num_leaves', 50, 100),
        'max_depth': trial.suggest_int('max_depth', 15, 25),
        'subsample': trial.suggest_float('subsample', 0.7, 0.9)
    }
    
    lgb_reg = lgb.LGBMRegressor(**param, device='gpu')
    
    # Split the training data further for validation
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    # Define the callbacks
    callbacks = [
        lgb.early_stopping(stopping_rounds=100, verbose=False),
        lgb.log_evaluation(period=100)
    ]
    
    # Train the model with the specified parameters
    lgb_reg.fit(X_train_split, y_train_split, 
                eval_set=[(X_val_split, y_val_split)], 
                callbacks=callbacks)
    
    # Predict on the validation set
    y_pred = lgb_reg.predict(X_val_split)
    
    # Calculate the mean squared error
    mse = mean_squared_error(y_val_split, y_pred)
    
    return mse

# Create an Optuna study to minimize the objective function
study = optuna.create_study(direction='minimize')

# Optimize the study over 10 trials
study.optimize(objective, n_trials=10)

# Retrieve and print the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)



[I 2024-08-07 22:58:15,565] A new study created in memory with name: no-name-9b344568-3d05-4868-9dd0-749f03f6fdbe


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.013248 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 18099.6
[200]	valid_0's l2: 14764.7
[300]	valid_0's l2: 12830.7
[400]	valid_0's l2: 11540
[500]	valid_0's l2: 10614.1
[600]	valid_0's l2: 9862.44
[700]	valid_0's l2: 9331.91
[800]	valid_0's l2: 8929.57
[900]	valid_0's l2: 8584.15
[1000]	valid_0's l2: 8262.88
[1100]	valid_0's l2: 7985.81
[1200]	valid_0's l2: 7793.52
[1300]	valid_0's l2: 7604.69
[1400]	valid_0's l2: 7446.05
[1500]	valid_0's l2: 7332.67


[I 2024-08-07 23:19:38,618] Trial 0 finished with value: 5438.590661976564 and parameters: {'n_estimators': 19419, 'learning_rate': 0.09876663205160877, 'num_leaves': 71, 'max_depth': 25, 'subsample': 0.7949181963838825}. Best is trial 0 with value: 5438.590661976564.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.013791 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 20125.5
[200]	valid_0's l2: 16758
[300]	valid_0's l2: 14835.2
[400]	valid_0's l2: 13551.6
[500]	valid_0's l2: 12496.4
[600]	valid_0's l2: 11522.6
[700]	valid_0's l2: 10863.1
[800]	valid_0's l2: 10393.1
[900]	valid_0's l2: 9934.86
[1000]	valid_0's l2: 9512.12
[1100]	valid_0's l2: 9184.39
[1200]	valid_0's l2: 8858.72
[1300]	valid_0's l2: 8602.43
[1400]	valid_0's l2: 8380.01
[1500]	valid_0's l2: 8203.42


[I 2024-08-07 23:43:13,521] Trial 1 finished with value: 5535.943931235899 and parameters: {'n_estimators': 19464, 'learning_rate': 0.07635320653236143, 'num_leaves': 68, 'max_depth': 17, 'subsample': 0.8970016610469658}. Best is trial 0 with value: 5438.590661976564.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.013098 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 19690.6
[200]	valid_0's l2: 16203.1
[300]	valid_0's l2: 14259.5
[400]	valid_0's l2: 13077.6
[500]	valid_0's l2: 12025.7
[600]	valid_0's l2: 11237.6
[700]	valid_0's l2: 10579.2
[800]	valid_0's l2: 9994.87
[900]	valid_0's l2: 9452.64
[1000]	valid_0's l2: 9079.58
[1100]	valid_0's l2: 8803.45
[1200]	valid_0's l2: 8540.9
[1300]	valid_0's l2: 8314.32
[1400]	valid_0's l2: 8137.06
[1500]	valid_0's l2: 7945.43

[I 2024-08-07 23:58:35,104] Trial 2 finished with value: 5447.012327805529 and parameters: {'n_estimators': 19203, 'learning_rate': 0.070849130520241, 'num_leaves': 77, 'max_depth': 19, 'subsample': 0.7795021965165028}. Best is trial 0 with value: 5438.590661976564.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.014227 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 19901.3
[200]	valid_0's l2: 16443.9
[300]	valid_0's l2: 14180.5
[400]	valid_0's l2: 12689.3
[500]	valid_0's l2: 11651.6
[600]	valid_0's l2: 10875
[700]	valid_0's l2: 10281.1
[800]	valid_0's l2: 9693.2
[900]	valid_0's l2: 9252.53
[1000]	valid_0's l2: 8895.64
[1100]	valid_0's l2: 8593.39
[1200]	valid_0's l2: 8302.42
[1300]	valid_0's l2: 8041.78
[1400]	valid_0's l2: 7870.22
[1500]	valid_0's l2: 7724.04
[

[I 2024-08-08 07:26:13,422] Trial 3 finished with value: 5461.9344993920995 and parameters: {'n_estimators': 20028, 'learning_rate': 0.10640715439003395, 'num_leaves': 54, 'max_depth': 20, 'subsample': 0.8516826522864933}. Best is trial 0 with value: 5438.590661976564.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.013279 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 15839.8
[200]	valid_0's l2: 12614.2
[300]	valid_0's l2: 10786.8
[400]	valid_0's l2: 9703.12
[500]	valid_0's l2: 8931.03
[600]	valid_0's l2: 8365.61
[700]	valid_0's l2: 7983.07
[800]	valid_0's l2: 7705.87
[900]	valid_0's l2: 7471.75
[1000]	valid_0's l2: 7294.19
[1100]	valid_0's l2: 7119.5
[1200]	valid_0's l2: 7021.46
[1300]	valid_0's l2: 6871.5
[1400]	valid_0's l2: 6791.7
[1500]	valid_0's l2: 6710.46
[

[I 2024-08-08 07:31:40,138] Trial 4 finished with value: 5746.774838231248 and parameters: {'n_estimators': 19978, 'learning_rate': 0.147644912630027, 'num_leaves': 85, 'max_depth': 16, 'subsample': 0.7064152020158668}. Best is trial 0 with value: 5438.590661976564.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.014389 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 18018.7
[200]	valid_0's l2: 14807.3
[300]	valid_0's l2: 12955
[400]	valid_0's l2: 11733.3
[500]	valid_0's l2: 10779.6
[600]	valid_0's l2: 9967.83
[700]	valid_0's l2: 9374.97
[800]	valid_0's l2: 8906.37
[900]	valid_0's l2: 8543.84
[1000]	valid_0's l2: 8296.43
[1100]	valid_0's l2: 8045.43
[1200]	valid_0's l2: 7832.93
[1300]	valid_0's l2: 7692.79
[1400]	valid_0's l2: 7504.61
[1500]	valid_0's l2: 7346.78


[I 2024-08-08 07:48:52,133] Trial 5 finished with value: 5382.173683349831 and parameters: {'n_estimators': 20881, 'learning_rate': 0.08382248613766802, 'num_leaves': 88, 'max_depth': 16, 'subsample': 0.7578663539704151}. Best is trial 5 with value: 5382.173683349831.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.013555 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 16030.4
[200]	valid_0's l2: 12901.2
[300]	valid_0's l2: 11242.6
[400]	valid_0's l2: 10090.7
[500]	valid_0's l2: 9261.99
[600]	valid_0's l2: 8596.8
[700]	valid_0's l2: 8123.99
[800]	valid_0's l2: 7832.98
[900]	valid_0's l2: 7565.4
[1000]	valid_0's l2: 7372.97
[1100]	valid_0's l2: 7214.77
[1200]	valid_0's l2: 7055.97
[1300]	valid_0's l2: 6962.45
[1400]	valid_0's l2: 6848.91
[1500]	valid_0's l2: 6756.14


[I 2024-08-08 08:08:19,896] Trial 6 finished with value: 5454.016919378544 and parameters: {'n_estimators': 19020, 'learning_rate': 0.12141400045359926, 'num_leaves': 89, 'max_depth': 21, 'subsample': 0.8098982370675857}. Best is trial 5 with value: 5382.173683349831.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.012343 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 20032.1
[200]	valid_0's l2: 16574.6
[300]	valid_0's l2: 14435.9
[400]	valid_0's l2: 12883
[500]	valid_0's l2: 11811.1
[600]	valid_0's l2: 11069.4
[700]	valid_0's l2: 10358.9
[800]	valid_0's l2: 9901.15
[900]	valid_0's l2: 9547
[1000]	valid_0's l2: 9220.68
[1100]	valid_0's l2: 8876.71
[1200]	valid_0's l2: 8635.2
[1300]	valid_0's l2: 8464.35
[1400]	valid_0's l2: 8268.78
[1500]	valid_0's l2: 8084.04
[160

[I 2024-08-08 08:27:41,657] Trial 7 finished with value: 5591.755353377849 and parameters: {'n_estimators': 20037, 'learning_rate': 0.0982863292814033, 'num_leaves': 59, 'max_depth': 24, 'subsample': 0.8651460050295541}. Best is trial 5 with value: 5382.173683349831.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.014146 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 15569.9
[200]	valid_0's l2: 12250.8
[300]	valid_0's l2: 10550.9
[400]	valid_0's l2: 9609.51
[500]	valid_0's l2: 8890.38
[600]	valid_0's l2: 8436.25
[700]	valid_0's l2: 8083.4
[800]	valid_0's l2: 7798.57
[900]	valid_0's l2: 7541.32
[1000]	valid_0's l2: 7340.8
[1100]	valid_0's l2: 7144.85
[1200]	valid_0's l2: 7017.87
[1300]	valid_0's l2: 6923.27
[1400]	valid_0's l2: 6835.18
[1500]	valid_0's l2: 6736.92


[I 2024-08-08 08:52:53,387] Trial 8 finished with value: 5517.524454733079 and parameters: {'n_estimators': 20831, 'learning_rate': 0.13921947188353556, 'num_leaves': 85, 'max_depth': 21, 'subsample': 0.7972072277184217}. Best is trial 5 with value: 5382.173683349831.


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 632028, number of used features: 27
[LightGBM] [Info] Using GPU Device: gfx90c, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 16 dense feature groups (9.64 MB) transferred to GPU in 0.014122 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 239.008478
[100]	valid_0's l2: 21260.9
[200]	valid_0's l2: 18052.4
[300]	valid_0's l2: 15634.5
[400]	valid_0's l2: 14118.9
[500]	valid_0's l2: 12891.7
[600]	valid_0's l2: 11955.3
[700]	valid_0's l2: 11255.8
[800]	valid_0's l2: 10666.7
[900]	valid_0's l2: 10179.8
[1000]	valid_0's l2: 9827.9
[1100]	valid_0's l2: 9492.42
[1200]	valid_0's l2: 9178.33
[1300]	valid_0's l2: 8947.3
[1400]	valid_0's l2: 8703.21
[1500]	valid_0's l2: 8476.3
[

[I 2024-08-08 09:12:14,172] Trial 9 finished with value: 5458.500086004499 and parameters: {'n_estimators': 19866, 'learning_rate': 0.07725122175868585, 'num_leaves': 58, 'max_depth': 19, 'subsample': 0.8850929403775765}. Best is trial 5 with value: 5382.173683349831.


Best Hyperparameters: {'n_estimators': 20881, 'learning_rate': 0.08382248613766802, 'num_leaves': 88, 'max_depth': 16, 'subsample': 0.7578663539704151}


In [8]:
import lightgbm as lgb 
callbacks = [
    lgb.early_stopping(stopping_rounds= 1000,  verbose=True),
    lgb.callback.log_evaluation(period=1000)
]

lgb_model = lgb.LGBMRegressor(  n_estimators = 20881 , max_depth = 16  , subsample = 0.7578663539704151 , learning_rate = 0.08382248613766802 , num_leaves = 88 )
lgb_model.fit(x_train, y_train , eval_set = [(x_test , y_test)] , callbacks = callbacks )

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064574 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2609
[LightGBM] [Info] Number of data points in the train set: 790036, number of used features: 27
[LightGBM] [Info] Start training from score 238.885044
Training until validation scores don't improve for 1000 rounds
[1000]	valid_0's l2: 7447.23
[2000]	valid_0's l2: 6028.49
[3000]	valid_0's l2: 5423.97
[4000]	valid_0's l2: 5138.22
[5000]	valid_0's l2: 4954.2
[6000]	valid_0's l2: 4834.62
[7000]	valid_0's l2: 4750.14
[8000]	valid_0's l2: 4687.37
[9000]	valid_0's l2: 4644.01
[10000]	valid_0's l2: 4594.59
[11000]	valid_0's l2: 4559.55
[12000]	valid_0's l2: 4529.27
[13000]	valid_0's l2: 4502.79
[14000]	valid_0's l2: 4477.68
[15000]	valid_0's l2: 4456.31
[16000]	valid_0's l2: 4436.9
[17000]	valid_0's l2: 4424.44
[18000]	valid_0's l2: 4410.65
[19000]	valid_0's l2: 4398.12
[20000]	valid_0's l2: 4386.88
Did

In [9]:
lgb_train_preds = lgb_model.predict(x_train)

In [10]:
lgb_test_preds = lgb_model.predict(x_test)

In [11]:
from sklearn.metrics import mean_squared_error , mean_absolute_error

MSE_test = mean_squared_error(y_test, lgb_test_preds)

In [12]:
MSE_test

4378.872286021275

In [13]:
# print the values of RMSE for train and test
print( 'RMSE of testing data: ', np.sqrt(MSE_test))

print()

mae_test = mean_absolute_error(y_test,lgb_test_preds)
print('Mean absolute error' , mae_test) 

RMSE of testing data:  66.17304803332905

Mean absolute error 23.197948456534135


In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
# rf = RandomForestRegressor(n_estimators=50, random_state=42)

In [16]:
# Define the model with additional hyperparameters
rf = RandomForestRegressor(
    n_estimators=50,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    ccp_alpha=0.01
)

In [17]:
rf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.2min finished


In [18]:
rf_train_preds = rf.predict(x_train)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    1.4s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    2.1s finished


In [19]:
rf_test_preds = rf.predict(x_test)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done  50 out of  50 | elapsed:    0.5s finished


In [20]:
mae_test_rf = mean_absolute_error(y_test,rf_test_preds)
print('Mean absolute error' , mae_test_rf)

Mean absolute error 38.47703022776938


# Implementing hybrid model

In [21]:
# Combine predictions as new features
stack_train = np.column_stack((lgb_train_preds, rf_train_preds))
stack_test = np.column_stack((lgb_test_preds, rf_test_preds))


In [22]:
# Train meta-model
from sklearn.linear_model import LinearRegression
meta_model = LinearRegression()
meta_model.fit(stack_train, y_train)
stacked_preds = meta_model.predict(stack_test)

In [29]:
# Evaluate the performance
mae_hybrid_1 = mean_absolute_error(y_test, stacked_preds)
print(f"Mean Absolute Error with Linear Regression as meta-model: {mae_hybrid_1}")

# Compare with LightGBM model
mae_lgb = mean_absolute_error(y_test, lgb_test_preds)
print(f"Mean Absolute Error of LightGBM model: {mae_lgb}")

# Compare with Random Forest model
mae_rf = mean_absolute_error(y_test, rf_test_preds)
print(f"Mean Absolute Error of Random Forest model: {mae_rf}")

Mean Absolute Error with Linear Regression as meta-model: 23.207452476964107
Mean Absolute Error of LightGBM model: 23.197948456534135
Mean Absolute Error of Random Forest model: 38.47703022776938


### Ridge Regression as meta model

In [25]:
# # Train a new meta-model
from sklearn.linear_model import Ridge

meta_model_ridge = Ridge(alpha=1.0)
meta_model_ridge.fit(stack_train, y_train)
stacked_preds_ridge = meta_model_ridge.predict(stack_test)

In [30]:
# Evaluate the performance
mae_hybrid_2 = mean_absolute_error(y_test, stacked_preds_ridge)
print(f"Mean Absolute Error with Ridge Regression as meta-model: {mae_hybrid_2}")

# Compare with LightGBM model
mae_lgb = mean_absolute_error(y_test, lgb_test_preds)
print(f"Mean Absolute Error of LightGBM model: {mae_lgb}")

# Compare with Random Forest model
mae_rf = mean_absolute_error(y_test, rf_test_preds)
print(f"Mean Absolute Error of Random Forest model: {mae_rf}")

Mean Absolute Error with Ridge Regression as meta-model: 23.207452476683546
Mean Absolute Error of LightGBM model: 23.197948456534135
Mean Absolute Error of Random Forest model: 38.47703022776938


### Gradient Boosting as meta model

In [27]:
# # Train a new meta-model
from sklearn.ensemble import GradientBoostingRegressor

meta_model_gbr = GradientBoostingRegressor(n_estimators=10, learning_rate=0.1, random_state=42)
meta_model_gbr.fit(stack_train, y_train)
stacked_preds_gbr = meta_model_gbr.predict(stack_test)

In [31]:
# Evaluate the performance
mae_gradient_1 = mean_absolute_error(y_test, stacked_preds_gbr)
print(f"Mean Absolute Error with Gradient Boosting as meta-model: {mae_gradient_1}")

# Compare with LightGBM model
mae_lgb = mean_absolute_error(y_test, lgb_test_preds)
print(f"Mean Absolute Error of LightGBM model: {mae_lgb}")

# Compare with Random Forest model
mae_rf = mean_absolute_error(y_test, rf_test_preds)
print(f"Mean Absolute Error of Random Forest model: {mae_rf}")

Mean Absolute Error with Gradient Boosting as meta-model: 115.84191093848729
Mean Absolute Error of LightGBM model: 23.197948456534135
Mean Absolute Error of Random Forest model: 38.47703022776938
