In [82]:
import logging
import os
from copy import deepcopy

import joblib
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_pinball_loss
from sklearn.model_selection import train_test_split

# Route INFO-level (and more severe) records through the root logger.
logging.basicConfig(level=logging.INFO)

# Project root: the BASE_PATH environment variable wins when it is set,
# otherwise the author's local checkout is used as the fallback.
BASE_PATH = os.getenv(
    "BASE_PATH",
    "/Users/florian/Documents/github/DP2/Energy_production_price_prediction/",
)

# Training-set CSV read by this notebook.
DATA_PATH = os.path.join(
    BASE_PATH,
    "Generation_forecast/Solar_forecast/data/wind_training_set.csv",
)

# Log and model directories of the LightGBM regressor, both below BASE_PATH.
FILEPATH_STUDY = os.path.join(
    BASE_PATH,
    "Generation_forecast/Solar_forecast/models/lgbr_model/logs",
)
MODEL_SAVE_PATH = os.path.join(
    BASE_PATH,
    "Generation_forecast/Solar_forecast/models/lgbr_model/models",
)

In [83]:
# Load the raw energy data, which is split into two files covering
# consecutive date ranges (see the dates in the file names).
# Both files are resolved against BASE_PATH so that the BASE_PATH environment
# override applies here too, instead of repeating a machine-specific absolute
# path. With the default BASE_PATH the resolved locations are unchanged.
ENERGY_DATA_DIR = os.path.join(BASE_PATH, "HEFTcom24/data/energy_data")

# NOTE: the capitalisation differs between the two file names
# ("Energy_Data_..." vs "Energy_data_..."); both are kept exactly as given.
energy_data_20200920_20240118 = pd.read_csv(
    os.path.join(ENERGY_DATA_DIR, "Energy_Data_20200920_20240118.csv")
)
energy_data_20240119_20240519 = pd.read_csv(
    os.path.join(ENERGY_DATA_DIR, "Energy_data_20240119_20240519.csv")
)

In [84]:
def _with_credit_columns(raw_energy):
    """Return a copy of ``raw_energy`` with parsed timestamps and credit columns.

    The input frame is left untouched. The returned frame has:

    * ``dtm`` parsed with ``pd.to_datetime``;
    * ``Wind_MWh_credit`` = ``0.5 * Wind_MW - boa_MWh``;
    * ``Solar_MWh_credit`` = ``0.5 * Solar_MW``.

    The 0.5 factor presumably converts an average MW value over a half-hour
    period into MWh (the rows are 30 minutes apart) -- confirm against the
    data description.

    Parameters
    ----------
    raw_energy : pandas.DataFrame
        Frame with at least the columns ``dtm``, ``Wind_MW``, ``boa_MWh`` and
        ``Solar_MW``.

    Returns
    -------
    pandas.DataFrame
        New frame; existing columns keep their position, the two credit
        columns are appended at the end.
    """
    return raw_energy.assign(
        dtm=pd.to_datetime(raw_energy["dtm"]),
        Wind_MWh_credit=0.5 * raw_energy["Wind_MW"] - raw_energy["boa_MWh"],
        Solar_MWh_credit=0.5 * raw_energy["Solar_MW"],
    )


# Apply the same preparation to both raw files (previously duplicated line by
# line), then stack them into one frame with a fresh 0..n-1 index.
energy_data_old = _with_credit_columns(energy_data_20200920_20240118)
energy_data_new = _with_credit_columns(energy_data_20240119_20240519)

energy_data_merged = pd.concat([energy_data_old, energy_data_new], ignore_index=True)

In [85]:
# Work on an independent copy so the merged frame itself stays unmodified.
energy_data = energy_data_merged.copy(deep=True)

In [86]:
# Show the merged energy data (before any rows with missing values are dropped).
energy_data

Unnamed: 0,dtm,MIP,Solar_MW,Solar_capacity_mwp,Solar_installedcapacity_mwp,Wind_MW,SS_Price,boa_MWh,DA_Price,Wind_MWh_credit,Solar_MWh_credit
0,2020-09-20 00:00:00+00:00,20.06,0.0,2130.537493,2228.208777,996.284,2.50000,0.0,32.17,498.142,0.0
1,2020-09-20 00:30:00+00:00,19.77,0.0,2130.537493,2228.208777,957.576,15.00000,0.0,32.17,478.788,0.0
2,2020-09-20 01:00:00+00:00,28.68,0.0,2130.537493,2228.208777,941.044,47.95000,0.0,32.00,470.522,0.0
3,2020-09-20 01:30:00+00:00,28.97,0.0,2130.537493,2228.208777,964.366,29.13000,0.0,32.00,482.183,0.0
4,2020-09-20 02:00:00+00:00,28.19,0.0,2130.537493,2228.208777,918.432,28.95000,0.0,31.99,459.216,0.0
...,...,...,...,...,...,...,...,...,...,...,...
64219,2024-05-19 21:30:00+00:00,77.40,0.0,2573.542092,2741.831037,227.518,56.20000,0.0,76.81,113.759,0.0
64220,2024-05-19 22:00:00+00:00,80.82,0.0,2573.542092,2741.831037,197.090,102.00023,0.0,71.11,98.545,0.0
64221,2024-05-19 22:30:00+00:00,75.69,0.0,2573.542092,2741.831037,171.306,99.00000,0.0,71.11,85.653,0.0
64222,2024-05-19 23:00:00+00:00,78.98,0.0,2573.542092,2741.831037,163.164,106.44988,0.0,66.51,81.582,0.0


In [87]:
# Discard every row that has a missing value in any of the listed columns.
energy_data = energy_data.dropna(
    subset=[
        "MIP",
        "Solar_MW",
        "Wind_MW",
        "boa_MWh",
        "Wind_MWh_credit",
        "Solar_MWh_credit",
        "Solar_capacity_mwp",
        "Solar_installedcapacity_mwp",
        "SS_Price",
        "DA_Price",
    ]
)

In [88]:
# Read the training-set CSV located at DATA_PATH.
data = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [89]:
# Build the working frame as a copy of the loaded data (``data`` itself is not
# modified) with ``valid_time`` parsed into datetimes.
df = data.assign(valid_time=pd.to_datetime(data["valid_time"]))

In [90]:
# Attach the energy data to each row: keep only timestamps present in both
# frames (inner join of ``valid_time`` against ``dtm``).
df = pd.merge(
    df,
    energy_data,
    how="inner",
    left_on="valid_time",
    right_on="dtm",
)

In [91]:
# Cosine encodings of the timestamp (numpy is imported with the other imports
# at the top of the notebook rather than in the middle of it).
# - cos_hour uses only the hour component, so both half-hours of an hour share
#   one value; with the cosine alone, hours h and 24 - h are indistinguishable.
# - cos_day divides by a fixed 365, so day 366 of a leap year wraps slightly
#   past one full period.
# NOTE: neither column is part of the explicit column selection made further
# down, so they do not reach the exported CSV.
df["cos_hour"] = np.cos(2 * np.pi * df["valid_time"].dt.hour / 24)
df["cos_day"] = np.cos(2 * np.pi * df["valid_time"].dt.dayofyear / 365)

In [92]:
# Expose the "_y"-suffixed Wind_MW column (the right-hand side of the merge)
# under the plain name "Wind_MW".
df = df.assign(Wind_MW=df["Wind_MW_y"])

In [93]:
# Remove timestamps, prices, solar fields and every Wind_MW variant; what
# remains are the weather/power features plus Wind_MWh_credit and residual.
df = df.drop(
    labels=[
        "dtm",
        "valid_time",
        "reference_time",
        "MIP",
        "Solar_MW",
        "Wind_MW_y",
        "Wind_MW_x",
        "Solar_capacity_mwp",
        "Solar_installedcapacity_mwp",
        "SS_Price",
        "DA_Price",
        "Solar_MWh_credit",
        "boa_MWh",
        "Wind_MW",
    ],
    axis="columns",
)

In [94]:
# List the columns that remain after the drop.
df.columns

Index(['WindSpeed:100_dwd', 'Temperature_avg', 'RelativeHumidity_avg',
       'AirDensity', 'UsableWindPower_opt', 'WindSpeed:100_dwd_lag1',
       'WindSpeed:100_dwd_lag2', 'WindSpeed:100_dwd_lag3', 'residual',
       'PowerOutput_opt', 'Wind_MWh_credit', 'cos_hour', 'cos_day'],
      dtype='object')

In [95]:
# Keep only the model columns, in a fixed order.
# The explicit .copy() makes the result an independent frame; without it the
# column assignments performed on ``df`` afterwards operate on a subset that
# pandas flags as a potential copy (SettingWithCopyWarning).
# NOTE(review): cos_hour and cos_day are not selected here and are therefore
# discarded -- confirm that this is intended.
df = df[
    [
        "WindSpeed:100_dwd",
        "Temperature_avg",
        "RelativeHumidity_avg",
        "AirDensity",
        "UsableWindPower_opt",
        "WindSpeed:100_dwd_lag1",
        "WindSpeed:100_dwd_lag2",
        "WindSpeed:100_dwd_lag3",
        "PowerOutput_opt",
        "Wind_MWh_credit",
        "residual",
    ]
].copy()

In [96]:
# Replace every ":" in the column names with "_" (plain-text replacement).
df.columns = df.columns.str.replace(":", "_", regex=False)

In [97]:
# Halve residual and PowerOutput_opt -- presumably to express MW averages as
# MWh per half-hour period, matching Wind_MWh_credit; TODO confirm.
# NOTE: re-running this cell halves the columns again.
df["residual"] = df["residual"].div(2)
df["PowerOutput_opt"] = df["PowerOutput_opt"].div(2)
# Despite the "_std" suffix this is the absolute deviation of the wind speed
# from its mean over the whole frame, not a standard deviation.
df["WindSpeed_100_dwd_std"] = df["WindSpeed_100_dwd"].sub(df["WindSpeed_100_dwd"].mean()).abs()
#df["Wind_MWh_credit_Lag_48h"] = df["Wind_MWh_credit"].shift(96)

In [98]:
# df["WindSpeed_100_dwd_2"] = df["WindSpeed_100_dwd"] ** 2
# df["WindSpeed_100_dwd_3"] = df["WindSpeed_100_dwd"] ** 3

In [99]:
# Show the final feature frame before it is written to disk.
df

Unnamed: 0,WindSpeed_100_dwd,Temperature_avg,RelativeHumidity_avg,AirDensity,UsableWindPower_opt,WindSpeed_100_dwd_lag1,WindSpeed_100_dwd_lag2,WindSpeed_100_dwd_lag3,PowerOutput_opt,Wind_MWh_credit,residual,WindSpeed_100_dwd_std
0,10.956274,14.939209,85.005738,1.218670,2152.776404,11.528838,11.530592,11.395713,365.142806,493.948,128.805194,1.389925
1,10.767969,14.884831,85.305990,1.218900,2142.084818,10.956274,11.528838,11.530592,363.282470,476.255,112.972530,1.201620
2,10.579663,14.830452,85.606255,1.219131,2131.427254,10.767969,10.956274,11.528838,361.428053,462.122,100.693947,1.013314
3,10.239104,14.837072,85.139235,1.219136,2048.913885,10.579663,10.767969,10.956274,347.070727,428.380,81.309273,0.672755
4,9.898545,14.843692,84.672227,1.219141,1968.557550,10.239104,10.579663,10.767969,333.088725,379.238,46.149275,0.332196
...,...,...,...,...,...,...,...,...,...,...,...,...
57068,17.951534,5.133796,73.038472,1.265400,3290.000000,17.503826,17.238297,16.972767,563.019711,249.020,-313.999711,8.385185
57069,18.399240,5.093828,72.270605,1.265623,3290.000000,17.951534,17.503826,17.238297,563.019711,255.150,-307.869711,8.832891
57070,18.704720,5.077630,71.659860,1.265725,3290.000000,18.399240,17.951534,17.503826,563.019711,261.970,-301.049711,9.138371
57071,19.010200,5.061432,71.049115,1.265827,3290.000000,18.704720,18.399240,17.951534,563.019711,263.050,-299.969711,9.443851


In [100]:
# Write the prepared training frame next to the other data files.
# The destination is resolved against BASE_PATH so the BASE_PATH environment
# override applies here too, instead of a machine-specific absolute path;
# with the default BASE_PATH the file lands in the same place as before.
df.to_csv(
    os.path.join(BASE_PATH, "Generation_forecast/Solar_forecast/data/wind_train.csv"),
    index=False,
)