# Init

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from helper.eda import (
    null_summary,
    visualize_data
)
from helper.feature_engineering import (
    create_datetime_features,
    create_lagged_features,
    create_sincos_datetime_features,
    create_all_lagged_features
)
from helper.utils import (
    clean_column_names
)

In [2]:
# load datasets, set index and parse index as datetime

# 2023
df_2023_ancillary = pd.read_csv('data/raw/2023/Ancillary Volumes & Prices (4H).csv', parse_dates=True, sep=',', index_col=0, nrows=365*6)
df_2023_ancillary.index = pd.to_datetime(df_2023_ancillary.index, dayfirst=True)

df_2023_day_ahead_hourly = pd.read_csv('data/raw/2023/Day-Ahead Price (1H).csv', parse_dates=True, sep=',', index_col=0, nrows=365*24)
df_2023_day_ahead_hourly.index = pd.to_datetime(df_2023_day_ahead_hourly.index, dayfirst=True)

df_2023_day_ahead_half = pd.read_csv('data/raw/2023/Prices & Forecasts (HH).csv', parse_dates=True, sep=',', index_col=0, nrows=365*24*2)
df_2023_day_ahead_half.index = pd.to_datetime(df_2023_day_ahead_half.index, dayfirst=True)

# 2024
df_2024_ancillary = pd.read_csv('data/raw/2024/Ancillary Volumes & Prices (4H).csv', parse_dates=True, sep=',', index_col=0, nrows=365*6)
df_2024_ancillary.index = pd.to_datetime(df_2024_ancillary.index, dayfirst=True)

df_2024_day_ahead_hourly = pd.read_csv('data/raw/2024/Day-Ahead Price (1H).csv', parse_dates=True, sep=',', index_col=0, nrows=365*24)
df_2024_day_ahead_hourly.index = pd.to_datetime(df_2024_day_ahead_hourly.index, dayfirst=True)

df_2024_day_ahead_half = pd.read_csv('data/raw/2024/Prices & Forecasts (HH).csv', parse_dates=True, sep=',', index_col=0, nrows=365*24*2)
df_2024_day_ahead_half.index = pd.to_datetime(df_2024_day_ahead_half.index, dayfirst=True); df_2024_day_ahead_half

Unnamed: 0_level_0,National Demand Forecast (NDF) - GB (MW),"Day Ahead Price (EPEX half-hourly, local) - GB (£/MWh)"
GMT Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-01 00:00:00,22500,36.10
2024-01-01 00:30:00,23100,52.00
2024-01-01 01:00:00,22663,69.80
2024-01-01 01:30:00,21851,35.10
2024-01-01 02:00:00,20993,31.20
...,...,...
2024-12-30 21:30:00,27703,92.17
2024-12-30 22:00:00,26136,87.93
2024-12-30 22:30:00,24652,55.00
2024-12-30 23:00:00,23107,65.50


In [3]:
# concatenate datasets 
print("Ancillary Dataframes")
df_ancillary = pd.concat([df_2023_ancillary, df_2024_ancillary])
print(
    'df_2023_ancillary:', df_2023_ancillary.shape,
    '\ndf_2024_ancillary:', df_2024_ancillary.shape,
    '\ndf_ancillary:', df_ancillary.shape
)

print("\nDay Ahead Hourly Dataframes")
df_day_ahead_hourly = pd.concat([df_2023_day_ahead_hourly, df_2024_day_ahead_hourly])
print(
    'df_2023_day_ahead_hourly:', df_2023_day_ahead_hourly.shape,
    '\ndf_2024_day_ahead_hourly:', df_2024_day_ahead_hourly.shape,
    '\ndf_day_ahead_hourly:', df_day_ahead_hourly.shape
)

print("\nDay Ahead Half Hourly Dataframes")
df_day_ahead_half = pd.concat([df_2023_day_ahead_half, df_2024_day_ahead_half])
print(
    'df_2023_day_ahead_half:', df_2023_day_ahead_half.shape,
    '\ndf_2024_day_ahead_half:', df_2024_day_ahead_half.shape,
    '\ndf_day_ahead_half:', df_day_ahead_half.shape
)

del df_2023_ancillary, df_2024_ancillary, df_2023_day_ahead_hourly, df_2024_day_ahead_hourly, df_2023_day_ahead_half, df_2024_day_ahead_half

Ancillary Dataframes
df_2023_ancillary: (2190, 18) 
df_2024_ancillary: (2190, 18) 
df_ancillary: (4380, 18)

Day Ahead Hourly Dataframes
df_2023_day_ahead_hourly: (8760, 2) 
df_2024_day_ahead_hourly: (8760, 2) 
df_day_ahead_hourly: (17520, 2)

Day Ahead Half Hourly Dataframes
df_2023_day_ahead_half: (17520, 2) 
df_2024_day_ahead_half: (17520, 2) 
df_day_ahead_half: (35040, 2)


In [4]:
print(df_ancillary.info())
print(df_day_ahead_hourly.info())
print(df_day_ahead_half.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4380 entries, 2023-01-01 03:00:00 to 2024-12-30 23:00:00
Data columns (total 18 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Volume Requirements Forecast - DC-H - GB (MW)  4356 non-null   float64
 1   Volume Requirements Forecast - DC-L - GB (MW)  4356 non-null   float64
 2   Volume Requirements Forecast - DR-H - GB (MW)  4380 non-null   int64  
 3   Volume Requirements Forecast - DR-L - GB (MW)  4380 non-null   int64  
 4   Volume Requirements Forecast - DM-H - GB (MW)  4362 non-null   float64
 5   Volume Requirements Forecast - DM-L - GB (MW)  4380 non-null   int64  
 6   Ancillary Volume Accepted - DC-H - GB (MW)     4380 non-null   int64  
 7   Ancillary Volume Accepted - DC-L - GB (MW)     4380 non-null   int64  
 8   Ancillary Volume Accepted - DR-H - GB (MW)     4374 non-null   float64
 9   Ancillary Volume

# Preprocessing

## EDA

In [5]:
visualize_data(df_day_ahead_hourly, start_date='2023-01-01 01:00', end_date='2023-12-31 23:00', is_price=True)

In [5]:
visualize_data(df_ancillary, start_date='2023-10-01 01:00', end_date='2023-10-10 23:00', ln_y=False, columns=[
       'Volume Requirements Forecast - DC-H - GB (MW)',
       'Volume Requirements Forecast - DC-L - GB (MW)',
       # 'Volume Requirements Forecast - DR-H - GB (MW)',
       # 'Volume Requirements Forecast - DR-L - GB (MW)',
       # 'Volume Requirements Forecast - DM-H - GB (MW)',
       # 'Volume Requirements Forecast - DM-L - GB (MW)',
       'Ancillary Volume Accepted - DC-H - GB (MW)',
       'Ancillary Volume Accepted - DC-L - GB (MW)',
       # 'Ancillary Volume Accepted - DR-H - GB (MW)',
       # 'Ancillary Volume Accepted - DR-L - GB (MW)',
       # 'Ancillary Volume Accepted - DM-H - GB (MW)',
       # 'Ancillary Volume Accepted - DM-L - GB (MW)',
       'Ancillary Price - DC-H - GB (£/MW/h)',
       'Ancillary Price - DC-L - GB (£/MW/h)',
       # 'Ancillary Price - DR-H - GB (£/MW/h)',
       # 'Ancillary Price - DR-L - GB (£/MW/h)',
       # 'Ancillary Price - DM-H - GB (£/MW/h)',
       # 'Ancillary Price - DM-L - GB (£/MW/h)'
       ]
)

## Null values
Before EAC trend-break there were a lot of null values in DFR dataset (up to almost 10%). 

In [7]:
# EAC Introduction: 1st of November 2023
# A lot of null values before that date (try start_date='2023-01-01 00:00')
# After EAC there's virtually no nulls (try start_date='2023-11-01 00:00')
null_summary(df_ancillary, start_date='2023-11-01 00:00')

Unnamed: 0_level_0,Absolute Nulls,Relative Nulls (%)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
Volume Requirements Forecast - DC-H - GB (MW),18,0.7
Volume Requirements Forecast - DC-L - GB (MW),18,0.7
Volume Requirements Forecast - DR-H - GB (MW),0,0.0
Volume Requirements Forecast - DR-L - GB (MW),0,0.0
Volume Requirements Forecast - DM-H - GB (MW),0,0.0
Volume Requirements Forecast - DM-L - GB (MW),0,0.0
Ancillary Volume Accepted - DC-H - GB (MW),0,0.0
Ancillary Volume Accepted - DC-L - GB (MW),0,0.0
Ancillary Volume Accepted - DR-H - GB (MW),0,0.0
Ancillary Volume Accepted - DR-L - GB (MW),0,0.0


In [42]:
null_summary(df_day_ahead_hourly, start_date='2023-11-01 00:00')

Unnamed: 0_level_0,Absolute Nulls,Relative Nulls (%)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
"Day Ahead Price (N2EX, local) - GB (£/MWh)",1,0.01
"Day Ahead Price (EPEX, local) - GB (£/MWh)",0,0.0


In [44]:
null_summary(df_day_ahead_half, start_date='2023-11-01 00:00')

Unnamed: 0_level_0,Absolute Nulls,Relative Nulls (%)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
National Demand Forecast (NDF) - GB (MW),0,0.0
"Day Ahead Price (EPEX half-hourly, local) - GB (£/MWh)",0,0.0


## Model A

### Match 30-minute timeslots
I considered two options for aligning day-ahead market data with DFR 4-hour blocks: 
1. expanding DFR data to match half-hourly granularity. 
2. aggregating day-ahead data to match DFR blocks  

I chose to use both and model them seperately.

Model A: First option to preserve granular details, which are crucial for forecasting short-term prices in our small dataset. This approach helps maintain the integrity of highly granular signals essential for accurate forecasting. Aligning lowest granularity 30min day-ahead market data with DFR 4-Hour blocks by essentially copying the datapoints a number of times (x8 for DFR, x2 for hourly day-ahead). Also introduce a column to keep track of original datapoint.

Model B: This model uses aggregated data on a 4-hour block level to capture larger patterns and trends in the data. By aggregating, we can identify and leverage broader market movements and demand cycles that may not be apparent at a more granular level. This approach helps in understanding the overall dynamics of the market, which can be crucial for long-term forecasting and strategic decision-making. Aggregation can also smooth out short-term noise, providing a clearer picture of underlying trends that influence DFR prices.

In [6]:
df_ancillary_30 = pd.DataFrame(np.repeat(df_ancillary.values, repeats=8, axis=0), columns=df_ancillary.columns)
# Create the is_original column
is_original = np.tile([1] + [0]*7, len(df_ancillary))
df_ancillary_30['is_original'] = is_original
periods = 2*365*6*8 # 2 years of 4 hour daily blocks (6/day) with two 30min timeslots per hour
df_ancillary_30.index = pd.date_range(start='2023-01-01 03:00', periods=periods, freq='30min') # end='2024-12-31 02:30', 30mins before new year new timeslot
df_ancillary_30.value_counts('is_original')
df_ancillary_30

Unnamed: 0,Volume Requirements Forecast - DC-H - GB (MW),Volume Requirements Forecast - DC-L - GB (MW),Volume Requirements Forecast - DR-H - GB (MW),Volume Requirements Forecast - DR-L - GB (MW),Volume Requirements Forecast - DM-H - GB (MW),Volume Requirements Forecast - DM-L - GB (MW),Ancillary Volume Accepted - DC-H - GB (MW),Ancillary Volume Accepted - DC-L - GB (MW),Ancillary Volume Accepted - DR-H - GB (MW),Ancillary Volume Accepted - DR-L - GB (MW),Ancillary Volume Accepted - DM-H - GB (MW),Ancillary Volume Accepted - DM-L - GB (MW),Ancillary Price - DC-H - GB (£/MW/h),Ancillary Price - DC-L - GB (£/MW/h),Ancillary Price - DR-H - GB (£/MW/h),Ancillary Price - DR-L - GB (£/MW/h),Ancillary Price - DM-H - GB (£/MW/h),Ancillary Price - DM-L - GB (£/MW/h),is_original
2023-01-01 03:00:00,839.0,687.0,80.0,80.0,80.0,80.0,689.0,818.0,84.0,12.0,52.0,,1.76,3.5,0.00,3.00,6.50,,1
2023-01-01 03:30:00,839.0,687.0,80.0,80.0,80.0,80.0,689.0,818.0,84.0,12.0,52.0,,1.76,3.5,0.00,3.00,6.50,,0
2023-01-01 04:00:00,839.0,687.0,80.0,80.0,80.0,80.0,689.0,818.0,84.0,12.0,52.0,,1.76,3.5,0.00,3.00,6.50,,0
2023-01-01 04:30:00,839.0,687.0,80.0,80.0,80.0,80.0,689.0,818.0,84.0,12.0,52.0,,1.76,3.5,0.00,3.00,6.50,,0
2023-01-01 05:00:00,839.0,687.0,80.0,80.0,80.0,80.0,689.0,818.0,84.0,12.0,52.0,,1.76,3.5,0.00,3.00,6.50,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31 00:30:00,1245.0,1317.0,330.0,330.0,200.0,170.0,1319.0,1289.0,350.0,308.0,200.0,169.0,1.75,4.0,-2.05,10.25,0.45,5.22,0
2024-12-31 01:00:00,1245.0,1317.0,330.0,330.0,200.0,170.0,1319.0,1289.0,350.0,308.0,200.0,169.0,1.75,4.0,-2.05,10.25,0.45,5.22,0
2024-12-31 01:30:00,1245.0,1317.0,330.0,330.0,200.0,170.0,1319.0,1289.0,350.0,308.0,200.0,169.0,1.75,4.0,-2.05,10.25,0.45,5.22,0
2024-12-31 02:00:00,1245.0,1317.0,330.0,330.0,200.0,170.0,1319.0,1289.0,350.0,308.0,200.0,169.0,1.75,4.0,-2.05,10.25,0.45,5.22,0


In [7]:
df_day_ahead_hourly_30 = pd.DataFrame(np.repeat(df_day_ahead_hourly.values, repeats=2, axis=0), columns=df_day_ahead_hourly.columns)
# Create the is_original column
is_original = np.tile([1] + [0], len(df_day_ahead_hourly))
df_day_ahead_hourly_30['is_original'] = is_original
periods=2*365*24*2 # 2 years of 24 hour daily blocks with two 30min timeslots per hour
df_day_ahead_hourly_30.index = pd.date_range(start='2023-01-01 00:00', periods=periods, freq='30min') # end='2024-12-3O 23:30', 30mins before new year new timeslot
df_day_ahead_hourly_30.value_counts('is_original')
df_day_ahead_hourly_30

Unnamed: 0,"Day Ahead Price (N2EX, local) - GB (£/MWh)","Day Ahead Price (EPEX, local) - GB (£/MWh)",is_original
2023-01-01 00:00:00,13.01,60.0,1
2023-01-01 00:30:00,13.01,60.0,0
2023-01-01 01:00:00,25.05,49.5,1
2023-01-01 01:30:00,25.05,49.5,0
2023-01-01 02:00:00,0.99,18.0,1
...,...,...,...
2024-12-30 21:30:00,106.43,98.5,0
2024-12-30 22:00:00,90.40,84.5,1
2024-12-30 22:30:00,90.40,84.5,0
2024-12-30 23:00:00,72.00,62.7,1


In [8]:
df_day_ahead_half['is_original'] = 1
df_day_ahead_half.value_counts('is_original')

is_original
1    35040
Name: count, dtype: int64

In [9]:
df_ancillary_30.drop(columns=['is_original'], inplace=True)
df_day_ahead_hourly_30.drop(columns=['is_original'], inplace=True)
df_day_ahead_half.drop(columns=['is_original'], inplace=True)

### Subset for full-days in ancillary dataset (WRONG PLACE?)

In [10]:
# we now have all three datasets on a 30minute format, but we'll have to make the datetime indices match now
# the ancillary dataset is the smallest (least amount of rows) so we'll subset the other two datasets to match it
# we'll make use of the "day" (day starts at 23:00 on t-1 and ends at 23:00 on t) syntax of DFR markets
print("Amount of full days in set: ",df_ancillary_30.loc['2023-01-01 23:00':'2024-12-30 22:30'].shape[0]/8/6)
df_ancillary_resampled = df_ancillary_30.loc['2023-01-01 23:00':'2024-12-30 22:30']
# there's 729 full "day"s in the ancillary dataset, so we'll subset the other two datasets to match this
df_day_ahead_hourly_resampled = df_day_ahead_hourly_30.loc['2023-01-01 23:00':'2024-12-30 22:30']
df_day_ahead_half_resampled = df_day_ahead_half.loc['2023-01-01 23:00':'2024-12-30 22:30']

print(
    'df_ancillary_resampled:', df_ancillary_resampled.shape,
    '\ndf_day_ahead_hourly_resampled:', df_day_ahead_hourly_resampled.shape,
    '\ndf_day_ahead_half_resampled:', df_day_ahead_half_resampled.shape
)

Amount of full days in set:  729.0
df_ancillary_resampled: (34992, 18) 
df_day_ahead_hourly_resampled: (34992, 2) 
df_day_ahead_half_resampled: (34992, 2)


### Merge three datasets (WRONG PLACE?)

In [11]:
# we'll now merge the three datasets by the datetime index
df_merged_A = pd.concat([df_ancillary_resampled, df_day_ahead_hourly_resampled, df_day_ahead_half_resampled], axis=1)
df_merged_A.shape

del df_ancillary_resampled, df_day_ahead_hourly_resampled, df_day_ahead_half_resampled

## Model B

### Match 4-hour EFA blocks

To do:
- Subset all datasets for all full "days" (days by definition of DFR markets), i.e. 11PM d-1 to 11PM d.
- Day ahead hourly
    - Include the 4 hourly values for that EFA block as features (groupby EFA block and don't summarize but include as features)
    - Summarization of full day, min/max/mean/std (groupby day and don't summarize but include as features)
- Day ahead half-hourly
    - Lagged: Include the 8 half-hourly features for that EFA block as features
    - Lagged: Summarization of full day, min/max/mean/std (idk if this makes sense here because lagged)

In [45]:
# get the number of full "days" in ancillary
full_days = df_ancillary.loc['2023-01-01 23:00':'2024-12-30 19:00'].shape[0]/6
print("Number of full days in dataset: ", full_days)

# subset the data to only include the full "days"
df_ancillary_resampled = df_ancillary.loc['2023-01-01 23:00':'2024-12-30 19:00']
df_day_ahead_hourly_resampled = df_day_ahead_hourly.loc['2023-01-01 23:00':'2024-12-30 22:00'] # 4 datapoints per EFA block
df_day_ahead_half_resampled = df_day_ahead_half.loc['2023-01-01 23:00':'2024-12-30 22:30'] # 8 datapoints per EFA block

print(
    'df_ancillary_resampled:', df_ancillary_resampled.shape,
    '\ndf_day_ahead_hourly_resampled:', df_day_ahead_hourly_resampled.shape,
    '\ndf_day_ahead_half_resampled:', df_day_ahead_half_resampled.shape
)

Number of full days in dataset:  729.0
df_ancillary_resampled: (4374, 18) 
df_day_ahead_hourly_resampled: (17496, 2) 
df_day_ahead_half_resampled: (34992, 2)


# Feature Engineering
Techniques included

| Technique              | Why                                                |
|------------------------|-----------------------------------------------------------------------|
| Datetime Features      | Incorporates temporal trends like hour of day, day of week, etc.     |
| Sinusoid Transform     | Models seasonal patterns and cyclical effects in the data.           |
| Lagged Features        | Accounts for temporal dependencies and past influences on current data.|
| Summarization (Min, Max, Mean, Std) | Provides statistical insights into data distribution and variability. |
| Holidays               | Captures the impact of holidays on energy consumption patterns.      |


In [11]:
df_merged_A.shape

(34992, 22)

## Model A

### Datetime features
This function extracts and adds temporal features to a DataFrame, including normalized year, week of the month, day of the week, hour of the day, half-hour of the day, and EFA block, assuming the DataFrame index is a DatetimeIndex.

In [12]:
date_df = create_datetime_features(df_merged_A)
date_df

Unnamed: 0,tm_d,tm_w,tm_m,tm_y,tm_wm,tm_dw,tm_w_end,hour_of_day,halfhour_of_day,efa_block
2023-01-01 23:00:00,1,52,1,0,1,6,1,23,46,1
2023-01-01 23:30:00,1,52,1,0,1,6,1,23,47,1
2023-01-02 00:00:00,2,1,1,0,1,0,0,0,0,1
2023-01-02 00:30:00,2,1,1,0,1,0,0,0,1,1
2023-01-02 01:00:00,2,1,1,0,1,0,0,1,2,1
...,...,...,...,...,...,...,...,...,...,...
2024-12-30 20:30:00,30,1,12,1,5,0,0,20,41,6
2024-12-30 21:00:00,30,1,12,1,5,0,0,21,42,6
2024-12-30 21:30:00,30,1,12,1,5,0,0,21,43,6
2024-12-30 22:00:00,30,1,12,1,5,0,0,22,44,6


### Sinusoid

In [13]:
sinus_date_df = create_sincos_datetime_features(date_df)
sinus_date_df

Unnamed: 0,tm_d,tm_w,tm_m,tm_y,tm_w_end,sin_tm_wm,cos_tm_wm,sin_tm_dw,cos_tm_dw,sin_hour_of_day,cos_hour_of_day,sin_halfhour_of_day,cos_halfhour_of_day,sin_efa_block,cos_efa_block
2023-01-01 23:00:00,1,52,1,0,1,9.510565e-01,0.309017,-2.449294e-16,1.0,-2.449294e-16,1.000000,-1.332870e-01,0.991077,8.660254e-01,0.5
2023-01-01 23:30:00,1,52,1,0,1,9.510565e-01,0.309017,-2.449294e-16,1.0,-2.449294e-16,1.000000,6.432491e-16,1.000000,8.660254e-01,0.5
2023-01-02 00:00:00,2,1,1,0,0,9.510565e-01,0.309017,0.000000e+00,1.0,0.000000e+00,1.000000,0.000000e+00,1.000000,8.660254e-01,0.5
2023-01-02 00:30:00,2,1,1,0,0,9.510565e-01,0.309017,0.000000e+00,1.0,0.000000e+00,1.000000,1.332870e-01,0.991077,8.660254e-01,0.5
2023-01-02 01:00:00,2,1,1,0,0,9.510565e-01,0.309017,0.000000e+00,1.0,2.697968e-01,0.962917,2.641954e-01,0.964469,8.660254e-01,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-30 20:30:00,30,1,12,1,0,-2.449294e-16,1.000000,0.000000e+00,1.0,-7.308360e-01,0.682553,-7.188237e-01,0.695192,-2.449294e-16,1.0
2024-12-30 21:00:00,30,1,12,1,0,-2.449294e-16,1.000000,0.000000e+00,1.0,-5.195840e-01,0.854419,-6.197499e-01,0.784799,-2.449294e-16,1.0
2024-12-30 21:30:00,30,1,12,1,0,-2.449294e-16,1.000000,0.000000e+00,1.0,-5.195840e-01,0.854419,-5.096166e-01,0.860402,-2.449294e-16,1.0
2024-12-30 22:00:00,30,1,12,1,0,-2.449294e-16,1.000000,0.000000e+00,1.0,-2.697968e-01,0.962917,-3.903893e-01,0.920650,-2.449294e-16,1.0


### Lagged features
Idea is to have 2 features with both day ahead markets, and 7 features of DFR markets (to put emphasis on DFR) -> Overfit?
- 2D half hourly (half hourly = all lagged)
- 1D hourly (hourly = 1 regular, 1 lagged)
- 7D ancillary prices / volumes 

In [14]:
# Create 2D lagged features for day ahead hourly prices, drop target because lagged
lag_half_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(2*48)+1, 48)], target='Day Ahead Price (EPEX half-hourly, local) - GB (£/MWh)', drop_target=True)

# Create 7D lagged features for ancillary prices, drop target because lagged
lag_DCH_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Price - DC-H - GB (£/MW/h)', drop_target=True)
lag_DCL_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Price - DC-L - GB (£/MW/h)', drop_target=True)
lag_DMH_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Price - DM-H - GB (£/MW/h)', drop_target=True)
lag_DML_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Price - DM-L - GB (£/MW/h)', drop_target=True)
lag_DRH_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Price - DR-H - GB (£/MW/h)', drop_target=True)
lag_DRL_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Price - DR-L - GB (£/MW/h)', drop_target=True)

# Create 7D lagged features for ancillary volumes, drop target because lagged
lag_DCH_volume_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Volume Accepted - DC-H - GB (MW)', drop_target=True)
lag_DCL_volume_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Volume Accepted - DC-L - GB (MW)', drop_target=True)
lag_DMH_volume_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Volume Accepted - DM-H - GB (MW)', drop_target=True)
lag_DML_volume_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Volume Accepted - DM-L - GB (MW)', drop_target=True)
lag_DRH_volume_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Volume Accepted - DR-H - GB (MW)', drop_target=True)
lag_DRL_volume_df = create_lagged_features(df_merged_A, lag_days=[col for col in range(48,(7*48)+1, 48)], target='Ancillary Volume Accepted - DR-L - GB (MW)', drop_target=True)

# Create 1D lagged feature for day ahead hourly, don't drop target because regular
lag_hourly_n2_df = create_lagged_features(df_merged_A, lag_days=[48], target='Day Ahead Price (N2EX, local) - GB (£/MWh)', drop_target=False)
lag_hourly_epex_df = create_lagged_features(df_merged_A, lag_days=[48], target='Day Ahead Price (EPEX, local) - GB (£/MWh)', drop_target=False)

# Merge dataframes with lagged features on datetime index and drop all uncessary dataframes
df_merged_A_lag = pd.concat([lag_half_df, lag_DCH_df, lag_DCL_df, lag_DMH_df, lag_DML_df, lag_DRH_df, lag_DRL_df, lag_DCH_volume_df, lag_DCL_volume_df, lag_DMH_volume_df, lag_DML_volume_df, lag_DRH_volume_df, lag_DRL_volume_df, lag_hourly_n2_df, lag_hourly_epex_df], axis=1)

del lag_half_df, lag_DCH_df, lag_DCL_df, lag_DMH_df, lag_DML_df, lag_DRH_df, lag_DRL_df, lag_DCH_volume_df, lag_DCL_volume_df, lag_DMH_volume_df, lag_DML_volume_df, lag_DRH_volume_df, lag_DRL_volume_df, lag_hourly_n2_df, lag_hourly_epex_df

df_merged_A_lag

Unnamed: 0,"Day Ahead Price (EPEX half-hourly, local) - GB (£/MWh)_lag_48","Day Ahead Price (EPEX half-hourly, local) - GB (£/MWh)_lag_96",Ancillary Price - DC-H - GB (£/MW/h)_lag_48,Ancillary Price - DC-H - GB (£/MW/h)_lag_96,Ancillary Price - DC-H - GB (£/MW/h)_lag_144,Ancillary Price - DC-H - GB (£/MW/h)_lag_192,Ancillary Price - DC-H - GB (£/MW/h)_lag_240,Ancillary Price - DC-H - GB (£/MW/h)_lag_288,Ancillary Price - DC-H - GB (£/MW/h)_lag_336,Ancillary Price - DC-L - GB (£/MW/h)_lag_48,...,Ancillary Volume Accepted - DR-L - GB (MW)_lag_96,Ancillary Volume Accepted - DR-L - GB (MW)_lag_144,Ancillary Volume Accepted - DR-L - GB (MW)_lag_192,Ancillary Volume Accepted - DR-L - GB (MW)_lag_240,Ancillary Volume Accepted - DR-L - GB (MW)_lag_288,Ancillary Volume Accepted - DR-L - GB (MW)_lag_336,"Day Ahead Price (N2EX, local) - GB (£/MWh)","Day Ahead Price (N2EX, local) - GB (£/MWh)_lag_48","Day Ahead Price (EPEX, local) - GB (£/MWh)","Day Ahead Price (EPEX, local) - GB (£/MWh)_lag_48"
2023-01-01 23:00:00,,,,,,,,,,,...,,,,,,,150.03,,150.0,
2023-01-01 23:30:00,,,,,,,,,,,...,,,,,,,150.03,,150.0,
2023-01-02 00:00:00,,,,,,,,,,,...,,,,,,,185.25,,145.1,
2023-01-02 00:30:00,,,,,,,,,,,...,,,,,,,185.25,,145.1,
2023-01-02 01:00:00,,,,,,,,,,,...,,,,,,,192.79,,146.1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-30 20:30:00,97.44,105.00,2.14,1.89,1.11,1.69,1.0,1.0,1.0,3.36,...,330.0,330.0,330.0,330.0,330.0,330.0,107.04,97.22,97.1,94.0
2024-12-30 21:00:00,106.10,96.00,2.14,1.89,1.11,1.69,1.0,1.0,1.0,3.36,...,330.0,330.0,330.0,330.0,330.0,330.0,106.43,91.96,98.5,89.5
2024-12-30 21:30:00,91.96,91.00,2.14,1.89,1.11,1.69,1.0,1.0,1.0,3.36,...,330.0,330.0,330.0,330.0,330.0,330.0,106.43,91.96,98.5,89.5
2024-12-30 22:00:00,99.00,92.70,2.14,1.89,1.11,1.69,1.0,1.0,1.0,3.36,...,330.0,330.0,330.0,330.0,330.0,330.0,90.40,77.70,84.5,78.1


### Summarization
Min, Max, Mean, Std

In [14]:
"To Do"

'To Do'

### Holidays

In [15]:
"To Do"

'To Do'

### Merging all FE datasets 
Starting with DC-L, market with highest volume 

In [15]:
# Merge all the FE datasets
X = pd.concat([sinus_date_df, df_merged_A_lag], axis=1)
y = df_merged_A["Ancillary Price - DC-L - GB (£/MW/h)"]

print(X.shape, y.shape)

(34992, 105) (34992,)


## Model B

Unnamed: 0_level_0,tm_d,tm_w,tm_m,tm_y,tm_wm,tm_dw,tm_w_end,hour_of_day,halfhour_of_day,efa_block
GMT Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-01-01 23:00:00,1,52,1,0,1,6,1,23,46,1
2023-01-02 00:00:00,2,1,1,0,1,0,0,0,0,1
2023-01-02 01:00:00,2,1,1,0,1,0,0,1,2,1
2023-01-02 02:00:00,2,1,1,0,1,0,0,2,4,1
2023-01-02 03:00:00,2,1,1,0,1,0,0,3,6,2
2023-01-02 04:00:00,2,1,1,0,1,0,0,4,8,2
2023-01-02 05:00:00,2,1,1,0,1,0,0,5,10,2
2023-01-02 06:00:00,2,1,1,0,1,0,0,6,12,2
2023-01-02 07:00:00,2,1,1,0,1,0,0,7,14,3
2023-01-02 08:00:00,2,1,1,0,1,0,0,8,16,3


# Train

## Model A
You need to be wary of data leakage when working with lagged features, you can't split into train/test set after you've already made the lagged features

In [16]:
train = df_merged_A.loc[:'2024-10-30 22:30']
test = df_merged_A.loc['2024-10-30 23:00':]

# Create features for train
train_date = create_datetime_features(train)
train_sinus_date = create_sincos_datetime_features(train_date)
train_lag = create_all_lagged_features(train)
train_X = pd.concat([train_sinus_date, train_lag], axis=1)

# Create features for test
test_date = create_datetime_features(test)
test_sinus_date = create_sincos_datetime_features(test_date)
test_lag = create_all_lagged_features(test)
test_X = pd.concat([test_sinus_date, test_lag], axis=1)

In [23]:
train_X = clean_column_names(train_X)
test_X = clean_column_names(test_X)

In [34]:
# Define the target column
TARGET = "Ancillary Price - DC-L - GB (£/MW/h)"

# Split the data into train and test sets, 2 last month's = TEST
train = df_merged_A.loc[:'2024-10-30 22:30']
test = df_merged_A.loc['2024-10-30 23:00':]

# Create features for train
train_date = create_datetime_features(train)
train_sinus_date = create_sincos_datetime_features(train_date)
train_lag = create_all_lagged_features(train)
train_X = pd.concat([train_sinus_date, train_lag], axis=1)
train_y = train[TARGET]

train_X = clean_column_names(train_X)

# Create features for test
test_date = create_datetime_features(test)
test_sinus_date = create_sincos_datetime_features(test_date)
test_lag = create_all_lagged_features(test)
test_X = pd.concat([test_sinus_date, test_lag], axis=1)
test_y = test[TARGET]

test_X = clean_column_names(test_X)

# Use TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=8)

# Define the LightGBM model with L1 regularization (Lasso)
model = lgb.LGBMRegressor(lambda_l1=1.0)  # L1 regularization

# Train the model using cross-validation
for train_index, val_index in tscv.split(train_X):
    X_train, X_val = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]

    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='mean_absolute_percentage_error')

# Evaluate the model on the test set
predictions = model.predict(test_X)
mse = mean_squared_error(test_y, predictions)
print(f"Mean Squared Error on Test Set: {mse}")

# Example usage
print(predictions)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14958
[LightGBM] [Info] Number of data points in the train set: 3568, number of used features: 104
[LightGBM] [Info] Start training from score 4.251054
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008534 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17729
[LightGBM] [Info] Number of data points in the train set: 7130, number of used features: 104
[LightGBM] [Info] Start training from score 3.039057
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

In [35]:
# Merge predictions and test_y into a single dataframe
results = pd.DataFrame({"predictions": predictions, "test_y": test_y})
visualize_data(results, is_price=True)