In [1]:
import numpy as np
import datetime as dt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
!pip3 install graycode
import graycode



# Day ahead prices data

In [3]:
# one CSV export per calendar year, listed in chronological order
price_files = (
    'data/Day-ahead Prices_201501010000-201601010000.csv',
    'data/Day-ahead Prices_201601010000-201701010000.csv',
    'data/Day-ahead Prices_201701010000-201801010000.csv',
    'data/Day-ahead Prices_201801010000-201901010000.csv',
    'data/Day-ahead Prices_201901010000-202001010000.csv',
    'data/Day-ahead Prices_202001010000-202101010000.csv',
)
(price_2015, price_2016, price_2017,
 price_2018, price_2019, price_2020) = (pd.read_csv(path) for path in price_files)
# data of january 1 to 4 is missing, so drop the first 96 rows (4 days x 24 hourly rows)
price_2015 = price_2015.iloc[96:]

In [4]:
# stack the yearly frames in chronological order and renumber the rows from 0
yearly_frames = [price_2015, price_2016, price_2017, price_2018, price_2019, price_2020]
day_ahead = pd.concat(yearly_frames, ignore_index=True)

In [5]:
# preview the combined price frame built from the yearly files
day_ahead

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh]
0,05.01.2015 00:00 - 05.01.2015 01:00,36.56
1,05.01.2015 01:00 - 05.01.2015 02:00,34.94
2,05.01.2015 02:00 - 05.01.2015 03:00,32.19
3,05.01.2015 03:00 - 05.01.2015 04:00,28.05
4,05.01.2015 04:00 - 05.01.2015 05:00,28.04
...,...,...
52513,31.12.2020 19:00 - 31.12.2020 20:00,61.51
52514,31.12.2020 20:00 - 31.12.2020 21:00,56.79
52515,31.12.2020 21:00 - 31.12.2020 22:00,52.44
52516,31.12.2020 22:00 - 31.12.2020 23:00,51.86


In [6]:
# list every row that has a missing value in any column
has_missing = day_ahead.isnull().any(axis=1)
day_ahead[has_missing]

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh]
1994,29.03.2015 02:00 - 29.03.2015 03:00,
10731,27.03.2016 02:00 - 27.03.2016 03:00,
19468,26.03.2017 02:00 - 26.03.2017 03:00,
28205,25.03.2018 02:00 - 25.03.2018 03:00,
37110,31.03.2019 02:00 - 31.03.2019 03:00,
45847,29.03.2020 02:00 - 29.03.2020 03:00,


In [7]:
# fill the NaN prices by second-order polynomial interpolation (provisional choice)
price_column = 'Day-ahead Price [EUR/MWh]'
day_ahead[price_column] = day_ahead[price_column].interpolate(method='polynomial', order=2)

In [8]:
# list the time slots that occur exactly twice
slot_counts = day_ahead['MTU (CET)'].value_counts()
occurrences = day_ahead['MTU (CET)'].map(slot_counts)
day_ahead[occurrences == 2]

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh]
7034,25.10.2015 02:00 - 25.10.2015 03:00,25.07
7035,25.10.2015 02:00 - 25.10.2015 03:00,25.02
15939,30.10.2016 02:00 - 30.10.2016 03:00,38.62
15940,30.10.2016 02:00 - 30.10.2016 03:00,38.05
24676,29.10.2017 02:00 - 29.10.2017 03:00,3.27
24677,29.10.2017 02:00 - 29.10.2017 03:00,-13.1
33413,28.10.2018 02:00 - 28.10.2018 03:00,55.48
33414,28.10.2018 02:00 - 28.10.2018 03:00,51.8
42150,27.10.2019 02:00 - 27.10.2019 03:00,14.25
42151,27.10.2019 02:00 - 27.10.2019 03:00,25.81


In [9]:
# a slot that appears more than once (because of time zone change) is collapsed
# to a single row holding the maximum price; rows come back ordered by the slot text
day_ahead = day_ahead.groupby('MTU (CET)', as_index=False)['Day-ahead Price [EUR/MWh]'].max()

In [10]:
# the text before the first '-' is the start of the slot; it keeps a trailing
# space, which the format string accounts for
slot_start = day_ahead['MTU (CET)'].str.split('-').str[0]
day_ahead['datetime'] = pd.to_datetime(slot_start, format="%d.%m.%Y %H:%M ")

In [11]:
# preview the de-duplicated prices with the parsed 'datetime' column
day_ahead

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh],datetime
0,01.01.2016 00:00 - 01.01.2016 01:00,23.86,2016-01-01 00:00:00
1,01.01.2016 01:00 - 01.01.2016 02:00,22.39,2016-01-01 01:00:00
2,01.01.2016 02:00 - 01.01.2016 03:00,20.59,2016-01-01 02:00:00
3,01.01.2016 03:00 - 01.01.2016 04:00,16.81,2016-01-01 03:00:00
4,01.01.2016 04:00 - 01.01.2016 05:00,17.41,2016-01-01 04:00:00
...,...,...,...
52507,31.12.2020 19:00 - 31.12.2020 20:00,61.51,2020-12-31 19:00:00
52508,31.12.2020 20:00 - 31.12.2020 21:00,56.79,2020-12-31 20:00:00
52509,31.12.2020 21:00 - 31.12.2020 22:00,52.44,2020-12-31 21:00:00
52510,31.12.2020 22:00 - 31.12.2020 23:00,51.86,2020-12-31 22:00:00


In [12]:
# sanity check: any row still containing a missing value is listed here
incomplete_rows = day_ahead.isnull().any(axis=1)
day_ahead[incomplete_rows]

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh],datetime


# Weather data

In [13]:
# load the parsed weather records
weather_path = 'data/parsed_weather.json'
weather = pd.read_json(weather_path)

In [14]:
# keep only rows dated january 5 2015 to december 31 2020
# (the comparison with start_date is strict, so 2015-01-04 itself is excluded)
weather['date'] = pd.to_datetime(weather['date'])
start_date = '2015-1-4'
end_date = '2020-12-31'
mask = (weather['date'] > start_date) & (weather['date'] <= end_date)
# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger SettingWithCopyWarning on a slice of the unfiltered frame
weather = weather.loc[mask].copy()
# 'time' holds the hour as hour * 100, so time / 100 is the hour offset to add to the date;
# the lowercase 'h' unit is used because the uppercase 'H' alias is deprecated in recent pandas
weather['datetime'] = weather['date'] + pd.to_timedelta(weather['time'] / 100, unit='h')
weather = weather.replace({'time': {0: 2400}})  # convert 0 to 2400, for later convenience

In [15]:
# each weather description is a list; join its items into one comma-separated string
weather['weatherDesc'] = weather['weatherDesc'].apply(
    lambda parts: ', '.join(str(part) for part in parts)
)

In [16]:
# converting categorical variables to numerical

# option 1: label encoding (every distinct category is replaced by an integer code)

# wind direction
winddir_encoder = LabelEncoder()
weather['winddir16Point'] = winddir_encoder.fit_transform(weather['winddir16Point'])
# weather description (the encoder is named `winddesc_encoder`)
winddesc_encoder = LabelEncoder()
weather['weatherDesc'] = winddesc_encoder.fit_transform(weather['weatherDesc'])


# option 2: one-hot encoding
# (disabled: the code is wrapped in a string literal, so it is displayed but not executed)
'''
one_hot_winddir = pd.get_dummies(weather['winddir16Point'], prefix='winddir_')
weather = weather.join(one_hot_winddir)
one_hot_weatherDesc = pd.get_dummies(weather['weatherDesc'], prefix='weatherDesc_')
weather = weather.join(one_hot_weatherDesc)
weather = weather.drop(['winddir16Point', 'weatherDesc'], axis=1) 
'''

"\none_hot_winddir = pd.get_dummies(weather['winddir16Point'], prefix='winddir_')\nweather = weather.join(one_hot_winddir)\none_hot_weatherDesc = pd.get_dummies(weather['weatherDesc'], prefix='weatherDesc_')\nweather = weather.join(one_hot_weatherDesc)\nweather = weather.drop(['winddir16Point', 'weatherDesc'], axis=1) \n"

In [17]:
# preview the filtered weather frame with label-encoded categorical columns
weather

Unnamed: 0,date,time,tempC,windspeedKmph,winddirDegree,winddir16Point,weatherDesc,precipMM,humidity,pressure,HeatIndexC,FeelsLikeC,datetime
96,2015-01-05,2400,0,12,203,11,31,0.0,96,1037,0,-4,2015-01-05 00:00:00
97,2015-01-05,100,0,13,206,11,31,0.0,96,1036,0,-4,2015-01-05 01:00:00
98,2015-01-05,200,0,14,209,11,31,0.0,96,1036,0,-4,2015-01-05 02:00:00
99,2015-01-05,300,0,15,212,11,31,0.0,96,1035,0,-5,2015-01-05 03:00:00
100,2015-01-05,400,0,14,207,11,31,0.0,96,1035,0,-5,2015-01-05 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52603,2020-12-31,1900,4,10,217,12,14,0.3,94,1000,4,2,2020-12-31 19:00:00
52604,2020-12-31,2000,4,12,225,12,14,0.2,93,1000,4,1,2020-12-31 20:00:00
52605,2020-12-31,2100,4,14,234,12,14,0.3,93,1001,4,0,2020-12-31 21:00:00
52606,2020-12-31,2200,3,12,269,13,14,0.2,92,1003,3,0,2020-12-31 22:00:00


# Merge two dataset

In [18]:
# outer join on the timestamp: a timestamp present in only one of the two frames is kept
total = day_ahead.merge(weather, on='datetime', how='outer')

In [19]:
# use the timestamp as the index and put the rows in chronological order
total = total.set_index('datetime').sort_index()

In [20]:
# check that every hour from 2015-01-05 00:00 through 2020-12-31 23:00 is present in the index.
# The range ends at 23:00 (the last hour the test split uses) so the final afternoon
# is checked too, and ISO dates avoid any day/month ambiguity.
# NOTE(review): the frames were merged with how='outer', so this only proves the
# timestamp exists in at least one source; it does not rule out NaN rows.
expected_hours = pd.date_range("2015-01-05 00:00", "2020-12-31 23:00", freq="60min")
expected_hours.difference(total.index)

DatetimeIndex([], dtype='datetime64[ns]', freq=None)

In [21]:
# preview the merged price + weather frame, indexed and sorted by datetime
total

Unnamed: 0_level_0,MTU (CET),Day-ahead Price [EUR/MWh],date,time,tempC,windspeedKmph,winddirDegree,winddir16Point,weatherDesc,precipMM,humidity,pressure,HeatIndexC,FeelsLikeC
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015-01-05 00:00:00,05.01.2015 00:00 - 05.01.2015 01:00,36.56,2015-01-05,2400,0,12,203,11,31,0.0,96,1037,0,-4
2015-01-05 01:00:00,05.01.2015 01:00 - 05.01.2015 02:00,34.94,2015-01-05,100,0,13,206,11,31,0.0,96,1036,0,-4
2015-01-05 02:00:00,05.01.2015 02:00 - 05.01.2015 03:00,32.19,2015-01-05,200,0,14,209,11,31,0.0,96,1036,0,-4
2015-01-05 03:00:00,05.01.2015 03:00 - 05.01.2015 04:00,28.05,2015-01-05,300,0,15,212,11,31,0.0,96,1035,0,-5
2015-01-05 04:00:00,05.01.2015 04:00 - 05.01.2015 05:00,28.04,2015-01-05,400,0,14,207,11,31,0.0,96,1035,0,-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 19:00:00,31.12.2020 19:00 - 31.12.2020 20:00,61.51,2020-12-31,1900,4,10,217,12,14,0.3,94,1000,4,2
2020-12-31 20:00:00,31.12.2020 20:00 - 31.12.2020 21:00,56.79,2020-12-31,2000,4,12,225,12,14,0.2,93,1000,4,1
2020-12-31 21:00:00,31.12.2020 21:00 - 31.12.2020 22:00,52.44,2020-12-31,2100,4,14,234,12,14,0.3,93,1001,4,0
2020-12-31 22:00:00,31.12.2020 22:00 - 31.12.2020 23:00,51.86,2020-12-31,2200,3,12,269,13,14,0.2,92,1003,3,0


# Time variable

In [22]:
# hour as an integer: the 'time' column stores hour * 100, with midnight recorded as 2400
hour_times_100 = total['time'].values
time = (hour_times_100 / 100).astype(int)

In [23]:
# incremental representation of time: the hour scaled down by a factor of 10
time_increment = np.true_divide(time, 10)

In [24]:
# gray code binary: one row per timestamp, five bits per row (most significant bit first)
gray_rows = [
    [int(bit) for bit in '{:05b}'.format(graycode.tc_to_gray_code(hour_value))]
    for hour_value in time
]
time_gray_code = np.array(gray_rows, dtype=np.float64).reshape(len(time), 5)

In [25]:
# mutually exclusive binary representation: 24 columns with a single 1 per row
row_positions = np.arange(len(time))
time_exclusive = np.zeros((len(time), 24))
time_exclusive[row_positions, time - 1] = 1  # hour h sets column h - 1
time_exclusive = time_exclusive[:, ::-1]  # reverse array to correspond to binary representation

In [26]:
# show the three encodings of the hour: incremental, gray code, mutually exclusive
for encoding in (time_increment, time_gray_code, time_exclusive):
    print(encoding)

[2.4 0.1 0.2 ... 2.1 2.2 2.3]
[[1. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 1.]
 ...
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 0. 1.]
 [1. 1. 1. 0. 0.]]
[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


## for now, use time increment (change later to check performance)

In [27]:
# pairwise correlation of the numeric columns only.
# `total` still holds the text column 'MTU (CET)' and the datetime column 'date' here;
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so select the
# numeric columns explicitly (this form works on older pandas too).
total.select_dtypes(include='number').corr()

Unnamed: 0,Day-ahead Price [EUR/MWh],time,tempC,windspeedKmph,winddirDegree,winddir16Point,weatherDesc,precipMM,humidity,pressure,HeatIndexC,FeelsLikeC
Day-ahead Price [EUR/MWh],1.0,0.174005,-0.133195,-0.18845,-0.022869,-0.042958,-0.008397,-0.022955,0.031495,0.103419,-0.129315,-0.117097
time,0.174005,1.0,0.095356,-0.01475,0.026469,-0.021589,-0.056038,0.03837,-0.13334,0.000212,0.099139,0.095338
tempC,-0.133195,0.095356,1.0,-0.127865,0.023588,-0.012606,0.154066,0.004941,-0.479239,-0.029382,0.997313,0.991161
windspeedKmph,-0.18845,-0.01475,-0.127865,1.0,0.154376,0.259241,-0.005762,0.203419,0.011223,-0.372671,-0.130051,-0.2156
winddirDegree,-0.022869,0.026469,0.023588,0.154376,1.0,0.680413,-0.002654,0.102686,0.133882,-0.125636,0.020114,0.013143
winddir16Point,-0.042958,-0.021589,-0.012606,0.259241,0.680413,1.0,-0.005816,0.103384,0.192425,-0.18167,-0.015225,-0.032094
weatherDesc,-0.008397,-0.056038,0.154066,-0.005762,-0.002654,-0.005816,1.0,-0.020371,-0.275969,0.004062,0.15681,0.149337
precipMM,-0.022955,0.03837,0.004941,0.203419,0.102686,0.103384,-0.020371,1.0,0.181846,-0.30361,0.00549,-0.012759
humidity,0.031495,-0.13334,-0.479239,0.011223,0.133882,0.192425,-0.275969,0.181846,1.0,-0.121627,-0.48349,-0.46149
pressure,0.103419,0.000212,-0.029382,-0.372671,-0.125636,-0.18167,0.004062,-0.30361,-0.121627,1.0,-0.029405,0.008213


In [28]:
# attach the incremental hour encoding as a new feature column
total = total.assign(time_increment=time_increment)
# alternative (disabled): join the mutually exclusive hour columns instead
#time_ex = pd.DataFrame(data=time_exclusive, index=total.index)
#total = total.join(time_ex)

In [29]:
# columns that are not used as model features
columns_to_drop = [
    # unnecessary bookkeeping columns
    'MTU (CET)', 'date', 'time',
    # wind direction is already covered by the wind degree; weather description is too long/too many
    'winddir16Point', 'weatherDesc',
    # heat index and feels-like temperature: correlation with temperature is > 0.99
    'HeatIndexC', 'FeelsLikeC',
]
total = total.drop(columns=columns_to_drop)

In [30]:
# preview the frame after the unused columns were dropped
total

Unnamed: 0_level_0,Day-ahead Price [EUR/MWh],tempC,windspeedKmph,winddirDegree,precipMM,humidity,pressure,time_increment
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-05 00:00:00,36.56,0,12,203,0.0,96,1037,2.4
2015-01-05 01:00:00,34.94,0,13,206,0.0,96,1036,0.1
2015-01-05 02:00:00,32.19,0,14,209,0.0,96,1036,0.2
2015-01-05 03:00:00,28.05,0,15,212,0.0,96,1035,0.3
2015-01-05 04:00:00,28.04,0,14,207,0.0,96,1035,0.4
...,...,...,...,...,...,...,...,...
2020-12-31 19:00:00,61.51,4,10,217,0.3,94,1000,1.9
2020-12-31 20:00:00,56.79,4,12,225,0.2,93,1000,2.0
2020-12-31 21:00:00,52.44,4,14,234,0.3,93,1001,2.1
2020-12-31 22:00:00,51.86,3,12,269,0.2,92,1003,2.2


# Standardize/Normalize the data

In [31]:
# to standardize, only fit the training data, then transform the whole data.
# later, transform back the result to get the predicted price

In [32]:
# get training dataset: every row up to and including 2020-11-30 23:00
# (label slices on the datetime index include the end point).
# .copy() takes an independent snapshot, so statistics fitted on `total_train`
# cannot be affected when columns of `total` are rescaled afterwards.
total_train = total.loc[:dt.datetime(2020, 11, 30, 23, 0)].copy()

In [33]:
def _standardize_with_train_stats(column):
    """Fit a StandardScaler on the training rows of `column`, then rescale that column of `total`.

    Returns the fitted scaler together with the scaled values as an (n_rows, 1) array.
    """
    scaler = StandardScaler().fit(total_train[column].values.reshape(-1, 1))
    scaled = scaler.transform(total[column].values.reshape(-1, 1))
    total[column] = scaled.flatten()
    return scaler, scaled


# standardize day ahead price
dayahead_scaler, day_ahead_normalized = _standardize_with_train_stats('Day-ahead Price [EUR/MWh]')

# wind angle and humidity are divided by fixed constants (360 and 100)
total['winddirDegree'] = total['winddirDegree'] / 360
total['humidity'] = total['humidity'] / 100

# min-max normalize precipitation with the extremes of the training data only
precip_max = total_train['precipMM'].max()
precip_min = total_train['precipMM'].min()
precip_range = precip_max - precip_min
total['precipMM'] = (total['precipMM'] - precip_min) / precip_range

# standardize wind speed, temperature and pressure
windspeed_scaler, windspeed_normalized = _standardize_with_train_stats('windspeedKmph')
temp_scaler, temp_normalized = _standardize_with_train_stats('tempC')
pressure_scaler, pressure_normalized = _standardize_with_train_stats('pressure')

In [34]:
# preview the frame after standardization / normalization
total

Unnamed: 0_level_0,Day-ahead Price [EUR/MWh],tempC,windspeedKmph,winddirDegree,precipMM,humidity,pressure,time_increment
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-05 00:00:00,-0.243620,-1.722376,-0.422598,0.563889,0.000000,0.96,2.163731,2.4
2015-01-05 01:00:00,-0.316395,-1.722376,-0.297098,0.572222,0.000000,0.96,2.058525,0.1
2015-01-05 02:00:00,-0.439933,-1.722376,-0.171598,0.580556,0.000000,0.96,2.058525,0.2
2015-01-05 03:00:00,-0.625914,-1.722376,-0.046098,0.588889,0.000000,0.96,1.953318,0.3
2015-01-05 04:00:00,-0.626363,-1.722376,-0.171598,0.575000,0.000000,0.96,1.953318,0.4
...,...,...,...,...,...,...,...,...
2020-12-31 19:00:00,0.877205,-1.130468,-0.673599,0.602778,0.054545,0.94,-1.728907,1.9
2020-12-31 20:00:00,0.665169,-1.130468,-0.422598,0.625000,0.036364,0.93,-1.728907,2.0
2020-12-31 21:00:00,0.469755,-1.130468,-0.171598,0.650000,0.054545,0.93,-1.623701,2.1
2020-12-31 22:00:00,0.443699,-1.278445,-0.422598,0.747222,0.036364,0.92,-1.413288,2.2


In [35]:
# write the preprocessed frame (datetime index included) to a CSV file
output_csv = 'day_ahead.csv'
total.to_csv(output_csv)

# Create torch dataset

In [36]:
# sequence length in hours: the number of hourly rows in each input window
# (edit the value for different sequence length)
seq = 36 

In [37]:
# length of one input window
delta = pd.Timedelta(hours=seq)
hours_12 = pd.Timedelta(hours=12)  # used mostly for empty 12 hours
# one-hour and one-day steps; subtracting `hour` from an end bound keeps the
# last item out of a slice when using datetime as index in the dataframe
hour = pd.Timedelta(hours=1)
day = pd.Timedelta(days=1)

In [38]:
### creating training dataset
# Each sample pairs a window of `delta` hours of all columns with the 24 hourly
# prices of one target day; the window ends 12 hours before the target day begins.
train_end = dt.datetime(2020, 11, 30, 23, 0)
lookback = delta + hours_12  # from the start of the input window to the start of the target day
# first target day: 2015-01-05 00:00 plus the lookback rounded up to whole days
train_y_start = dt.datetime(2015, 1, 5, 0, 0) + lookback.ceil('1d')

train_x, train_y = [], []
while True:
    target_last_hour = train_y_start + day - hour
    if target_last_hour > train_end:
        break  # this target day would run past the training period
    train_x_start = train_y_start - lookback
    # label slices on the datetime index include both end points
    train_x.append(total.loc[train_x_start:train_x_start + delta - hour].values)
    train_y.append(total.loc[train_y_start:target_last_hour, 'Day-ahead Price [EUR/MWh]'].values)
    train_y_start += day

train_x = np.asarray(train_x)
train_y = np.asarray(train_y)
print(train_x.shape)
print(train_y.shape)

(2155, 36, 8)
(2155, 24)


In [39]:
# inspect the training targets: one row of 24 hourly prices per target day
train_y

array([[-0.08908577, -0.23778043, -0.20363906, ...,  0.00165841,
         0.16113456,  0.16248224],
       [ 0.08521492, -0.2845002 , -0.27461717, ..., -0.06662434,
         0.0816211 ,  0.09285181],
       [-0.43589024, -0.55134198, -0.5845849 , ..., -0.33166921,
        -0.48934844, -0.5625727 ],
       ...,
       [ 0.31207536,  0.17640833,  0.13597775, ..., -0.26069109,
        -0.2373312 , -0.07291354],
       [-0.0958242 , -0.09088268, -0.22250666, ...,  0.46166848,
         0.35609976,  0.14316541],
       [ 0.2074051 ,  0.04747972,  0.00390455, ..., -0.34963835,
        -0.38243204, -0.43049949]])

In [40]:
### creating testing dataset
# Windows are built as for the training set: `delta` hours of all columns as input,
# a 12-hour gap, then the 24 hourly prices of the target day.
test_end = dt.datetime(2020, 12, 31, 23, 0)
test_y_start = dt.datetime(2020, 12, 1, 0, 0)
window_offset = delta + hours_12  # from the start of the input window to the start of the target day

test_x, test_y = [], []
while True:
    last_target_hour = test_y_start + day - hour
    if last_target_hour > test_end:
        break  # this target day would run past the test period
    test_x_start = test_y_start - window_offset
    # label slices on the datetime index include both end points
    test_x.append(total.loc[test_x_start:test_x_start + delta - hour].values)
    test_y.append(total.loc[test_y_start:last_target_hour, 'Day-ahead Price [EUR/MWh]'].values)
    test_y_start += day

test_x = np.asarray(test_x)
test_y = np.asarray(test_y)
print(test_x.shape)
print(test_y.shape)

(31, 36, 8)
(31, 24)


In [41]:
### all the steps we have taken in preprocessing

# data from 1-1-2015 to 12-31-2020
# remove data between 1-1-2015 to 1-4-2015 since the day ahead data is missing
# for 1 hour missing in day ahead data (because of change in time), used polynomial interpolation with degree 2
# remove rows with same datetime (caused by change in time), take the one with higher day ahead price
# in weather data, label encoded categorical data (wind direction and weather description)
# created time variables (as described in the paper)
# standardize/normalize features using only training data