In [1]:
import numpy as np
import pandas as pd

In [2]:
!pip3 install graycode
import graycode



# Day-ahead prices data

In [3]:
# One CSV file of day-ahead prices exists per calendar year; the file name
# embeds the start of that year and the start of the following year.
PRICE_CSV_TEMPLATE = 'data/Day-ahead Prices_{start}01010000-{end}01010000.csv'


def load_day_ahead_prices(year):
    """Read the day-ahead price CSV covering the calendar year `year`.

    Parameters
    ----------
    year : int
        Calendar year of the file to load, e.g. 2015.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by `pd.read_csv`.
    """
    return pd.read_csv(PRICE_CSV_TEMPLATE.format(start=year, end=year + 1))


price_2015 = load_day_ahead_prices(2015)
# data of january 1 to 4 is missing, so drop the first 96 rows (4 days x 24 hourly rows)
price_2015 = price_2015.iloc[96:]
price_2016 = load_day_ahead_prices(2016)
price_2017 = load_day_ahead_prices(2017)
price_2018 = load_day_ahead_prices(2018)
price_2019 = load_day_ahead_prices(2019)
price_2020 = load_day_ahead_prices(2020)

In [4]:
# Stack the yearly frames in chronological order into a single day-ahead
# price table, renumbering the rows so the index runs continuously from 0.
yearly_prices = [price_2015, price_2016, price_2017, price_2018, price_2019, price_2020]
day_ahead = pd.concat(yearly_prices, ignore_index=True)

In [5]:
# Preview the combined day-ahead price table.
day_ahead

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh]
0,05.01.2015 00:00 - 05.01.2015 01:00,36.56
1,05.01.2015 01:00 - 05.01.2015 02:00,34.94
2,05.01.2015 02:00 - 05.01.2015 03:00,32.19
3,05.01.2015 03:00 - 05.01.2015 04:00,28.05
4,05.01.2015 04:00 - 05.01.2015 05:00,28.04
...,...,...
52513,31.12.2020 19:00 - 31.12.2020 20:00,61.51
52514,31.12.2020 20:00 - 31.12.2020 21:00,56.79
52515,31.12.2020 21:00 - 31.12.2020 22:00,52.44
52516,31.12.2020 22:00 - 31.12.2020 23:00,51.86


In [6]:
# List every row that has a missing value in at least one column.
day_ahead[day_ahead.isnull().any(axis=1)]

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh]
1994,29.03.2015 02:00 - 29.03.2015 03:00,
10731,27.03.2016 02:00 - 27.03.2016 03:00,
19468,26.03.2017 02:00 - 26.03.2017 03:00,
28205,25.03.2018 02:00 - 25.03.2018 03:00,
37110,31.03.2019 02:00 - 31.03.2019 03:00,
45847,29.03.2020 02:00 - 29.03.2020 03:00,


In [7]:
# Fill the gaps in the price series with a polynomial interpolation of
# degree 2 (a provisional choice for now).
price_column = 'Day-ahead Price [EUR/MWh]'
prices_filled = day_ahead[price_column].interpolate(method='polynomial', order=2)
day_ahead[price_column] = prices_filled

In [8]:
# 'MTU (CET)' holds "start - end" interval strings: keep the text before the
# first '-' (the interval start, including its trailing space) and parse it
# into a timestamp column.
interval_start = day_ahead['MTU (CET)'].str.split('-').str[0]
day_ahead['datetime'] = pd.to_datetime(interval_start, format="%d.%m.%Y %H:%M ")

In [9]:
# Preview the price table again, now including the parsed 'datetime' column.
day_ahead

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh],datetime
0,05.01.2015 00:00 - 05.01.2015 01:00,36.56,2015-01-05 00:00:00
1,05.01.2015 01:00 - 05.01.2015 02:00,34.94,2015-01-05 01:00:00
2,05.01.2015 02:00 - 05.01.2015 03:00,32.19,2015-01-05 02:00:00
3,05.01.2015 03:00 - 05.01.2015 04:00,28.05,2015-01-05 03:00:00
4,05.01.2015 04:00 - 05.01.2015 05:00,28.04,2015-01-05 04:00:00
...,...,...,...
52513,31.12.2020 19:00 - 31.12.2020 20:00,61.51,2020-12-31 19:00:00
52514,31.12.2020 20:00 - 31.12.2020 21:00,56.79,2020-12-31 20:00:00
52515,31.12.2020 21:00 - 31.12.2020 22:00,52.44,2020-12-31 21:00:00
52516,31.12.2020 22:00 - 31.12.2020 23:00,51.86,2020-12-31 22:00:00


In [10]:
# Re-run the missing-value check: select rows with a NaN/NaT in any column.
day_ahead[day_ahead.isnull().any(axis=1)]

Unnamed: 0,MTU (CET),Day-ahead Price [EUR/MWh],datetime


# Weather data

In [12]:
# Load the weather records from the JSON file into a DataFrame.
weather = pd.read_json('data/parsed_weather.json')

In [13]:
# Preview the raw weather table.
weather

Unnamed: 0,date,time,tempC,windspeedKmph,winddirDegree,winddir16Point,weatherDesc,precipMM,humidity,pressure,HeatIndexC,FeelsLikeC
0,2015-01-01,0,0,12,226,SW,[Mist],0.0,96,1037,1,-3
1,2015-01-01,100,0,13,217,SW,[Mist],0.0,95,1036,0,-4
2,2015-01-01,200,0,15,209,SSW,[Clear],0.0,95,1035,0,-4
3,2015-01-01,300,0,16,200,SSW,[Clear],0.0,95,1034,0,-5
4,2015-01-01,400,0,16,201,SSW,[Clear],0.0,94,1035,0,-5
...,...,...,...,...,...,...,...,...,...,...,...,...
54019,2021-02-28,1900,9,14,61,ENE,[Clear],0.0,71,1034,9,7
54020,2021-02-28,2000,8,14,65,ENE,[Clear],0.0,74,1034,8,6
54021,2021-02-28,2100,8,14,69,ENE,[Clear],0.0,78,1034,8,5
54022,2021-02-28,2200,7,13,72,ENE,[Clear],0.0,79,1034,7,5


In [14]:
# remove data outside of january 5 2015 to december 31 2020
weather['date'] = pd.to_datetime(weather['date'])
start_date = '2015-1-4'   # exclusive lower bound: rows dated after january 4 are kept
end_date = '2020-12-31'   # inclusive upper bound
mask = (weather['date'] > start_date) & (weather['date'] <= end_date)
# .copy() detaches the filtered rows from the original frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning
weather = weather.loc[mask].copy()
# `time` stores the hour multiplied by 100; the modulo maps 2400 back to hour 0
# so that re-running this cell after the 0 -> 2400 replacement below still
# produces the same timestamps instead of shifting midnight rows by a day
hour_of_day = (weather['time'] / 100) % 24
# lowercase 'h' is the supported hour alias ('H' is deprecated in recent pandas)
weather['datetime'] = weather['date'] + pd.to_timedelta(hour_of_day, unit='h')
weather = weather.replace({'time': {0: 2400}}) # convert 0 to 2400, for later convenience

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather['datetime'] = pd.to_datetime(weather['date'] + pd.to_timedelta(weather['time']/100, unit='H'))


In [15]:
# Incremental (scaled numeric) encoding of the time of day: the raw `time`
# value divided by 1000.
weather['time_increment'] = weather['time'].div(1000)

In [16]:
def _gray_code_bits(hour):
    """Return the Gray code of `hour` as a bit string zero-padded to 5 characters."""
    return '{:05b}'.format(graycode.tc_to_gray_code(hour))


def _one_hot_bits(hour):
    """Return a bit string with a single '1', the `hour`-th character from the right.

    For hours 1..24 the string is 24 characters long.
    """
    return (24 - hour) * '0' + '1' + (hour - 1) * '0'


# hour number of every row: the raw `time` value divided by 100, cast to int
time = (weather['time'].values/100).astype(int)

# one encoded string per row, held in object arrays
time_gray = np.array([_gray_code_bits(hour) for hour in time], dtype=object)
time_exclusive = np.array([_one_hot_bits(hour) for hour in time], dtype=object)

# attach the gray code and the mutually exclusive binary representation of time
weather['time_graycode'] = time_gray
weather['time_exclusive'] = time_exclusive

In [17]:
# Preview the weather table with the added datetime and time-encoding columns.
weather

Unnamed: 0,date,time,tempC,windspeedKmph,winddirDegree,winddir16Point,weatherDesc,precipMM,humidity,pressure,HeatIndexC,FeelsLikeC,datetime,time_increment,time_graycode,time_exclusive
96,2015-01-05,2400,0,12,203,SSW,[Partly cloudy],0.0,96,1037,0,-4,2015-01-05 00:00:00,2.4,10100,100000000000000000000000
97,2015-01-05,100,0,13,206,SSW,[Partly cloudy],0.0,96,1036,0,-4,2015-01-05 01:00:00,0.1,00001,000000000000000000000001
98,2015-01-05,200,0,14,209,SSW,[Partly cloudy],0.0,96,1036,0,-4,2015-01-05 02:00:00,0.2,00011,000000000000000000000010
99,2015-01-05,300,0,15,212,SSW,[Partly cloudy],0.0,96,1035,0,-5,2015-01-05 03:00:00,0.3,00010,000000000000000000000100
100,2015-01-05,400,0,14,207,SSW,[Partly cloudy],0.0,96,1035,0,-5,2015-01-05 04:00:00,0.4,00110,000000000000000000001000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52603,2020-12-31,1900,4,10,217,SW,[Light rain],0.3,94,1000,4,2,2020-12-31 19:00:00,1.9,11010,000001000000000000000000
52604,2020-12-31,2000,4,12,225,SW,[Light rain],0.2,93,1000,4,1,2020-12-31 20:00:00,2.0,11110,000010000000000000000000
52605,2020-12-31,2100,4,14,234,SW,[Light rain],0.3,93,1001,4,0,2020-12-31 21:00:00,2.1,11111,000100000000000000000000
52606,2020-12-31,2200,3,12,269,W,[Light rain],0.2,92,1003,3,0,2020-12-31 22:00:00,2.2,11101,001000000000000000000000


In [18]:
# Select weather rows with a missing value in any column;
# an empty result means there is no NaN.
weather[weather.isnull().any(axis=1)]

Unnamed: 0,date,time,tempC,windspeedKmph,winddirDegree,winddir16Point,weatherDesc,precipMM,humidity,pressure,HeatIndexC,FeelsLikeC,datetime,time_increment,time_graycode,time_exclusive


In [19]:
# other preprocessing
# - convert non-numerical variables to numerical
# - day-ahead and weather do not have the same number of rows ...

In [50]:
# Full outer join of prices and weather on their shared 'datetime' column:
# timestamps present in only one table are kept, with NaN in the other's columns.
total = day_ahead.merge(weather, how='outer', on='datetime')

In [55]:
# Count how many merged rows share each timestamp: some hours appear more
# than once because of the summer/winter time changes.
total['datetime'].value_counts()

2017-10-29 02:00:00    2
2018-10-28 02:00:00    2
2016-10-30 02:00:00    2
2020-10-25 02:00:00    2
2015-10-25 02:00:00    2
                      ..
2020-06-24 23:00:00    1
2017-03-09 12:00:00    1
2018-06-17 06:00:00    1
2020-09-28 06:00:00    1
2019-06-15 03:00:00    1
Name: datetime, Length: 52512, dtype: int64