# Join the data downloaded from Entso-e to create a unified DataFrame

Create one DataFrame for the day-ahead data and one for the week-ahead data.

In [4]:
import pandas as pd
import datetime
import os

from Name_convention_dictionaries import PsrTypeDict

In [5]:
# make sure the data type of the dates is datetime:
def dateparse(x):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string into a datetime.datetime object."""
    return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

## Join all the files for day-ahead

In [6]:
# collect the names of every entry in the day-ahead download folder
folder_name = 'data_day_ahead'
files_in_dir = os.listdir("./{}/".format(folder_name))

The goal is to create one DataFrame for each document and process type. This means that all the data regarding, e.g., the total load is saved in one big DataFrame:

In [7]:
# Number of trailing characters cut from a file name to obtain its grouping key.
# Presumably this is the date-range suffix plus the file extension -- TODO confirm
# against the script that downloads the files.
FILENAME_SUFFIX_LENGTH = 38

# Map each file-name prefix (one per document and process type) to the paths of all
# files that share it. A dict keeps the prefixes in first-seen order, so the frames
# end up in list_of_dfs in the same order as the folder listing.
paths_by_prefix = {}
for file_name in files_in_dir:
    # hidden entries such as '.DS_Store' are not data files and must not reach read_csv
    if file_name.startswith('.'):
        continue
    prefix = file_name[:-FILENAME_SUFFIX_LENGTH]
    paths_by_prefix.setdefault(prefix, []).append("./"+folder_name+"/"+file_name)

# Build one DataFrame per prefix. Every file is read exactly once and the frames of a
# group are concatenated in a single call; concatenating inside the loop would re-copy
# the accumulated rows on every iteration.
list_of_dfs = []
for group_files in paths_by_prefix.values():
    frames = [
        pd.read_csv(path, parse_dates=['Date'], date_parser=dateparse)
        for path in group_files
    ]

    # sort the values by datetime; list_of_dfs is used later to merge all dataframes together
    df = pd.concat(frames).sort_values(by=["Date"])
    list_of_dfs.append(df)

# prefixes that have been loaded, in load order (kept in case other cells reference it)
check_if_loaded = list(paths_by_prefix)

In [8]:
# report the number of fully duplicated rows of each frame before and after removing them
for frame in list_of_dfs:
    duplicates_before = frame.duplicated().sum()
    print(duplicates_before)

    # the frames are modified in place, so list_of_dfs holds the de-duplicated data afterwards
    frame.drop_duplicates(inplace=True)

    duplicates_after = frame.duplicated().sum()
    print(duplicates_after)

388
0
388
0
388
0
388
0
0
0
0
0
0
0
0
0


## Merge all DataFrames

In [9]:
from functools import reduce

In [10]:
def _outer_join_on_date(left, right):
    """Outer-join two frames on their shared 'Date' column."""
    return pd.merge(left, right, on=['Date'], how='outer')


# fold the list of frames left to right into one wide frame
df_merged = reduce(_outer_join_on_date, list_of_dfs)

In [11]:
# Order the merged rows chronologically, then renumber them starting at 0.
# NOTE(review): reset_index() without drop=True keeps the previous row labels as an
# 'index' column, which ends up in the saved CSV -- confirm this is intended.
df_merged = df_merged.sort_values(by=["Date"])
df_merged = df_merged.reset_index()

In [12]:
# write the merged day-ahead frame to disk; index=False leaves the row labels out of the file
day_ahead_csv = "Day_ahead_dataset.csv"
df_merged.to_csv(day_ahead_csv, index=False)

In [13]:
# make sure all columns have been stored: display the column labels of the merged frame
df_merged.columns

Index(['index', 'Date', 'Day ahead/System total load in MAW',
       'Day ahead/Solar in MAW', 'Day ahead/Wind Onshore in MAW',
       'Day ahead/Wind Offshore in MAW', 'Realised/Solar in MAW',
       'Realised/System total load in MAW', 'Realised/Wind Offshore in MAW',
       'Realised/Wind Onshore in MAW'],
      dtype='object')

In [14]:
# check how many NaN values are in the DataFrame (count of missing entries per column)
df_merged.isnull().sum()

index                                   0
Date                                    0
Day ahead/System total load in MAW     96
Day ahead/Solar in MAW                576
Day ahead/Wind Onshore in MAW         576
Day ahead/Wind Offshore in MAW         96
Realised/Solar in MAW                 603
Realised/System total load in MAW       7
Realised/Wind Offshore in MAW         436
Realised/Wind Onshore in MAW          447
dtype: int64

In [15]:
# inspect 10 randomly chosen rows of the merged frame
# (no random_state is passed, so a different selection is shown on every run)
df_merged.sample(10)

Unnamed: 0,index,Date,Day ahead/System total load in MAW,Day ahead/Solar in MAW,Day ahead/Wind Onshore in MAW,Day ahead/Wind Offshore in MAW,Realised/Solar in MAW,Realised/System total load in MAW,Realised/Wind Offshore in MAW,Realised/Wind Onshore in MAW
158615,158519,2019-07-11 04:45:00,56806.0,1774.0,1626.0,517.0,1967.0,57414.0,275.0,1692.0
14560,14560,2015-06-01 15:00:00,59328.0,8124.0,2243.0,114.0,9470.0,59965.0,779.0,1903.0
78218,78218,2017-03-25 17:30:00,57220.0,106.0,8001.0,1200.0,66.0,53666.0,579.0,8034.0
196614,196518,2020-08-10 00:30:00,35755.0,0.0,5584.0,2413.0,0.0,39470.0,2036.0,5693.0
7180,7180,2015-03-16 18:00:00,66953.0,0.0,13189.0,223.0,0.0,68774.0,867.0,13144.0
36790,36790,2016-01-19 04:30:00,50667.0,0.0,4677.0,2018.0,0.0,57197.0,2597.0,4835.0
6795,6795,2015-03-12 17:45:00,67728.0,2.0,5191.0,126.0,1.0,68276.0,604.0,3949.0
61030,61030,2016-09-27 16:30:00,60320.0,600.0,1824.0,1781.0,578.0,61526.0,2294.0,1551.0
104097,104097,2017-12-20 07:15:00,69437.0,30.0,1290.0,330.0,13.0,69778.0,414.0,987.0
107849,107849,2018-01-28 09:15:00,54699.0,1759.0,25912.0,3812.0,2485.0,52930.0,3602.0,29862.0


In [None]:
# NOTE: leftover scratch cell. The statement that used to be here referenced an
# undefined name (`df_merge`) and an empty output path, so it could never run.
# The merged frame is already written to "Day_ahead_dataset.csv" earlier in this notebook.

## Join all the files for week-ahead predictions

In [16]:
# collect the names of every entry in the week-ahead download folder
folder_name = 'data_week_ahead'
files_in_dir = os.listdir("./{}/".format(folder_name))

The task for the week-ahead prediction is much simpler than for day-ahead (as there are no actuals or generation values), so a simpler implementation is used to save some memory:

In [17]:
# Create a list of the paths to the files so they can be loaded as DataFrames.
# Hidden entries such as '.DS_Store' are skipped, as in the day-ahead section,
# because they are not CSV data and would make read_csv fail.
list_of_paths = [
    "./"+folder_name+"/"+name
    for name in files_in_dir
    if not name.startswith('.')
]

# Read every file once and concatenate all frames in a single call; concatenating
# inside the loop would re-copy the accumulated rows on every iteration.
week_frames = [
    pd.read_csv(path, parse_dates=['min_date', 'max_date'], date_parser=dateparse)
    for path in list_of_paths
]

# order the rows chronologically by the start and end of each forecast window
df_week = pd.concat(week_frames).sort_values(by=['min_date', 'max_date'])

In [18]:
# drop fully duplicated rows, then write the week-ahead frame to disk without the row labels
df_week = df_week.drop_duplicates()

week_ahead_csv = 'Week_ahead_dataset.csv'
df_week.to_csv(week_ahead_csv, index=False)

In [19]:
# inspect 10 randomly chosen rows of the week-ahead frame
# (no random_state is passed, so a different selection is shown on every run)
df_week.sample(10)

Unnamed: 0,min_date,max_date,min_forecast_in_MAW,max_forecast_in_MAW
117,2017-02-18 23:00:00,2017-02-18 23:00:00,47235,61400
114,2020-02-19 23:00:00,2020-02-19 23:00:00,49943,72276
81,2019-01-11 23:00:00,2019-01-11 23:00:00,48879,67323
288,2017-08-08 22:00:00,2017-08-08 22:00:00,41458,65995
362,2017-10-21 22:00:00,2017-10-21 22:00:00,42536,58971
288,2018-08-07 22:00:00,2018-08-07 22:00:00,44897,67574
214,2020-05-29 22:00:00,2020-05-29 22:00:00,41540,61344
247,2019-07-03 22:00:00,2019-07-03 22:00:00,42822,67456
76,2015-04-12 22:00:00,2015-04-12 22:00:00,37916,54532
98,2019-01-28 23:00:00,2019-01-28 23:00:00,47131,71296
