# Join the data downloaded from Entso-e to create a unified DataFrame

Create one DataFrame for day-ahead and one for week-ahead.

In [None]:
import pandas as pd
import datetime
import os

from Name_convention_dictionaries import PsrTypeDict

In [None]:
# make sure the data type of the dates is datetime:
def dateparse(x):
    """Parse a timestamp string formatted as 'YYYY-MM-DD HH:MM:SS'.

    Used as the ``date_parser`` callback of ``pd.read_csv`` so that the date
    columns are loaded as datetimes instead of plain strings.

    Args:
        x: Timestamp text in the format ``%Y-%m-%d %H:%M:%S``.

    Returns:
        The corresponding naive ``datetime.datetime``.

    Raises:
        ValueError: If ``x`` does not match the expected format.
    """
    return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

In [None]:
# NOTE(review): `filename` is not assigned in any cell visible before this one, so
# on a fresh kernel this expression raises NameError. It looks like a leftover
# scratch check of the "strip the last 38 characters" slice that the grouping code
# further down applies to the file names — confirm and remove if no longer needed.
filename[:-38]

## Join all the files for day-ahead

In [None]:
# collect the names of every entry in the day-ahead download folder
folder_name = 'data_day_ahead'
files_in_dir = os.listdir(f"./{folder_name}/")

The goal is to create one DataFrame for each document and process type. This means that all the data regarding, e.g., total load is saved in one big DataFrame:

In [None]:
# The last 38 characters of every file name are stripped to obtain the beginning of
# the name, which is shared by all files of the same document and process type.
suffix_length = 38

# Group the file paths by that shared prefix in a single pass over the folder.
# A dict keeps insertion order, so the groups (and the files inside each group)
# stay in the order in which they appear in files_in_dir.
files_by_prefix = {}
for file_name in files_in_dir:
    # '.DS_Store' is not a data file: never group it and never try to read it
    if file_name == '.DS_Store':
        continue
    prefix = file_name[:-suffix_length]
    files_by_prefix.setdefault(prefix, []).append("./"+folder_name+"/"+file_name)

# Build one DataFrame per document and process type. list_of_dfs is used later to
# merge all DataFrames together.
list_of_dfs = []
for group_files in files_by_prefix.values():
    # load every file of the group with the date column parsed as datetime
    frames = [pd.read_csv(path, parse_dates=['Date'], date_parser=dateparse)
              for path in group_files]

    # concatenate once per group (instead of once per file, which re-copies the
    # accumulated rows on every step) and order the rows by datetime
    df = pd.concat(frames).sort_values(by=["Date"])
    list_of_dfs.append(df)

# prefixes that have been loaded, in the same order as list_of_dfs
check_if_loaded = list(files_by_prefix)

In [None]:
# report the number of duplicated rows of each DataFrame before and after removing them
for frame in list_of_dfs:
    duplicates_before = frame.duplicated().sum()
    print(duplicates_before)

    # drop in place so that the DataFrame stored in list_of_dfs is the one that changes
    frame.drop_duplicates(inplace=True)

    duplicates_after = frame.duplicated().sum()
    print(duplicates_after)

## Merge all DataFrames

In [None]:
from functools import reduce

In [None]:
def _outer_merge_on_date(left, right):
    """Outer-join two DataFrames on their shared 'Date' column."""
    return pd.merge(left, right, on=['Date'], how='outer')


# fold the DataFrames in list_of_dfs into one wide DataFrame, merging left to right
df_merged = reduce(_outer_merge_on_date, list_of_dfs)

In [None]:
# Order the merged rows chronologically and renumber them from 0. drop=True discards
# the old row labels; without it reset_index() inserts them as an extra "index"
# column, which would then be written to the CSV file as if it were data.
df_merged = df_merged.sort_values(by=["Date"]).reset_index(drop=True)

In [None]:
# write the merged day-ahead data to disk; the row index is left out of the file
day_ahead_csv = "Day_ahead_dataset.csv"
df_merged.to_csv(day_ahead_csv, index=False)

In [None]:
# make sure all columns have been stored: display the column labels of the
# merged DataFrame for a visual check
df_merged.columns

In [None]:
# count the missing (NaN) values in every column of the merged DataFrame
df_merged.isna().sum()

In [None]:
# display 10 randomly chosen rows of the merged DataFrame as a spot check
df_merged.sample(n=10)

## Join all the files for week-ahead predictions

In [None]:
# collect the names of every entry in the week-ahead download folder
folder_name = 'data_week_ahead'
files_in_dir = os.listdir(f"./{folder_name}/")

The task for the week-ahead prediction is much simpler than for day-ahead (as there are no actuals or generation values). Use a simpler implementation to save some memory:

In [None]:
# Create a list of the paths to the files so they can be loaded as DataFrames.
# '.DS_Store' is left out, as in the day-ahead processing, because it is not a
# CSV data file and must not be passed to pd.read_csv.
list_of_paths = [
    "./"+folder_name+"/"+name
    for name in files_in_dir
    if name != '.DS_Store'
]

# Load every file with both date columns parsed as datetime and concatenate them in
# one call (instead of once per file, which re-copies the accumulated rows each time).
df_week = pd.concat(
    [pd.read_csv(path, parse_dates=['min_date', 'max_date'], date_parser=dateparse)
     for path in list_of_paths]
)

# order the rows by the start and then the end of the period they cover
df_week.sort_values(by=['min_date', 'max_date'], inplace=True)

In [None]:
# remove duplicated rows, then write the week-ahead data to disk without the row index
df_week = df_week.drop_duplicates()

df_week.to_csv('Week_ahead_dataset.csv', index=False)

In [None]:
# display 10 randomly chosen rows of the week-ahead DataFrame as a spot check
df_week.sample(n=10)