In [2]:
import pytz
import yaml
import requests
import logging
logger = logging.getLogger('log')
logger.setLevel('INFO')
import os
from dateutil.relativedelta import *
#from datetime import datetime
import datetime
import numpy as np
import pandas as pd

In [3]:
# Working directories and output file name, relative to the notebook's cwd.
downloadpath = 'downloads1/'
archivepath = 'archive1/'
outputpath = 'output1/'
outputfile = 'output1.csv'

# Make sure the folders that get written to exist before anything is downloaded.
for _required_dir in (downloadpath, outputpath):
    if not os.path.exists(_required_dir):
        os.makedirs(_required_dir)

In [4]:
# Download configuration, nested as source -> dataset -> parameters.
# `x_Month` / `x_Year` are format templates that make_url fills with the period
# start; `frequency` is a pandas offset alias; `end: recent` is resolved by the
# download loop.
conf = """
    ENTSO-E: 
        Data_Portal: 
            url_template: https://www.entsoe.eu/fileadmin/template/other/statistical_database/excel.php
            url_params:
                pid: 136
                opt_period: 0
                opt_Month: ''
                opt_Year: ''
                send: send
                opt_Response: 99
                dataindx: 0
            url_dates:
                opt_Month: '{u_start.month}'
                opt_Year: '{u_start.year}'
            x_Month: '{u_start.month}'
            x_Year: '{u_start.year}'
            frequency: M
            start: 2006-01-01
            end: recent
            filetype: xls           
"""
# safe_load only builds plain Python objects (dict, str, int, date, ...).
# A bare yaml.load(conf) without a Loader is deprecated since PyYAML 5.1 and
# raises TypeError on PyYAML 6.
conf = yaml.safe_load(conf)

In [5]:
def make_url(url_template, filetype, source, tech, start, end, session, url_params):
    """Download the file for one period unless it is already in the download folder.

    The local file is named ``<source>_<tech>_<start>_<end>.<filetype>`` (dates as
    YYYY-MM-DD) and stored under the module-level ``downloadpath``.

    Parameters
    ----------
    url_template : str
        URL the GET request is sent to.
    filetype : str
        Extension of the local file (without the dot).
    source, tech : str
        First two components of the local filename.
    start, end : date-like
        Period boundaries. Both need ``strftime``; they are passed as ``u_start``
        and ``u_end`` to the ``x_Month`` / ``x_Year`` templates from ``conf``.
    session : requests.Session
        Session used for the request.
    url_params : dict
        Base query parameters. The dict is copied; ``opt_Month`` and ``opt_Year``
        are set on the copy, so neither the argument nor ``conf`` is modified.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status, so that an error page is
        never stored under the data filename.
    """
    filename = '_'.join([source, tech, start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')])
    work_file = downloadpath + filename + '.' + filetype

    # Check the local copy first: no need to hit the server for a file we keep anyway.
    if os.path.exists(work_file):
        logger.info('Filename already exists. Skip to next.')
        return

    portal = conf['ENTSO-E']['Data_Portal']
    params = dict(url_params)
    params['opt_Month'] = portal['x_Month'].format(u_start=start, u_end=end)
    params['opt_Year'] = portal['x_Year'].format(u_start=start, u_end=end)

    resp = session.get(url_template, params=params)
    resp.raise_for_status()

    # The header is only used for logging; tolerate responses that do not send it.
    disposition = resp.headers.get('content-disposition', '')
    original_filename = disposition.split('filename=')[-1].replace('"', '').replace(';', '')
    logger.info('Attempting download of: %s \n From URL: %s \n original filename: %s', filename, resp.url, original_filename)

    with open(work_file, 'wb') as output_file:
        for chunk in resp.iter_content(1024):
            output_file.write(chunk)

In [7]:
# Walk every source/dataset in the configuration and download one file per period.
for source, techs in conf.items():
    for tech, parameter in techs.items():
        g_start = parameter['start']
#        g_start = datetime.date(2015,12,1)
        if parameter['end'] == 'recent':
            # NOTE(review): 'recent' is pinned to a fixed date rather than today.
            g_end = datetime.date(2015,12,31)
        else:
            g_end = parameter['end']

        frequency = parameter['frequency']
        if frequency not in ('M', 'Y'):
            # Fail loudly instead of running into an unbound / stale p_end below.
            raise ValueError('Unsupported frequency %r for %s/%s' % (frequency, source, tech))

        # One session per dataset, closed again once all periods are fetched.
        with requests.session() as session:
            break_dates = pd.date_range(start=g_start, end=g_end, freq=frequency)
            for date in break_dates:
                # date_range yields period *ends*; step back to the period start.
                if frequency == 'M':
                    p_start = date.replace(day = 1)
                    p_end = p_start + relativedelta(months = 1, days = -1)
                else:
                    # A yearly period starts on 1 January, not on the first of the
                    # month the year-end break date falls in.
                    p_start = date.replace(month = 1, day = 1)
                    p_end = p_start + relativedelta(years = 1, days = -1)

                make_url(parameter['url_template'], parameter['filetype'], source, tech, p_start, p_end, session, parameter['url_params'])


INFO:log:Attempting download of: ENTSO-E_Data_Portal_2015-12-01_2015-12-31 
 From URL: https://www.entsoe.eu/fileadmin/template/other/statistical_database/excel.php?opt_Month=12&opt_Year=2015&pid=136&send=send&dataindx=0&opt_period=0&opt_Response=99 
 original filename: Statistics.xls


In [9]:
def readData(filePath, source, tech):
    """Read one downloaded Excel file and reshape it to an hourly, tz-aware frame.

    The sheet is read with row 9 (0-based) as header and the first two columns
    as index. The remaining columns are stacked into rows and the 'Country'
    index level is unstacked into columns. A timestamp built from the 'Day'
    column and the hour label becomes the index and is localized to
    'Europe/Berlin'.

    Parameters
    ----------
    filePath : path of the Excel file, handed to ``pd.read_excel``.
    source, tech : accepted for the caller's convenience; not used in the body.

    Returns
    -------
    pandas.DataFrame indexed by the localized timestamps. Every column name is
    prefixed with 'load_'. The helper columns 'Day', 'raw_hour' and 'hour' are
    not dropped, so they come back as 'load_Day', 'load_raw_hour' and
    'load_hour'.
    """
    data = pd.read_excel(
        io = filePath,
        header=9,
#        skiprows = None,
        index_col = [0,1],
#        parse_cols = None #None means: parse all columns
        )

#   #Create a list of the dst-transition hours
    # Built from pytz's private `_utc_transition_times`, with the hour of each
    # entry forced to 2. The first entry is skipped.
    # NOTE(review): presumably the first entry is a sentinel and hour 2 is the
    # local wall-clock hour the row filters below should match - confirm.
    dst_transition_times = [d.replace(hour=2) for d in pytz.timezone('Europe/Berlin')._utc_transition_times[1:]]

    # The original data has days and countries in the rows and hours in the columns.
    # This rearranges the table, mapping hours onto the rows and countries onto the columns.
    data = data.stack(level=None).unstack(level='Country').reset_index()
    # DataFrame.stack() puts the former column names into a new index level; after
    # reset_index() that level shows up as the column 'level_1'.
    data.rename(columns={'level_1': 'raw_hour'}, inplace=True)

    # Truncate the hours column and strip the letters (indicating which hour is
    # which during the autumn dst-transition).
    # Hours are indexed 1-24 rather than 0-23, so we subtract 1.
    data['hour'] = (data['raw_hour'].str[:2].str.replace('A','').str.replace('B','').astype(int) - 1).astype(str)
    # String concatenation: assumes 'Day' holds strings - TODO confirm.
    data['dt_index'] = pd.to_datetime(data['Day']+' '+data['hour']+':00')
    data.set_index('dt_index', inplace=True)

    # Drop the 2nd occurrence of 03:00 ('3B') appearing in October data, except on
    # the autumn dst-transition itself.
    data = data[~((data['raw_hour'] == '3B:00:00') & ~(data.index.isin(dst_transition_times)))]
    # Drop 03:00 for the (spring) dst-transition. October data is unaffected because
    # its format is 3A:00/3B:00.
    data = data[~((data['raw_hour'] == '03:00:00') & (data.index.isin(dst_transition_times)))]

    # ambiguous='infer' lets pandas decide from the order of the timestamps which of
    # the repeated autumn hours is DST.
    data.index = data.index.tz_localize('Europe/Berlin', ambiguous='infer')
    # The drop below is disabled, so the helper columns stay in the result. As
    # written it would also set `data` to None, since drop(..., inplace=True)
    # returns None.
#    data = data.drop(['Day', 'hour', 'raw_hour'], axis=1, inplace = True)
    data.rename(columns=lambda x: 'load_'+x, inplace=True)
    return data

In [10]:
# Merge every downloaded file that belongs to a configured source/dataset into
# one frame. combine_first keeps values already present and fills in the rest.
resultDataSet = pd.DataFrame()
for source, techs in conf.items():
    for tech, param in techs.items():
        # A file belongs to this dataset when its name contains both keys.
        matching_files = [
            candidate for candidate in os.listdir(downloadpath)
            if source in candidate and tech in candidate
        ]
        for filename in matching_files:
            logger.info('reading %s', filename)
            resultDataSet = resultDataSet.combine_first(
                readData(downloadpath + filename, source, tech)
            )

INFO:log:reading ENTSO-E_Data_Portal_2015-12-01_2015-12-31.xls




In [11]:
# Show the combined frame as this cell's output.
resultDataSet

Country,load_Day,load_raw_hour,load_CH,load_CZ,load_DK,load_HR,load_IS,load_NO,load_PT,load_SK,load_hour
dt_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-12-01 00:00:00+01:00,2015-12-01,01:00:00,6629,6914,3325,1654,2107,16166,5677,3153,0
2015-12-01 01:00:00+01:00,2015-12-01,02:00:00,6784,6934,3243,1503,2085,15908,5203,3039,1
2015-12-01 02:00:00+01:00,2015-12-01,03:00:00,6787,6789,3218,1428,2073,15601,4940,2971,2
2015-12-01 03:00:00+01:00,2015-12-01,04:00:00,6872,6731,3226,1412,2066,15646,4772,2985,3
2015-12-01 04:00:00+01:00,2015-12-01,05:00:00,6774,6836,3282,1444,2062,15770,4634,3060,4
2015-12-01 05:00:00+01:00,2015-12-01,06:00:00,6796,7334,3526,1613,2068,16260,4695,3262,5
2015-12-01 06:00:00+01:00,2015-12-01,07:00:00,7487,8457,4423,1997,2101,17711,4834,3674,6
2015-12-01 07:00:00+01:00,2015-12-01,08:00:00,8464,8866,5193,2203,2159,19494,5230,3833,7
2015-12-01 08:00:00+01:00,2015-12-01,09:00:00,8787,8983,5175,2317,2212,20120,5775,3917,8
2015-12-01 09:00:00+01:00,2015-12-01,10:00:00,8755,9033,5293,2336,2251,19921,6696,3973,9


In [649]:
# Export the combined frame: semicolon-separated, comma as decimal mark,
# floats written with two decimals.
resultDataSet.to_csv(outputpath+outputfile, sep=';', float_format='%.2f', decimal=',')