In [None]:
#default_exp data.datasets.gefcom2014

# GEFCom 2014 dataset

> Download the GEFCom 2014 dataset.

[Tao Hong, Pierre Pinson, Shu Fan, Hamidreza Zareipour, Alberto Troccoli and Rob J. Hyndman, "Probabilistic energy forecasting: Global Energy Forecasting Competition 2014 and beyond", International Journal of Forecasting, vol.32, no.3, pp 896-913, July-September, 2016.](https://www.sciencedirect.com/science/article/pii/S0169207016000133)

In [None]:
#export
import os
import re
import logging
import zipfile

from dataclasses import dataclass
from typing import Tuple

import numpy as np
import pandas as pd

from neuralforecast.data.datasets.utils import (
    download_file, 
    Info, 
    create_calendar_variables,
    create_us_holiday_distance_variables,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Plot styling shared by the demo cells in this notebook (not exported).
plt.rcParams['font.family'] = 'serif'
# Base font size for titles and axis labels; tick labels use FONTSIZE-5.
FONTSIZE = 22

>GEFCom2014 meta information

In [None]:
#export
# Metadata records for the five GEFCom2014 sub-datasets. Each dataclass only
# carries default values (a test date, a display name and a frequency alias);
# they are registered in `GEFCom2014Info` right below.
# NOTE(review): the exact meaning of `test_date` is not visible in this file —
# presumably the first date of the hold-out period; confirm against callers.

@dataclass
class Extended:
    test_date: str = '2016-12-27'
    name: str = 'Extended'
    # NOTE(review): 'Y' differs from the 'H' used by every other group, while
    # the GEFCom2014-E section describes hourly load data — confirm.
    freq: str = 'Y'

@dataclass
class Load:
    test_date: str = '2016-12-27'
    name: str = 'Load'
    freq: str = 'H'

@dataclass
class Price:
    test_date: str = '2015-01-04'
    name: str = 'Price'
    freq: str = 'H'

@dataclass
class Solar:
    test_date: str = '2015-01-04'
    name: str = 'Solar'
    freq: str = 'H'

@dataclass
class Wind:
    test_date: str = '2016-01-04'
    name: str = 'Wind'
    freq: str = 'H'

In [None]:
#export
# Registry of the top-level groups. The group strings are the archive suffixes
# used by `GEFCom2014.unzip` (files named `GEFCom2014-{group}.zip`).
# Presumably `groups` and `class_groups` are paired positionally by `Info`
# (E_V2 -> Extended, L_V2 -> Load, ...) — confirm against `Info`.
GEFCom2014Info = Info(groups=('E_V2', 'L_V2', 'P_V2', 'S_V2', 'W_V2'),
                      class_groups=(Extended, Load, Price, Solar, Wind))

class GEFCom2014:
    """Fetches the raw GEFCom2014 archive and unpacks its nested zip files.

    Every helper is a static method. ``directory`` is the user-facing data
    folder; the dataset itself lives under ``{directory}/gefcom2014``.
    """

    source_url = 'https://www.dropbox.com/s/pqenrr2mcvl0hk9/GEFCom2014.zip?dl=1'

    @staticmethod
    def unzip_wind(directory: str) -> None:
        """
        Extract the per-task Wind archives of an already downloaded dataset.

        For each of the 15 Wind tasks, both the target archive and the
        explanatory-variables archive are extracted into the task folder.
        Nothing is downloaded here, and the archives are extracted again on
        every call.

        Parameters
        ----------
        directory: str
            Directory path where dataset is downloaded.
        """
        wind_root = f'{directory}/gefcom2014/Wind'
        for k in range(1, 16):
            task_dir = f'{wind_root}/Task {k}'
            # Targets first, explanatory variables second (same order as before).
            archives = (
                f'{task_dir}/Task{k}_W_Zone1_10.zip',
                f'{task_dir}/TaskExpVars{k}_W_Zone1_10.zip',
            )
            for archive in archives:
                with zipfile.ZipFile(archive, 'r') as zf:
                    zf.extractall(path=task_dir)

        logger.info('Successfully decompressed Wind tasks')

    @staticmethod
    def unzip(path: str) -> None:
        """
        Extract the per-group archives shipped inside the main download.

        One archive per entry of ``GEFCom2014Info.groups`` is read from
        ``{path}/GEFCom2014 Data`` and extracted directly into ``path``.

        Parameters
        ----------
        path: str
            Dataset root folder (the ``.../gefcom2014`` directory), not a
            single file.
        """
        archives = [f'{path}/GEFCom2014 Data/GEFCom2014-{group}.zip'
                    for group in GEFCom2014Info.groups]
        for archive in archives:
            with zipfile.ZipFile(archive, 'r') as zf:
                zf.extractall(path)
                logger.info(f'Successfully decompressed {archive}')

    @staticmethod
    def download(directory: str) -> None:
        """
        Download and unpack the GEFCom2014 dataset if it is not present yet.

        The presence check is only on the ``{directory}/gefcom2014`` folder:
        when it already exists, nothing is downloaded or extracted.

        Parameters
        ----------
        directory: str
            Directory path to download dataset.
        """
        root = f'{directory}/gefcom2014'
        if os.path.exists(root):
            return

        download_file(directory=root,
                      source_url=GEFCom2014.source_url,
                      decompress=True)
        GEFCom2014.unzip(root)

In [None]:
# Fetch the dataset into ./data (no-op when ./data/gefcom2014 already exists).
GEFCom2014.download('data')

# GEFCom2014-L: Original Electricity Load Task Datasets
- Y: 5 years of hourly load data (augmented with tasks).
- X: 11 years of 25 weather stations to be filtered (augmented with tasks).
- Tasks: Fifteen one-month-ahead forecast quantiles of hourly loads on rolling basis.

>GEFCom2014-L meta information

In [None]:
#export
# Evaluation windows of the GEFCom2014-L tasks. Each task covers one calendar
# month and consecutive tasks are contiguous (test_end of task k equals
# test_start of task k+1). Dates are month/day/year strings that are later
# handed to `pd.date_range`.
@dataclass
class LoadTask1:
    test_start: str = '10/01/2010'
    test_end: str = '11/01/2010'

@dataclass
class LoadTask2:
    test_start: str = '11/01/2010'
    test_end: str = '12/01/2010'

@dataclass
class LoadTask3:
    test_start: str = '12/01/2010'
    test_end: str = '01/01/2011'

@dataclass
class LoadTask4:
    test_start: str = '01/01/2011'
    test_end: str = '02/01/2011'

@dataclass
class LoadTask5:
    test_start: str = '02/01/2011'
    test_end: str = '03/01/2011'

@dataclass
class LoadTask6:
    test_start: str = '03/01/2011'
    test_end: str = '04/01/2011'

@dataclass
class LoadTask7:
    test_start: str = '04/01/2011'
    test_end: str = '05/01/2011'

@dataclass
class LoadTask8:
    test_start: str = '05/01/2011'
    test_end: str = '06/01/2011'

@dataclass
class LoadTask9:
    test_start: str = '06/01/2011'
    test_end: str = '07/01/2011'

@dataclass
class LoadTask10:
    test_start: str = '07/01/2011'
    test_end: str = '08/01/2011'

@dataclass
class LoadTask11:
    test_start: str = '08/01/2011'
    test_end: str = '09/01/2011'

@dataclass
class LoadTask12:
    test_start: str = '09/01/2011'
    test_end: str = '10/01/2011'

@dataclass
class LoadTask13:
    test_start: str = '10/01/2011'
    test_end: str = '11/01/2011'

@dataclass
class LoadTask14:
    test_start: str = '11/01/2011'
    test_end: str = '12/01/2011'

@dataclass
class LoadTask15:
    test_start: str = '12/01/2011'
    test_end: str = '01/01/2012'

# 'Task 16' has no train file of its own: GEFCom2014_L.read_train_df builds it
# from the "Solution to Task 15" files, and no benchmark is available for it.
@dataclass
class LoadTask16:
    test_start: str = '01/01/2012'
    test_end: str = '02/01/2012'


# Left anchor of the hourly timestamp range rebuilt for the training data.
LOAD_START = '01/01/2005'
# Group names are 'Task 1' ... 'Task 16' (note the space before the number).
LOAD_TASKS = ['Task '+str(k) for k in range(1, 17)]
# Presumably `Info` pairs groups and class_groups positionally — confirm.
GEFCom2014_L_Info = Info(groups=LOAD_TASKS,
                         class_groups=[LoadTask1, LoadTask2, LoadTask3, LoadTask4,
                                       LoadTask5, LoadTask6, LoadTask7, LoadTask8,
                                       LoadTask9, LoadTask10, LoadTask11, LoadTask12,
                                       LoadTask13, LoadTask14, LoadTask15, LoadTask16])

In [None]:
#export
class GEFCom2014_L:
    """Reader for the GEFCom2014-L (electricity load) task datasets."""

    @staticmethod
    def read_train_df(directory: str, group: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load the cumulative train dataset available for a task.

        The train files of all tasks up to and including `group` are
        concatenated, rows without a load value are dropped and an hourly
        `ds` column is rebuilt from `LOAD_START` up to the task's test start.

        Parameters
        ----------
        directory: str
            Directory where the dataset was downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 15', 'Task 16'.

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'ds', 'y'].
        X_df: pd.DataFrame
            Exogenous time series: 'unique_id', 'ds' and the remaining columns
            of the train files, plus whatever `create_calendar_variables` and
            `create_us_holiday_distance_variables` add.
        """

        # Meta data
        path = f'{directory}/gefcom2014'
        group_info = GEFCom2014_L_Info.get_group(group)

        # Cumulative data from previous tasks
        previous_load_tasks = LOAD_TASKS[:LOAD_TASKS.index(group)+1]
        train_dfs = []
        for task in previous_load_tasks:
            task_number = re.findall(r"\d+", task)[0]

            if task != 'Task 16':
                filepath = f'{path}/Load/Task {task_number}/L{task_number}-train.csv'
                df = pd.read_csv(filepath, index_col=None, header=0)
            else:
                # Task 16 ships no train file: rebuild it from the solution of
                # task 15 (temperatures plus the realized load).
                loadpath = f'{path}/Load/Solution to Task 15/solution15_L.csv'
                load_df = pd.read_csv(loadpath, index_col=None, header=0)
                weatherpath = f'{path}/Load/Solution to Task 15/solution15_L_temperature.csv'
                df = pd.read_csv(weatherpath, index_col=None, header=0)
                df['LOAD'] = load_df['LOAD']
            train_dfs.append(df)

        # Train data
        train_df = pd.concat(train_dfs, axis=0, ignore_index=True)
        # Filter load null values; chaining avoids mutating a slice in place.
        train_df = train_df[train_df['LOAD'].notnull()].reset_index(drop=True)
        train_df = train_df.rename(columns={'ZONEID': 'unique_id', 'LOAD': 'y'})
        # Left-open hourly range (start, test_start]. Slicing off the first
        # stamp replaces `closed='right'`, which was removed in pandas 2.0.
        train_df['ds'] = pd.date_range(start=LOAD_START,
                                       end=group_info.test_start, freq='H')[1:]

        Y_df = train_df[['unique_id', 'ds', 'y']].copy()
        X_df = train_df.drop(['y', 'TIMESTAMP'], axis=1)
        X_df = create_calendar_variables(X_df=X_df)
        X_df = create_us_holiday_distance_variables(X_df=X_df)
        return Y_df, X_df

    @staticmethod
    def read_benchmark_df(directory: str, group: str) -> pd.DataFrame:
        """Load the benchmark forecasts of a task, completed with the truth.

        Parameters
        ----------
        directory: str
            Directory where the dataset was downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        benchmark_df: pd.DataFrame
            Benchmark file of the task with 'unique_id', an hourly 'ds'
            covering (test_start, test_end] and the observed target 'y'
            taken from the next task's train data.

        Raises
        ------
        AssertionError
            If `group` is 'Task 16', which has no benchmark.
        """

        assert group != 'Task 16', 'No available benchmark'

        # Meta data
        path = f'{directory}/gefcom2014'
        task_number = re.findall(r"\d+", group)[0]
        group_info = GEFCom2014_L_Info.get_group(group)

        # Benchmark data
        filepath = f'{path}/Load/Task {task_number}/L{task_number}-benchmark.csv'
        benchmark_df = pd.read_csv(filepath, index_col=None, header=0)
        # Left-open hourly range (test_start, test_end]; see read_train_df.
        benchmark_df['ds'] = pd.date_range(start=group_info.test_start,
                                           end=group_info.test_end, freq='H')[1:]

        benchmark_df = benchmark_df.drop('TIMESTAMP', axis=1)
        benchmark_df = benchmark_df.rename(columns={'ZONEID': 'unique_id'})

        # complete benchmark data with the target variable for task evaluation:
        # the realized values only appear in the train data of the next task
        next_group = 'Task ' + str(int(task_number) + 1)

        Y_true_df, _ = GEFCom2014_L.read_train_df(directory, next_group)
        ds_filter = (Y_true_df['ds'] >= benchmark_df.ds.min()) & (Y_true_df['ds'] <= benchmark_df.ds.max())
        benchmark_df['y'] = Y_true_df[ds_filter].y.values
        return benchmark_df

    @staticmethod
    def load(directory: str,
             group: str) -> Tuple[pd.DataFrame,
                                  pd.DataFrame,
                                  pd.DataFrame]:
        """
        Downloads and loads GEFCom2014-L data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'
            ('Task 16' has no benchmark and fails in `read_benchmark_df`).

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'ds', 'y'].
        X_df: pd.DataFrame
            Exogenous time series (see `read_train_df`).
        benchmark_df: pd.DataFrame
            Benchmark time series from the gefcom2014 dataset.
        """
        GEFCom2014.download(directory)

        Y_df, X_df = GEFCom2014_L.read_train_df(directory, group)
        benchmark_df = GEFCom2014_L.read_benchmark_df(directory, group)
        return Y_df, X_df, benchmark_df

In [None]:
# Demo: load the GEFCom2014-L data for 'Task 14' and plot the tail of the series.
Y_df, X_df, benchmark_df = GEFCom2014_L.load(directory='data', group='Task 14')

# Alternative slice kept for reference (everything after the first 365*24 rows):
#ds = Y_df.ds.values[365*24:]
#y_true = Y_df.y.values[365*24:]
# Plot only the last 740 rows of the target.
ds = Y_df.ds.values[-740:]
y_true = Y_df.y.values[-740:]

# NOTE(review): the axis label is built from the min/max of the FULL series,
# while only the last 740 rows are plotted — confirm this is intended.
x_plot = Y_df.ds.values
x_plot_min = pd.to_datetime(x_plot.min()).strftime('%B %d, %Y')
x_plot_max = pd.to_datetime(x_plot.max()).strftime('%B %d, %Y')
x_axis_str = f'Hours [{x_plot_min}  to  {x_plot_max}]'
y_axis_str = 'Load (MW)'

fig = plt.figure(figsize=(15, 4))
# NOTE(review): called before any axes exist; likely has no effect — confirm.
fig.tight_layout()
ax0 = plt.subplot2grid((1,1),(0, 0))
axs = [ax0]

axs[0].plot(ds, y_true, color='#628793', linewidth=0.4, label='true')
axs[0].tick_params(labelsize=FONTSIZE-5)
axs[0].set_xlabel(x_axis_str, fontsize=FONTSIZE)
axs[0].set_ylabel(y_axis_str, fontsize=FONTSIZE)
# Fixed y-range chosen for this series.
plt.ylim(50, 350)
plt.title('GEFCom2014-L', fontsize=FONTSIZE)
plt.grid()
plt.show()

# GEFCom2014-E: Extended 2015 Electricity Load Task Datasets
- Y: Four years of hourly load data (augmented with tasks).
- X: Six years of hourly temperature (augmented with tasks).
- Task: Five one-year-ahead forecast quantiles of hourly loads on rolling basis.

In [None]:
#export
class GEFCom2014_E:
    """Reader for the GEFCom2014-E (extended electricity load) dataset."""

    @staticmethod
    def load(directory: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Downloads and loads GEFCom2014-E data.
        This dataset is an extension to the GEFCom2014-L data

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'ds', 'y'].
        X_df: pd.DataFrame
            Exogenous time series: 'unique_id', 'ds', 'temp' and any other
            column of the spreadsheet except the load, 'Date' and 'Hour',
            plus whatever `create_calendar_variables` adds.
        """
        path = f'{directory}/gefcom2014'
        GEFCom2014.download(directory)

        filepath = f'{path}/GEFCom2014-E.xlsx'
        df = pd.read_excel(filepath)

        # create timestamp variable from Date and Hour
        # (Hour is shifted by one so that Hour 1 maps to 00:00 of that Date)
        df['ds'] = df['Date'].add(pd.to_timedelta(df.Hour - 1, unit='h'))
        df['unique_id'] = 1  # single series
        df = df.rename(columns={'T':'temp', 'load':'y'})

        # create Y_df and X_df
        df = df[df.ds >= '2006-01-01'] # remove time period with no load data
        # Copy so callers get an independent frame, like the other readers.
        Y_df = df[['unique_id', 'ds', 'y']].copy()

        X_df = df.drop(['y', 'Date', 'Hour'], axis=1)
        X_df = create_calendar_variables(X_df=X_df)
        return Y_df, X_df

In [None]:
# Demo: load the GEFCom2014-E data and plot the whole target series.
Y_df, X_df = GEFCom2014_E.load(directory='data')

ds = Y_df.ds.values
y_true = Y_df.y.values

# Axis label shows the first and last timestamp of the series.
x_plot = Y_df.ds.values
x_plot_min = pd.to_datetime(x_plot.min()).strftime('%B %d, %Y')
x_plot_max = pd.to_datetime(x_plot.max()).strftime('%B %d, %Y')
x_axis_str = f'Hours [{x_plot_min}  to  {x_plot_max}]'
y_axis_str = 'Load (MW)'

fig = plt.figure(figsize=(15, 4))
# NOTE(review): called before any axes exist; likely has no effect — confirm.
fig.tight_layout()
ax0 = plt.subplot2grid((1,1),(0, 0))
axs = [ax0]

axs[0].plot(ds, y_true, color='#628793', linewidth=0.4, label='true')
axs[0].tick_params(labelsize=FONTSIZE-5)
axs[0].set_xlabel(x_axis_str, fontsize=FONTSIZE)
axs[0].set_ylabel(y_axis_str, fontsize=FONTSIZE)
# Fixed y-range chosen for this series.
plt.ylim(1800, 5800)
plt.title('GEFCom2014-E', fontsize=FONTSIZE)
plt.grid()
plt.show()

# GEFCOM2014-P: Electricity Price Task Datasets
- Y: Hourly electricity price
- X: Zonal and system load day-ahead forecasts
- Tasks: Fifteen one-day-ahead forecast quantiles of hourly price on rolling basis.

>GEFcom2014-P meta information

In [None]:
#export
# Evaluation windows of the GEFCom2014-P tasks. Each task covers a single day
# (test_end is the day after test_start); unlike the load tasks, the days are
# not all consecutive. Dates are month/day/year strings that are later handed
# to `pd.date_range`.
@dataclass
class PriceTask1:
    test_start: str = '06/16/2013'
    test_end: str = '06/17/2013'

@dataclass
class PriceTask2:
    test_start: str = '06/17/2013'
    test_end: str = '06/18/2013'

@dataclass
class PriceTask3:
    test_start: str = '06/24/2013'
    test_end: str = '06/25/2013'

@dataclass
class PriceTask4:
    test_start: str = '07/04/2013'
    test_end: str = '07/05/2013'

@dataclass
class PriceTask5:
    test_start: str = '07/09/2013'
    test_end: str = '07/10/2013'

@dataclass
class PriceTask6:
    test_start: str = '07/13/2013'
    test_end: str = '07/14/2013'

@dataclass
class PriceTask7:
    test_start: str = '07/16/2013'
    test_end: str = '07/17/2013'

@dataclass
class PriceTask8:
    test_start: str = '07/18/2013'
    test_end: str = '07/19/2013'

@dataclass
class PriceTask9:
    test_start: str = '07/19/2013'
    test_end: str = '07/20/2013'

@dataclass
class PriceTask10:
    test_start: str = '07/20/2013'
    test_end: str = '07/21/2013'

@dataclass
class PriceTask11:
    test_start: str = '07/24/2013'
    test_end: str = '07/25/2013'

@dataclass
class PriceTask12:
    test_start: str = '07/25/2013'
    test_end: str = '07/26/2013'

@dataclass
class PriceTask13:
    test_start: str = '12/07/2013'
    test_end: str = '12/08/2013'

@dataclass
class PriceTask14:
    test_start: str = '12/08/2013'
    test_end: str = '12/09/2013'

@dataclass
class PriceTask15:
    test_start: str = '12/17/2013'
    test_end: str = '12/18/2013'


# Left anchor of the hourly timestamp range rebuilt for the price data.
PRICE_START = '01/01/2011'
# Group names are 'Task 1' ... 'Task 15' (note the space before the number).
PRICE_TASKS = ['Task '+str(k) for k in range(1, 16)]
# Presumably `Info` pairs groups and class_groups positionally — confirm.
GEFCom2014_P_Info = Info(groups=PRICE_TASKS,
                         class_groups=[PriceTask1, PriceTask2, PriceTask3, PriceTask4,
                                       PriceTask5, PriceTask6, PriceTask7, PriceTask8,
                                       PriceTask9, PriceTask10, PriceTask11, PriceTask12,
                                       PriceTask13, PriceTask14, PriceTask15])

In [None]:
#export
class GEFCom2014_P:
    """Reader for the GEFCom2014-P (electricity price) task datasets."""

    @staticmethod
    def read_train_df(directory: str, group: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load the train dataset of a task.

        The task file is read as-is and an hourly `ds` column is rebuilt from
        `PRICE_START` up to the task's test end.

        Parameters
        ----------
        directory: str
            Directory where the dataset was downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'ds', 'y'].
        X_df: pd.DataFrame
            Exogenous time series: 'unique_id', 'ds' and the remaining columns
            of the task file, plus whatever `create_calendar_variables` and
            `create_us_holiday_distance_variables` add.
        """

        # Meta data
        path = f'{directory}/gefcom2014'
        group_info = GEFCom2014_P_Info.get_group(group)
        task_number = re.findall(r"\d+", group)[0]
        filepath = f'{path}/Price/Task {task_number}/Task{task_number}_P.csv'

        # Train data
        train_df = pd.read_csv(filepath, index_col=None, header=0)
        train_df = train_df.rename(columns={'ZONEID': 'unique_id', 'Zonal Price': 'y'})
        # Left-open hourly range (start, test_end]. Slicing off the first
        # stamp replaces `closed='right'`, which was removed in pandas 2.0.
        train_df['ds'] = pd.date_range(start=PRICE_START,
                                       end=group_info.test_end, freq='H')[1:]

        Y_df = train_df[['unique_id', 'ds', 'y']].copy()
        X_df = train_df.drop(['y', 'timestamp'], axis=1)
        X_df = create_calendar_variables(X_df=X_df)
        X_df = create_us_holiday_distance_variables(X_df=X_df)
        return Y_df, X_df

    @staticmethod
    def read_benchmark_df(directory: str, group: str) -> pd.DataFrame:
        """Load the benchmark forecasts of a task.

        Parameters
        ----------
        directory: str
            Directory where the dataset was downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        benchmark_df: pd.DataFrame
            Benchmark file of the task with 'unique_id' and an hourly 'ds'
            covering (test_start, test_end]. For every task except 'Task 15'
            the observed target 'y' is added from the next task's train data;
            for 'Task 15' no 'y' column is added.
        """

        # Meta data
        path = f'{directory}/gefcom2014'
        group_info = GEFCom2014_P_Info.get_group(group)
        task_number = re.findall(r"\d+", group)[0]
        filepath = f'{path}/Price/Task {task_number}/Benchmark{task_number}_P.csv'

        # Task 7 reads a differently named benchmark file.
        if group == 'Task 7':
            filepath = f'{path}/Price/Task {task_number}/Benchmark{task_number}_P_new3.csv'

        benchmark_df = pd.read_csv(filepath, index_col=None, header=0)
        # Left-open hourly range (test_start, test_end]; see read_train_df.
        benchmark_df['ds'] = pd.date_range(start=group_info.test_start,
                                           end=group_info.test_end, freq='H')[1:]

        benchmark_df = benchmark_df.drop('timestamp', axis=1)
        benchmark_df = benchmark_df.rename(columns={'ZONEID': 'unique_id'})

        # complete benchmark data with the target variable for task evaluation
        if group != 'Task 15':
            next_group = 'Task ' + str(int(task_number) + 1)

            Y_true_df, _ = GEFCom2014_P.read_train_df(directory, next_group)
            ds_filter = (Y_true_df['ds'] >= benchmark_df.ds.min()) & (Y_true_df['ds'] <= benchmark_df.ds.max())
            benchmark_df['y'] = Y_true_df[ds_filter].y.values
        return benchmark_df

    @staticmethod
    def load(directory: str,
             group: str) -> Tuple[pd.DataFrame,
                                  pd.DataFrame,
                                  pd.DataFrame]:
        """
        Downloads and loads GEFCom2014-P task data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'ds', 'y'].
        X_df: pd.DataFrame
            Exogenous time series (see `read_train_df`).
        benchmark_df: pd.DataFrame
            Benchmark time series from the gefcom2014 dataset.
        """
        GEFCom2014.download(directory)

        Y_df, X_df = GEFCom2014_P.read_train_df(directory, group)
        benchmark_df = GEFCom2014_P.read_benchmark_df(directory, group)
        return Y_df, X_df, benchmark_df

In [None]:
# Demo: load the GEFCom2014-P data for 'Task 15' and plot the whole target series.
Y_df, X_df, benchmark_df = GEFCom2014_P.load(directory='data', group='Task 15')

ds = Y_df.ds.values
y_true = Y_df.y.values

# Axis label shows the first and last timestamp of the series.
x_plot = Y_df.ds.values
x_plot_min = pd.to_datetime(x_plot.min()).strftime('%B %d, %Y')
x_plot_max = pd.to_datetime(x_plot.max()).strftime('%B %d, %Y')
x_axis_str = f'Hours [{x_plot_min}  to  {x_plot_max}]'
y_axis_str = 'Price [USD/MWh]'

fig = plt.figure(figsize=(15, 4))
# NOTE(review): called before any axes exist; likely has no effect — confirm.
fig.tight_layout()
ax0 = plt.subplot2grid((1,1),(0, 0))
axs = [ax0]

axs[0].plot(ds, y_true, color='#628793', linewidth=0.4, label='true')
axs[0].tick_params(labelsize=FONTSIZE-5)
axs[0].set_xlabel(x_axis_str, fontsize=FONTSIZE)
axs[0].set_ylabel(y_axis_str, fontsize=FONTSIZE)
plt.title('GEFCom2014-P', fontsize=FONTSIZE)
plt.grid()
plt.show()

# GEFCOM2014-W: Wind Power Generation Task Datasets
- Y: 10 target wind power series, for 10 different Australian wind farms.
- X: Wind forecasts at 10m and 100m height for the zonal (u) and meridional (v) wind components (winning submission used external data).
- Tasks: Fifteen one-month-ahead hourly wind power generation for 10 farms.

>GEFcom2014-W meta information

In [None]:
#export
# Evaluation windows of the GEFCom2014-W tasks. Each task covers one calendar
# month and consecutive tasks are contiguous (test_end of task k equals
# test_start of task k+1). Dates are month/day/year strings that are later
# handed to `pd.date_range`.
@dataclass
class WindTask1:
    test_start: str = '10/01/2012'
    test_end: str = '11/01/2012'

@dataclass
class WindTask2:
    test_start: str = '11/01/2012'
    test_end: str = '12/01/2012'

@dataclass
class WindTask3:
    test_start: str = '12/01/2012'
    test_end: str = '01/01/2013'

@dataclass
class WindTask4:
    test_start: str = '01/01/2013'
    test_end: str = '02/01/2013'

@dataclass
class WindTask5:
    test_start: str = '02/01/2013'
    test_end: str = '03/01/2013'

@dataclass
class WindTask6:
    test_start: str = '03/01/2013'
    test_end: str = '04/01/2013'

@dataclass
class WindTask7:
    test_start: str = '04/01/2013'
    test_end: str = '05/01/2013'

@dataclass
class WindTask8:
    test_start: str = '05/01/2013'
    test_end: str = '06/01/2013'

@dataclass
class WindTask9:
    test_start: str = '06/01/2013'
    test_end: str = '07/01/2013'

@dataclass
class WindTask10:
    test_start: str = '07/01/2013'
    test_end: str = '08/01/2013'

@dataclass
class WindTask11:
    test_start: str = '08/01/2013'
    test_end: str = '09/01/2013'

@dataclass
class WindTask12:
    test_start: str = '09/01/2013'
    test_end: str = '10/01/2013'

@dataclass
class WindTask13:
    test_start: str = '10/01/2013'
    test_end: str = '11/01/2013'

@dataclass
class WindTask14:
    test_start: str = '11/01/2013'
    test_end: str = '12/01/2013'

@dataclass
class WindTask15:
    test_start: str = '12/01/2013'
    test_end: str = '01/01/2014'


# Left anchor of the hourly timestamp range rebuilt for the wind data.
WIND_START = '01/01/2012'
# Group names are 'Task 1' ... 'Task 15' (note the space before the number).
WIND_TASKS = ['Task '+str(k) for k in range(1, 16)]
# Presumably `Info` pairs groups and class_groups positionally — confirm.
GEFCom2014_W_Info = Info(groups=WIND_TASKS,
                         class_groups=[WindTask1, WindTask2, WindTask3, WindTask4,
                                       WindTask5, WindTask6, WindTask7, WindTask8,
                                       WindTask9, WindTask10, WindTask11, WindTask12,
                                       WindTask13, WindTask14, WindTask15])

In [None]:
#export
class GEFCom2014_W:
    """Reader for the GEFCom2014-W (wind power generation) task datasets."""

    @staticmethod
    def read_train_df(directory: str, group: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load the train dataset of a task for the 10 wind zones.

        For every zone the target file is stacked on top of the
        explanatory-variables file (whose target is set to NaN), and an
        hourly `ds` column is rebuilt from `WIND_START` up to the task's
        test end.

        Parameters
        ----------
        directory: str
            Directory where the dataset was downloaded and the wind
            archives were extracted.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'ds', 'y'];
            'y' is NaN on the rows that come from the explanatory file.
        X_df: pd.DataFrame
            Exogenous time series: 'unique_id', 'ds' and the remaining
            columns of the zone files.
        """

        # Meta data
        path = f'{directory}/gefcom2014'
        task_number = int(re.findall(r"\d+", group)[0])
        group_info = GEFCom2014_W_Info.get_group(group)
        ydir = f'{path}/Wind/Task {task_number}/Task{task_number}_W_Zone1_10'
        xdir = f'{path}/Wind/Task {task_number}/TaskExpVars{task_number}_W_Zone1_10'

        # Left-open hourly range (start, test_end], identical for every zone.
        # Slicing off the first stamp replaces `closed='right'`, which was
        # removed in pandas 2.0.
        ds = pd.date_range(start=WIND_START,
                           end=group_info.test_end, freq='H')[1:]

        # Train data
        train_dfs = []
        for zone in range(1, 11):
            yfilepath = f'{ydir}/Task{task_number}_W_Zone{zone}.csv'
            xfilepath = f'{xdir}/TaskExpVars{task_number}_W_Zone{zone}.csv'

            y_df = pd.read_csv(yfilepath, index_col=None, header=0)
            x_df = pd.read_csv(xfilepath, index_col=None, header=0)
            x_df['TARGETVAR'] = np.nan  # target is not provided in this file
            # `DataFrame.append` was removed in pandas 2.0; concat is the
            # equivalent. The index is rebuilt by the final concat anyway.
            zone_df = pd.concat([y_df, x_df], axis=0, ignore_index=True)
            zone_df['ds'] = ds

            train_dfs.append(zone_df)

        train_df = pd.concat(train_dfs, axis=0, ignore_index=True)
        train_df = train_df.rename(columns={'ZONEID': 'unique_id', 'TARGETVAR': 'y'})

        Y_df = train_df[['unique_id', 'ds', 'y']].copy()
        X_df = train_df.drop(['y', 'TIMESTAMP'], axis=1)
        return Y_df, X_df

    @staticmethod
    def read_benchmark_df(directory: str, group: str) -> pd.DataFrame:
        """Load the benchmark forecasts of a task.

        Parameters
        ----------
        directory: str
            Directory where the dataset was downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        benchmark_df: pd.DataFrame
            Benchmark file of the task with 'ZONEID' renamed to 'unique_id'.
            No 'ds' or 'y' column is added here.
        """

        # Meta data
        path = f'{directory}/gefcom2014'
        task_number = int(re.findall(r"\d+", group)[0])
        # Result is unused; the lookup is kept so an unknown group is handled
        # by `Info.get_group` exactly as before.
        GEFCom2014_W_Info.get_group(group)
        benchmarkfilepath = f'{path}/Wind/Task {task_number}/benchmark{task_number}_W.csv'

        # Benchmark data
        benchmark_df = pd.read_csv(benchmarkfilepath, index_col=None, header=0)
        benchmark_df = benchmark_df.rename(columns={'ZONEID': 'unique_id'})
        return benchmark_df

    @staticmethod
    def load(directory: str,
             group: str) -> Tuple[pd.DataFrame,
                                  pd.DataFrame,
                                  pd.DataFrame]:
        """
        Downloads and loads GEFCom2014-W task data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'ds', 'y'].
        X_df: pd.DataFrame
            Exogenous time series (see `read_train_df`).
        benchmark_df: pd.DataFrame
            Benchmark time series from the gefcom2014 dataset.
        """

        GEFCom2014.download(directory)
        # The wind task archives are nested zips that need their own extraction.
        GEFCom2014.unzip_wind(directory)

        Y_df, X_df = GEFCom2014_W.read_train_df(directory, group)
        benchmark_df = GEFCom2014_W.read_benchmark_df(directory, group)
        return Y_df, X_df, benchmark_df

In [None]:
# Demo: load the GEFCom2014-W data for 'Task 15' and plot one zone.
Y_df, X_df, benchmark_df = GEFCom2014_W.load(directory='data', group='Task 15')

# Keep only the series whose unique_id is 2.
Y_df = Y_df[Y_df.unique_id==2]

ds = Y_df.ds.values
y_true = Y_df.y.values

# Axis label shows the first and last timestamp of the selected series.
x_plot = Y_df.ds.values
x_plot_min = pd.to_datetime(x_plot.min()).strftime('%B %d, %Y')
x_plot_max = pd.to_datetime(x_plot.max()).strftime('%B %d, %Y')
x_axis_str = f'Hours [{x_plot_min}  to  {x_plot_max}]'
y_axis_str = 'Power'

fig = plt.figure(figsize=(15, 4))
# NOTE(review): called before any axes exist; likely has no effect — confirm.
fig.tight_layout()
ax0 = plt.subplot2grid((1,1),(0, 0))
axs = [ax0]

axs[0].plot(ds, y_true, color='#628793', linewidth=0.4, label='true')
axs[0].tick_params(labelsize=FONTSIZE-5)
axs[0].set_xlabel(x_axis_str, fontsize=FONTSIZE)
axs[0].set_ylabel(y_axis_str, fontsize=FONTSIZE)
plt.title('GEFCom2014-W', fontsize=FONTSIZE)
plt.grid()
plt.show()

# GEFCom2014-S: Solar Power Generation Task Datasets
- Y: 3 target solar power series, for 3 different solar power plants.
- X: 12 weather variables associated to the solar power plants.
- Tasks: Fifteen one-month-ahead hourly solar power generation for 3 power plants.

>GEFcom2014-S meta information

In [None]:
#export
# Evaluation windows of the GEFCom2014-S tasks. Each task covers one calendar
# month and consecutive tasks are contiguous (test_end of task k equals
# test_start of task k+1). Dates are month/day/year strings that are later
# handed to `pd.date_range`.
@dataclass
class SolarTask1:
    test_start: str = '04/01/2013'
    test_end: str = '05/01/2013'

@dataclass
class SolarTask2:
    test_start: str = '05/01/2013'
    test_end: str = '06/01/2013'

@dataclass
class SolarTask3:
    test_start: str = '06/01/2013'
    test_end: str = '07/01/2013'

@dataclass
class SolarTask4:
    test_start: str = '07/01/2013'
    test_end: str = '08/01/2013'

@dataclass
class SolarTask5:
    test_start: str = '08/01/2013'
    test_end: str = '09/01/2013'

@dataclass
class SolarTask6:
    test_start: str = '09/01/2013'
    test_end: str = '10/01/2013'

@dataclass
class SolarTask7:
    test_start: str = '10/01/2013'
    test_end: str = '11/01/2013'

@dataclass
class SolarTask8:
    test_start: str = '11/01/2013'
    test_end: str = '12/01/2013'

@dataclass
class SolarTask9:
    test_start: str = '12/01/2013'
    test_end: str = '01/01/2014'

@dataclass
class SolarTask10:
    test_start: str = '01/01/2014'
    test_end: str = '02/01/2014'

@dataclass
class SolarTask11:
    test_start: str = '02/01/2014'
    test_end: str = '03/01/2014'

@dataclass
class SolarTask12:
    test_start: str = '03/01/2014'
    test_end: str = '04/01/2014'

@dataclass
class SolarTask13:
    test_start: str = '04/01/2014'
    test_end: str = '05/01/2014'

@dataclass
class SolarTask14:
    test_start: str = '05/01/2014'
    test_end: str = '06/01/2014'

@dataclass
class SolarTask15:
    test_start: str = '06/01/2014'
    test_end: str = '07/01/2014'


# Left anchor of the hourly timestamp range rebuilt for the solar data.
SOLAR_START = '04/01/2012'
# Group names are 'Task 1' ... 'Task 15' (note the space before the number).
SOLAR_TASKS = ['Task '+str(k) for k in range(1, 16)]
# Presumably `Info` pairs groups and class_groups positionally — confirm.
GEFCom2014_S_Info = Info(groups=SOLAR_TASKS,
                         class_groups=[SolarTask1, SolarTask2, SolarTask3, SolarTask4,
                                       SolarTask5, SolarTask6, SolarTask7, SolarTask8,
                                       SolarTask9, SolarTask10, SolarTask11, SolarTask12,
                                       SolarTask13, SolarTask14, SolarTask15])

In [None]:
#export
class GEFCom2014_S:
    """Readers for the solar track of GEFCom2014, one group per task.

    Each reader discards the ``TIMESTAMP`` column found in the csv and writes
    a rebuilt hourly ``ds`` column instead. The rebuilt timeline is repeated
    three times and assigned row by row, so the files are assumed to hold
    three equally long series stacked one after the other (presumably one per
    ``ZONEID`` -- TODO confirm against the raw files).
    """

    @staticmethod
    def _tiled_hourly_stamps(start: str, end: str, repeats: int = 3) -> np.ndarray:
        """Hourly stamps in the interval (start, end], repeated ``repeats`` times.

        Parameters
        ----------
        start: str
            Left edge of the range; the stamp falling on it is excluded.
        end: str
            Right edge of the range; included when it falls on the hourly grid.
        repeats: int
            How many times the whole timeline is concatenated with itself.

        Returns
        -------
        stamps: np.ndarray
            datetime64 array holding ``repeats`` copies of the timeline.
        """
        # The `closed` keyword of pd.date_range was removed in pandas 2.0 and
        # the 'H' alias is deprecated in recent releases. Stepping with a
        # Timedelta and dropping the first stamp (which is always `start`)
        # gives the same right-closed range on every pandas version.
        hourly = pd.date_range(start=start, end=end, freq=pd.Timedelta(hours=1))[1:]
        return np.tile(hourly.values, repeats)

    @staticmethod
    def read_train_df(directory: str, group: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load train dataset.

        Parameters
        ----------
        directory: str
            Directory containing the extracted 'gefcom2014' folder.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'y', 'ds'].
            'ds' spans (SOLAR_START, test_start] of the group.
        X_df: pd.DataFrame
            Exogenous time series with 'unique_id', the predictor columns
            as named in the file, and 'ds'.
            'ds' spans (SOLAR_START, test_end] of the group.
        """

        # Meta data
        task_number = int(re.findall(r"\d+", group)[0])
        group_info = GEFCom2014_S_Info.get_group(group)
        task_dir = f'{directory}/gefcom2014/Solar/Task {task_number}'
        yfilepath = f'{task_dir}/train{task_number}.csv'
        xfilepath = f'{task_dir}/predictors{task_number}.csv'

        # Targets stop at the beginning of the test window.
        Y_df = pd.read_csv(yfilepath, index_col=None, header=0).reset_index(drop=True)
        Y_df['ds'] = GEFCom2014_S._tiled_hourly_stamps(SOLAR_START,
                                                       group_info.test_start)
        Y_df = Y_df.drop(['TIMESTAMP'], axis=1)
        Y_df = Y_df.rename(columns={'ZONEID': 'unique_id', 'POWER': 'y'})

        # Predictors additionally cover the test window itself.
        X_df = pd.read_csv(xfilepath, index_col=None, header=0).reset_index(drop=True)
        X_df['ds'] = GEFCom2014_S._tiled_hourly_stamps(SOLAR_START,
                                                       group_info.test_end)
        X_df = X_df.drop(['TIMESTAMP'], axis=1)
        X_df = X_df.rename(columns={'ZONEID': 'unique_id'})
        return Y_df, X_df

    @staticmethod
    def read_benchmark_df(directory: str, group: str) -> pd.DataFrame:
        """Load benchmark time series.

        Parameters
        ----------
        directory: str
            Directory containing the extracted 'gefcom2014' folder.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        benchmark_df: pd.DataFrame
            Benchmark time series from the gefcom2014 dataset. Columns are
            kept as named in the file ('ZONEID' is not renamed), minus
            'TIMESTAMP', plus 'ds' spanning (test_start, test_end].
        """

        # Meta data
        group_info = GEFCom2014_S_Info.get_group(group)
        task_number = int(re.findall(r"\d+", group)[0])

        # The folder uses the plain task number, the benchmark file a
        # two-digit zero-padded one (e.g. 'Task 1/benchmark01.csv').
        benchmarkfilepath = (f'{directory}/gefcom2014/Solar/Task {task_number}'
                             f'/benchmark{task_number:02d}.csv')

        benchmark_df = pd.read_csv(benchmarkfilepath, index_col=None, header=0)
        benchmark_df = benchmark_df.reset_index(drop=True)
        benchmark_df['ds'] = GEFCom2014_S._tiled_hourly_stamps(group_info.test_start,
                                                               group_info.test_end)
        benchmark_df = benchmark_df.drop(['TIMESTAMP'], axis=1)
        return benchmark_df

    @staticmethod
    def load(directory: str,
             group: str) -> Tuple[pd.DataFrame,
                                  pd.DataFrame,
                                  pd.DataFrame]:
        """
        Downloads and loads GEFCom2014-S task data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'Task 1', 'Task 2', ..., 'Task 14', 'Task 15'.

        Returns
        -------
        Y_df: pd.DataFrame
            Target time series with columns ['unique_id', 'y', 'ds'].
        X_df: pd.DataFrame
            Exogenous time series with 'unique_id', the predictor columns
            as named in the file, and 'ds'.
        benchmark_df: pd.DataFrame
            Benchmark time series from the gefcom2014 dataset.
        """

        GEFCom2014.download(directory)

        Y_df, X_df = GEFCom2014_S.read_train_df(directory, group)
        benchmark_df = GEFCom2014_S.read_benchmark_df(directory, group)
        return Y_df, X_df, benchmark_df

In [None]:
# Demo: fetch the last solar task and plot the target series of one zone.
Y_df, X_df, benchmark_df = GEFCom2014_S.load(directory='data', group='Task 15')

# Keep only the rows whose unique_id is 2 so a single series is drawn.
Y_df = Y_df[Y_df.unique_id==2]

ds = Y_df.ds.values
y_true = Y_df.y.values

# Human-readable date span for the x-axis label.
x_plot = Y_df.ds.values
x_plot_min = pd.to_datetime(x_plot.min()).strftime('%B %d, %Y')
x_plot_max = pd.to_datetime(x_plot.max()).strftime('%B %d, %Y')
x_axis_str = f'Hours [{x_plot_min}  to  {x_plot_max}]'
y_axis_str = 'Power'

fig, ax0 = plt.subplots(figsize=(15, 4))
axs = [ax0]

axs[0].plot(ds, y_true, color='#628793', linewidth=0.4, label='true')
axs[0].tick_params(labelsize=FONTSIZE-5)
axs[0].set_xlabel(x_axis_str, fontsize=FONTSIZE)
axs[0].set_ylabel(y_axis_str, fontsize=FONTSIZE)
axs[0].set_title('GEFCom2014-S', fontsize=FONTSIZE)
axs[0].grid()

# tight_layout only accounts for artists that already exist, so it has to run
# after the axes, labels and title are in place (it used to be called on the
# empty figure, where it had nothing to fit).
fig.tight_layout()
plt.show()