In [1]:
#default_exp data.datasets.epf

# EPF dataset

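> Download and load the EPF (electricity price forecasting) datasets, one CSV file per group: NP, PJM, BE, FR and DE.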

In [2]:
#export
from dataclasses import dataclass
from datetime import timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset

from nixtla.data.datasets.utils import download_file, Info, TimeSeriesDataclass
from nixtla.data.tsdataset import TimeSeriesDataset


In [3]:
#export
# Base URL of the Zenodo (sandbox) file bucket holding the datasets;
# EPF.download appends '<group>.csv' to it for every group in EPFInfo.groups.
SOURCE_URL = 'https://sandbox.zenodo.org/api/files/da5b2c6f-8418-4550-a7d0-7f2497b40f1b/'

EPF meta information

In [4]:
#export
@dataclass
class NP:
    """Metadata for the 'NP' group of the EPF dataset."""
    # First timestamp of the test split: EPF.load keeps rows with
    # ds < test_date for training and ds >= test_date for testing.
    test_date: str = '2016-12-27'
    # Group identifier, matching the entry in EPFInfo.groups.
    name: str = 'NP'

@dataclass
class PJM:
    """Metadata for the 'PJM' group of the EPF dataset."""
    # First timestamp of the test split: EPF.load keeps rows with
    # ds < test_date for training and ds >= test_date for testing.
    test_date: str = '2016-12-27'
    # Group identifier, matching the entry in EPFInfo.groups.
    name: str = 'PJM'

@dataclass
class BE:
    """Metadata for the 'BE' group of the EPF dataset."""
    # First timestamp of the test split: EPF.load keeps rows with
    # ds < test_date for training and ds >= test_date for testing.
    test_date: str = '2015-01-04'
    # Group identifier, matching the entry in EPFInfo.groups.
    name: str = 'BE'

@dataclass
class FR:
    """Metadata for the 'FR' group of the EPF dataset."""
    # First timestamp of the test split: EPF.load keeps rows with
    # ds < test_date for training and ds >= test_date for testing.
    test_date: str = '2015-01-04'
    # Group identifier, matching the entry in EPFInfo.groups.
    name: str = 'FR'

@dataclass
class DE:
    """Metadata for the 'DE' group of the EPF dataset."""
    # First timestamp of the test split: EPF.load keeps rows with
    # ds < test_date for training and ds >= test_date for testing.
    test_date: str = '2016-01-04'
    # Group identifier, matching the entry in EPFInfo.groups.
    name: str = 'DE'

In [5]:
#export
# Registry pairing every group name with its metadata dataclass. The names are
# read from each dataclass' `name` default so both tuples share the same order.
EPFInfo = Info(
    groups=tuple(meta.name for meta in (NP, PJM, BE, FR, DE)),
    class_groups=(NP, PJM, BE, FR, DE),
)

In [6]:
#export
class EPF:
    """Downloader and loader for the EPF datasets (one CSV file per group).

    Every method is static, so the class is used as a namespace:
    ``EPF.load(...)``, ``EPF.load_groups(...)``, ``EPF.download(...)``.
    """

    @staticmethod
    def load(directory: str,
             group: str,
             training: bool = True,
             days_in_test: int = 728,
             return_tensor: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Downloads and loads EPF data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'NP', 'PJM', 'BE', 'FR', 'DE'.
        training: bool
            Whether to return training or testing data. Default True.
        days_in_test: int
            Number of days to consider in test, counted from the
            group's test_date. Only used when training=False.
        return_tensor: bool
            Currently ignored: dataframes are always returned.
            Kept so existing callers do not break.

        Returns
        -------
        Tuple[pd.DataFrame, pd.DataFrame]
            Y with columns ['unique_id', 'ds', 'y'] and X with
            'unique_id', 'ds', the first two exogenous variables (when
            present), 'week_day' and the day-of-week dummies.
        """
        path = Path(directory) / 'epf' / 'datasets'

        EPF.download(directory)

        class_group = EPFInfo.get_group(group)

        df = pd.read_csv(path / f'{group}.csv')

        # Columns are renamed by position: timestamp, target, then the
        # remaining ones as Exogenous1, Exogenous2, ...
        df.columns = ['ds', 'y'] + \
                     [f'Exogenous{i}' for i in range(1, len(df.columns) - 1)]

        df['unique_id'] = group
        df['ds'] = pd.to_datetime(df['ds'])
        df['week_day'] = df['ds'].dt.dayofweek

        # The dummies are built before the split so that train and test
        # expose the same day_* columns.
        dummies = pd.get_dummies(df['week_day'], prefix='day')
        df = pd.concat([df, dummies], axis=1)
        dummies_cols = list(dummies.columns)

        test_start = pd.to_datetime(class_group.test_date)
        if training:
            df = df[df['ds'] < test_start]
        else:
            # The test window is [test_date, test_date + days_in_test).
            test_end = test_start + timedelta(days=days_in_test)
            df = df[(df['ds'] >= test_start) & (df['ds'] < test_end)]

        Y = df.filter(items=['unique_id', 'ds', 'y'])
        X = df.filter(items=['unique_id', 'ds', 'Exogenous1', 'Exogenous2', 'week_day'] + \
                      dummies_cols)

        return Y, X

    @staticmethod
    def load_groups(directory: str,
                    groups: List[str] = ['BE', 'FR'],
                    training: bool = True,
                    days_in_test: int = 728,
                    return_tensor: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Downloads and loads a panel of EPF data
        for the requested groups.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        groups: List[str]
            Group names. The default list is never mutated.
            Allowed groups: 'NP', 'PJM', 'BE', 'FR', 'DE'.
        training: bool
            Whether to return training or testing data. Default True.
        days_in_test: int
            Number of days to consider in test, counted from each
            group's test_date. Only used when training=False.
        return_tensor: bool
            Currently ignored: dataframes are always returned.
            Kept so existing callers do not break.

        Returns
        -------
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
            Y (target), X (temporal features) and S (one row per
            'unique_id' plus its static_* dummies). Y and X are sorted by
            ['unique_id', 'ds'].

        Raises
        ------
        ValueError
            If groups is empty.
        """
        if len(groups) == 0:
            raise ValueError('groups must contain at least one group name')

        Y = []
        X = []
        for group in groups:
            Y_df, X_df = EPF.load(directory, group,
                                  training, days_in_test,
                                  return_tensor=False)
            Y.append(Y_df)
            X.append(X_df)
        Y = pd.concat(Y).sort_values(['unique_id', 'ds']).reset_index(drop=True)
        X = pd.concat(X).sort_values(['unique_id', 'ds']).reset_index(drop=True)

        # Static features: a one-hot encoding of the series identifier.
        S = Y[['unique_id']].drop_duplicates().reset_index(drop=True)
        dummies = pd.get_dummies(S['unique_id'], prefix='static')
        S = pd.concat([S, dummies], axis=1)

        return Y, X, S

    @staticmethod
    def download(directory: str) -> None:
        """Downloads the EPF dataset files that are not on disk yet.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded; files are stored
            under '<directory>/epf/datasets'.
        """
        path = Path(directory) / 'epf' / 'datasets'
        for group in EPFInfo.groups:
            # Check file by file so an interrupted or partial download is
            # repaired instead of being skipped because the folder exists.
            if not (path / f'{group}.csv').exists():
                download_file(path, SOURCE_URL + f'{group}.csv')

## Load specific group

In [None]:
args = pd.Series({'dataset': 'NP'})

Y_df, Xt_df = EPF.load(directory='data', group=args.dataset)

# train_mask: 1 to keep (train), 0 to mask (validation).
# The last `offset` rows are held out for validation (two years' worth when
# the rows are hourly).
offset = 365 * 24 * 2
train_outsample_mask = np.ones(len(Y_df))
train_outsample_mask[-offset:] = 0

# The mask is 1 on training rows, so its sum is the training size and the sum
# of its complement is the validation size.
train_hours = np.sum(train_outsample_mask)
validation_hours = np.sum(1 - train_outsample_mask)

print(f'Dataset: {args.dataset}')
print(f'Train mask percentage: {np.round(np.sum(train_outsample_mask)/len(train_outsample_mask),2)}')
print('X: time series features, of shape (#hours, #times,#features): \t' + str(Xt_df.shape))
print('Y: target series (in X), of shape (#hours, #times): \t \t' + str(Y_df.shape))
print(f'Last ds {Y_df.ds.max()}')
print(f'Train {train_hours} hours = {np.round(train_hours/(24*365),2)} years')
print(f'Validation {validation_hours} hours = {np.round(validation_hours/(24*365),2)} years')
print('\n')

## Load all groups

In [88]:
def get_last_n_hours(Y_df, n_hours):
    """Flag the last `n_hours` rows of every series.

    Parameters
    ----------
    Y_df: pd.DataFrame
        Frame with at least the columns 'unique_id' and 'ds'. It is not
        modified; an existing 'last' column is discarded and recomputed.
    n_hours: int
        Number of most recent rows (ordered by 'ds') to flag per 'unique_id'.

    Returns
    -------
    pd.DataFrame
        New frame sorted by ['unique_id', 'ds'] with an extra column 'last'
        equal to 1 on the flagged rows and 0 elsewhere.
    """
    # Work on a copy without the stale flag instead of deleting the column
    # from the caller's frame.
    Y_df = Y_df.drop(columns='last', errors='ignore')

    # Sorting in descending order puts the most recent rows of each series
    # first, so head(n_hours) picks exactly the rows to flag.
    last_df = (Y_df[['unique_id', 'ds']]
               .sort_values(by=['unique_id', 'ds'], ascending=False)
               .groupby('unique_id')
               .head(n_hours)
               .assign(last=1))

    # Rows without a match in last_df get NaN from the left join -> 0.
    Y_df = Y_df.merge(last_df, on=['unique_id', 'ds'], how='left')
    Y_df['last'] = Y_df['last'].fillna(0)

    return Y_df.sort_values(by=['unique_id', 'ds'])

In [89]:
val_ds = 2 * 365
args = pd.Series({'dataset': ['NP', 'PJM', 'BE', 'FR']})

Y_df, Xt_df, S_df = EPF.load_groups(directory='data', groups=args.dataset)

# outsample mask: 1 on the last `val_ds` days (in hourly rows) of every series,
# 0 on the earlier (insample) rows.
Y_df = get_last_n_hours(Y_df, n_hours=val_ds*24)
train_outsample_mask = Y_df['last'].values

print(f'Dataset: {args.dataset}')
print('X: time series features, of shape (#hours, #times,#features): \t' + str(Xt_df.shape))
print('S: static features, of shape (#series,#features+unique_id): \t' + str(S_df.shape))
print('Y: target series (in X), of shape (#hours, #times): \t \t' + str(Y_df.shape))
print("\n")

print("Train Validation splits")
# Y_df carries the 'last' flag; the frame built inside get_last_n_hours is
# local to that function and does not exist in the notebook namespace.
print(Y_df.groupby(['last', 'unique_id']).ds.max())
print(Y_df.groupby(['last', 'unique_id']).ds.min())
print(f'Train insample percentage {np.round(sum(1-train_outsample_mask)/len(Y_df),2)}, \
        {sum(1-train_outsample_mask)} hours = {np.round(sum(1-train_outsample_mask)/(24*365),2)} years')
print(f'Train outsample percentage {np.round(sum(train_outsample_mask)/len(Y_df),2)}, \
        {sum(train_outsample_mask)} hours = {np.round(sum(train_outsample_mask)/(24*365),2)} years')
print('\n')

Dataset: ['NP', 'PJM', 'BE', 'FR']
X: time series features, of shape (#hours, #times,#features): 	(139776, 12)
S: static features, of shape (#series,#features+unique_id): 	(4, 5)
Y: target series (in X), of shape (#hours, #times): 	 	(139776, 4)


Train Validation splits
last  unique_id
0.0   BE          2013-01-03 23:00:00
      FR          2013-01-03 23:00:00
      NP          2014-12-27 23:00:00
      PJM         2014-12-27 23:00:00
1.0   BE          2015-01-03 23:00:00
      FR          2015-01-03 23:00:00
      NP          2016-12-26 23:00:00
      PJM         2016-12-26 23:00:00
Name: ds, dtype: datetime64[ns]
last  unique_id
0.0   BE          2011-01-09
      FR          2011-01-09
      NP          2013-01-01
      PJM         2013-01-01
1.0   BE          2013-01-04
      FR          2013-01-04
      NP          2014-12-28
      PJM         2014-12-28
Name: ds, dtype: datetime64[ns]
Train insample percentage 0.5,         69696.0 hours = 7.96 years
Train outsample percentage 0.5