In [None]:
#default_exp data.datasets.epf

# EPF dataset

> Utilities to download and load the EPF datasets (groups NP, PJM, BE, FR and DE).

In [None]:
#export
from dataclasses import dataclass
from datetime import timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from pandas.tseries.frequencies import to_offset

from nixtla.data.datasets.utils import download_file, Info, TimeSeriesDataclass

In [None]:
#export
# Base URL of the remote files; each group is fetched from
# SOURCE_URL + '<group>.csv' (see `EPF.download`).
SOURCE_URL = 'https://sandbox.zenodo.org/api/files/da5b2c6f-8418-4550-a7d0-7f2497b40f1b/'

EPF meta information

In [None]:
#export
@dataclass
class NP:
    """Metadata for the 'NP' EPF group."""
    # Date that splits the series: `EPF.load` puts rows with ds < test_date
    # in the training data, and the testing data starts at test_date.
    test_date: str = '2016-12-27'
    # Group identifier, matching the entry in `EPFInfo.groups`.
    name: str = 'NP'
        
@dataclass
class PJM:
    """Metadata for the 'PJM' EPF group."""
    # Date that splits the series: `EPF.load` puts rows with ds < test_date
    # in the training data, and the testing data starts at test_date.
    test_date: str = '2016-12-27'
    # Group identifier, matching the entry in `EPFInfo.groups`.
    name: str = 'PJM'
        
@dataclass
class BE:
    """Metadata for the 'BE' EPF group."""
    # Date that splits the series: `EPF.load` puts rows with ds < test_date
    # in the training data, and the testing data starts at test_date.
    test_date: str = '2015-01-04'
    # Group identifier, matching the entry in `EPFInfo.groups`.
    name: str = 'BE'
        
@dataclass
class FR:
    """Metadata for the 'FR' EPF group."""
    # Date that splits the series: `EPF.load` puts rows with ds < test_date
    # in the training data, and the testing data starts at test_date.
    test_date: str = '2015-01-04'
    # Group identifier, matching the entry in `EPFInfo.groups`.
    name: str = 'FR'
        
@dataclass
class DE:
    """Metadata for the 'DE' EPF group."""
    # Date that splits the series: `EPF.load` puts rows with ds < test_date
    # in the training data, and the testing data starts at test_date.
    test_date: str = '2016-01-04'
    # Group identifier, matching the entry in `EPFInfo.groups`.
    name: str = 'DE'

In [None]:
#export
# Registry of the available EPF groups. `EPF.load` calls
# `EPFInfo.get_group(name)` to obtain a group's metadata class and
# `EPF.download` iterates over `EPFInfo.groups`.
# NOTE(review): `groups` and `class_groups` look positionally paired
# (same order) -- confirm against `Info` before reordering either tuple.
EPFInfo = Info(groups=('NP', 'PJM', 'BE', 'FR', 'DE'),
               class_groups=(NP, PJM, BE, FR, DE))

In [None]:
#export
class EPF(TimeSeriesDataclass):
    """Loader for the EPF datasets.

    Every group is read from a CSV file named ``<group>.csv`` located in
    ``<directory>/epf/datasets``; missing files are downloaded first.
    """

    @staticmethod
    def load(directory: str,
             group: str,
             training: bool = True,
             days_in_test: int = 728) -> 'EPF':
        """
        Downloads and loads EPF data.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        group: str
            Group name.
            Allowed groups: 'NP', 'PJM', 'BE', 'FR', 'DE'.
        training: bool
            Whether to return training or testing data. Default True.
        days_in_test: int
            Number of days to consider in test, counted from the
            group's `test_date`.
            Only used when training=False.

        Returns
        -------
        EPF
            Instance with `Y` (columns unique_id, ds, y), `X` (unique_id,
            ds, Exogenous1/Exogenous2 when present in the file, and the
            day-of-week dummy columns), `S=None` and `group=group`.
        """
        path = Path(directory) / 'epf' / 'datasets'

        EPF.download(directory)

        class_group = EPFInfo.get_group(group)

        df = pd.read_csv(path / f'{group}.csv')

        # Positional renaming: first column is the timestamp, second the
        # target, every remaining column becomes Exogenous1..ExogenousN.
        df.columns = ['ds', 'y'] + \
                     [f'Exogenous{i}' for i in range(1, len(df.columns) - 1)]

        df['unique_id'] = group
        df['ds'] = pd.to_datetime(df['ds'])
        df['week_day'] = df['ds'].dt.dayofweek

        # One-hot encode the day of the week (day_0 ... day_6).
        dummies = pd.get_dummies(df['week_day'], prefix='day')
        df = pd.concat([df, dummies], axis=1)

        dummies_cols = [col for col in df.columns
                        if col.startswith(('day', 'hour_'))]

        test_start = pd.to_datetime(class_group.test_date)
        if training:
            df = df[df['ds'] < test_start]
        else:
            # The test window is [test_date, test_date + days_in_test).
            # The upper bound was previously computed but never applied,
            # which made `days_in_test` a no-op.
            test_end = test_start + timedelta(days=days_in_test)
            df = df[(df['ds'] >= test_start) & (df['ds'] < test_end)]

        # `filter(items=...)` silently skips names that are not present.
        Y = df.filter(items=['unique_id', 'ds', 'y'])
        X = df.filter(items=['unique_id', 'ds', 'Exogenous1', 'Exogenous2'] + \
                      dummies_cols)

        return EPF(Y=Y, S=None, X=X, group=group)

    @staticmethod
    def load_groups(directory: str,
                    groups: List[str] = ['BE', 'FR'],
                    training: bool = True,
                    days_in_test: int = 728) -> 'EPF':
        """
        Downloads and loads panel of EPF data
        according to groups.

        Parameters
        ----------
        directory: str
            Directory where data will be downloaded.
        groups: List[str]
            Group names.
            Allowed groups: 'NP', 'PJM', 'BE', 'FR', 'DE'.
        training: bool
            Whether to return training or testing data. Default True.
        days_in_test: int
            Number of days to consider in test.
            Only used when training=False.

        Returns
        -------
        EPF
            Instance whose `Y` and `X` stack the requested groups (sorted
            by unique_id and ds) and whose `S` holds one row per
            unique_id with `static_<group>` dummy columns.
        """
        # Copy so the returned object never aliases the shared default list.
        groups = list(groups)

        Y = []
        X = []
        for group in groups:
            data = EPF.load(directory, group,
                            training, days_in_test)
            Y.append(data.Y)
            X.append(data.X)
        Y = pd.concat(Y).sort_values(['unique_id', 'ds']).reset_index(drop=True)
        X = pd.concat(X).sort_values(['unique_id', 'ds']).reset_index(drop=True)

        # Static features: a one-hot indicator of the group per series.
        S = Y[['unique_id']].drop_duplicates().reset_index(drop=True)
        dummies = pd.get_dummies(S['unique_id'], prefix='static')
        S = pd.concat([S, dummies], axis=1)

        return EPF(Y=Y, X=X, S=S, group=groups)

    @staticmethod
    def download(directory: str) -> None:
        """Downloads EPF Dataset.

        Files are fetched into ``<directory>/epf/datasets``. Each group's
        CSV is checked individually, so an interrupted or partial download
        is completed on the next call instead of being skipped because the
        directory already exists.
        """
        path = Path(directory) / 'epf' / 'datasets'
        for group in EPFInfo.groups:
            filename = f'{group}.csv'
            # `load` reads `path / filename`, so that is the file to test for.
            if not (path / filename).exists():
                download_file(path, SOURCE_URL + filename)

## Load a specific group

In [None]:
# Load the 'PJM' group from the local 'data' directory (downloaded on first
# use): once with the default training=True and once with training=False.
pjm = EPF.load('data', group='PJM')
pjm_test = EPF.load('data', group='PJM', training=False)

In [None]:
# Show the first two rows of the target (Y) and exogenous (X) frames in `pjm`.
print('Y train: \n', pjm.Y.head(2)) 
print('\nX train: \n', pjm.X.head(2))

Y train: 
   unique_id                  ds          y
0       PJM 2013-01-01 00:00:00  25.464211
1       PJM 2013-01-01 01:00:00  23.554578

X train: 
   unique_id                  ds  Exogenous1  Exogenous2  day_0  day_1  day_2  \
0       PJM 2013-01-01 00:00:00     85049.0     11509.0      0      1      0   
1       PJM 2013-01-01 01:00:00     82128.0     10942.0      0      1      0   

   day_3  day_4  day_5  day_6  
0      0      0      0      0  
1      0      0      0      0  


In [None]:
# Show the first two rows of the target (Y) and exogenous (X) frames in `pjm_test`.
print('Y test: \n', pjm_test.Y.head(2)) 
print('\nX test: \n', pjm_test.X.head(2))

Y test: 
       unique_id                  ds          y
34944       PJM 2016-12-27 00:00:00  19.113045
34945       PJM 2016-12-27 01:00:00  18.042768

X test: 
       unique_id                  ds  Exogenous1  Exogenous2  day_0  day_1  \
34944       PJM 2016-12-27 00:00:00     74616.0     10214.0      0      1   
34945       PJM 2016-12-27 01:00:00     71821.0      9702.0      0      1   

       day_2  day_3  day_4  day_5  day_6  
34944      0      0      0      0      0  
34945      0      0      0      0      0  


## Load more than one group at a time

In [None]:
groups = ['PJM', 'NP', 'FR', 'BE', 'DE']
# Training panel stacking all five groups.
pjm = EPF.load_groups('data', groups=groups)
# Test panel: training=False must be passed explicitly, otherwise the
# training split is loaded a second time.
pjm_test = EPF.load_groups('data', groups=groups, training=False)

In [None]:
# Show the first two rows of the target (Y), exogenous (X) and static (S)
# frames held by `pjm`.
print('Y train: \n', pjm.Y.head(2)) 
print('\nX train: \n', pjm.X.head(2))
print('\nS train: \n', pjm.S.head(2))

Y train: 
   unique_id                  ds      y
0        BE 2011-01-09 00:00:00  32.54
1        BE 2011-01-09 01:00:00  21.55

X train: 
   unique_id                  ds  Exogenous1  Exogenous2  day_0  day_1  day_2  \
0        BE 2011-01-09 00:00:00     63065.0     63000.0      0      0      0   
1        BE 2011-01-09 01:00:00     62715.0     58800.0      0      0      0   

   day_3  day_4  day_5  day_6  
0      0      0      0      1  
1      0      0      0      1  

S train: 
   unique_id  static_BE  static_DE  static_FR  static_NP  static_PJM
0        BE          1          0          0          0           0
1        DE          0          1          0          0           0


In [None]:
# Show the first two rows of the target (Y), exogenous (X) and static (S)
# frames held by `pjm_test`.
print('Y test: \n', pjm_test.Y.head(2)) 
print('\nX test: \n', pjm_test.X.head(2))
print('\nS test: \n', pjm_test.S.head(2))

Y test: 
   unique_id                  ds      y
0        BE 2011-01-09 00:00:00  32.54
1        BE 2011-01-09 01:00:00  21.55

X test: 
   unique_id                  ds  Exogenous1  Exogenous2  day_0  day_1  day_2  \
0        BE 2011-01-09 00:00:00     63065.0     63000.0      0      0      0   
1        BE 2011-01-09 01:00:00     62715.0     58800.0      0      0      0   

   day_3  day_4  day_5  day_6  
0      0      0      0      1  
1      0      0      0      1  

S test: 
   unique_id  static_BE  static_DE  static_FR  static_NP  static_PJM
0        BE          1          0          0          0           0
1        DE          0          1          0          0           0
