In [None]:
#| default_exp hierarchical

# Hierarchical Datasets

> Module for Hierarchical Datasets.

In [None]:
#| export
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd

from datasetsforecast.utils import download_file, Info

In [None]:
#| export
@dataclass
class Labour:
    """Static metadata for the `Labour` hierarchical dataset.

    Holds only default values; no data is loaded by this class.
    """
    # Frequency string of the series (looks like a pandas offset alias -- confirm).
    freq: str = 'MS'
    # Forecast horizon, in observations.
    horizon: int = 8
    # Second horizon setting; its exact role is not visible in this module -- TODO confirm.
    horizon2: int = 12
    # Seasonal period, in observations (presumably) -- confirm against consumers.
    seasonality: int = 12
    # Test observations per series; equals observations // 4 in the notebook's summary table.
    test_size: int = 125
    # Names of the aggregation levels; `HierarchicalData.load` zips them, in this
    # order, with the levels it recovers from the summing matrix.
    tags_names: Tuple[str, ...] = (
        'Country',
        'Country/Region',
        'Country/Gender/Region',
        'Country/Employment/Gender/Region',
    )

In [None]:
#| export
@dataclass
class TourismLarge:
    """Static metadata for the `TourismLarge` hierarchical dataset.

    Holds only default values; no data is loaded by this class.
    """
    # Frequency string of the series (looks like a pandas offset alias -- confirm).
    freq: str = 'MS'
    # Forecast horizon, in observations.
    horizon: int = 12
    # Second horizon setting; its exact role is not visible in this module -- TODO confirm.
    horizon2: int = 12
    # Seasonal period, in observations (presumably) -- confirm against consumers.
    seasonality: int = 12
    # Test observations per series; equals observations // 4 in the notebook's summary table.
    test_size: int = 57
    # Names of the aggregation levels; `HierarchicalData.load` zips them, in this
    # order, with the levels it recovers from the summing matrix.
    tags_names: Tuple[str, ...] = (
        'Country',
        'Country/State',
        'Country/State/Zone',
        'Country/State/Zone/Region',
        'Country/Purpose',
        'Country/State/Purpose',
        'Country/State/Zone/Purpose',
        'Country/State/Zone/Region/Purpose',
    )

In [None]:
#| export
@dataclass
class TourismSmall:
    """Static metadata for the `TourismSmall` hierarchical dataset.

    Holds only default values; no data is loaded by this class.
    """
    # Frequency string of the series (looks like a pandas offset alias -- confirm).
    freq: str = 'Q'
    # Forecast horizon, in observations.
    horizon: int = 4
    # Second horizon setting; its exact role is not visible in this module -- TODO confirm.
    horizon2: int = 4
    # Seasonal period, in observations (presumably) -- confirm against consumers.
    seasonality: int = 4
    # Test observations per series; equals observations // 4 in the notebook's summary table.
    test_size: int = 9
    # Names of the aggregation levels; `HierarchicalData.load` zips them, in this
    # order, with the levels it recovers from the summing matrix.
    tags_names: Tuple[str, ...] = (
        'Country',
        'Country/Purpose',
        'Country/Purpose/State',
        'Country/Purpose/State/CityNonCity',
    )

In [None]:
#| export
@dataclass
class Traffic:
    """Static metadata for the `Traffic` hierarchical dataset.

    Holds only default values; no data is loaded by this class.
    """
    # Frequency string of the series (looks like a pandas offset alias -- confirm).
    freq: str = 'D'
    # Forecast horizon, in observations.
    horizon: int = 14
    # Second horizon setting; its exact role is not visible in this module -- TODO confirm.
    horizon2: int = 7
    # Seasonal period, in observations (presumably) -- confirm against consumers.
    seasonality: int = 7
    # Test observations per series; equals observations // 4 in the notebook's summary table.
    test_size: int = 91
    # Names of the aggregation levels; `HierarchicalData.load` zips them, in this
    # order, with the levels it recovers from the summing matrix.
    tags_names: Tuple[str, ...] = (
        'Level1',
        'Level2',
        'Level3',
        'Level4',
    )

In [None]:
#| export
@dataclass
class Wiki2:
    """Static metadata for the `Wiki2` hierarchical dataset.

    Holds only default values; no data is loaded by this class.
    """
    # Frequency string of the series (looks like a pandas offset alias -- confirm).
    freq: str = 'D'
    # Forecast horizon, in observations.
    horizon: int = 14
    # Second horizon setting; its exact role is not visible in this module -- TODO confirm.
    horizon2: int = 7
    # Seasonal period, in observations (presumably) -- confirm against consumers.
    seasonality: int = 7
    # Test observations per series; equals observations // 4 in the notebook's summary table.
    test_size: int = 91
    # Names of the aggregation levels; `HierarchicalData.load` zips them, in this
    # order, with the levels it recovers from the summing matrix.
    tags_names: Tuple[str, ...] = (
        'Views',
        'Views/Country',
        'Views/Country/Access',
        'Views/Country/Access/Agent',
        'Views/Country/Access/Agent/Topic',
    )

In [None]:
#| export
@dataclass
class OldTraffic:
    """Static metadata for the `OldTraffic` hierarchical dataset.

    Holds only default values; no data is loaded by this class.
    """
    # Frequency string of the series (looks like a pandas offset alias -- confirm).
    freq: str = 'D'
    # Forecast horizon, in observations.
    horizon: int = 1
    # Second horizon setting; its exact role is not visible in this module -- TODO confirm.
    horizon2: int = 1
    # Seasonal period, in observations (presumably) -- confirm against consumers.
    seasonality: int = 7
    # Test observations per series; equals observations // 4 in the notebook's summary table.
    test_size: int = 91
    # Names of the aggregation levels; `HierarchicalData.load` zips them, in this
    # order, with the levels it recovers from the summing matrix.
    tags_names: Tuple[str, ...] = (
        'Level1',
        'Level2',
        'Level3',
        'Level4',
    )

In [None]:
#| export
@dataclass
class OldTourismLarge:
    """Static metadata for the `OldTourismLarge` hierarchical dataset.

    Holds only default values; no data is loaded by this class.
    """
    # Frequency string of the series (looks like a pandas offset alias -- confirm).
    freq: str = 'MS'
    # Forecast horizon, in observations.
    horizon: int = 12
    # Second horizon setting; its exact role is not visible in this module -- TODO confirm.
    horizon2: int = 12
    # Seasonal period, in observations (presumably) -- confirm against consumers.
    seasonality: int = 12
    # Test observations per series; equals observations // 4 in the notebook's summary table.
    test_size: int = 57
    # Names of the aggregation levels; `HierarchicalData.load` zips them, in this
    # order, with the levels it recovers from the summing matrix.
    tags_names: Tuple[str, ...] = (
        'Country',
        'Country/State',
        'Country/State/Zone',
        'Country/State/Zone/Region',
        'Country/Purpose',
        'Country/State/Purpose',
        'Country/State/Zone/Purpose',
        'Country/State/Zone/Region/Purpose',
    )

In [None]:
#| export
# Registry of the dataset metadata classes. `HierarchicalData.load` validates
# the requested group against `HierarchicalInfo.groups` and looks the class up
# with `HierarchicalInfo[group]`.
HierarchicalInfo = Info((
    Labour,
    TourismLarge,
    TourismSmall,
    Traffic,
    Wiki2,
    OldTraffic,
    OldTourismLarge,
))

In [None]:
#| export
class HierarchicalData:
    """
    Loader for the hierarchical forecasting benchmark datasets.

    The class only groups the download URLs and static helpers; it keeps no
    instance state.
    """

    source_url: str = 'https://nixtla-public.s3.amazonaws.com/hierarchical-data/datasets.zip'
    source_url_old_traffic: str = 'https://www.dropbox.com/s/4nl5afkdr4djpuy/OldTraffic.zip?dl=1'
    source_url_old_tourisml: str = 'https://www.dropbox.com/s/ye78jnujhbxyggo/OldTourismLarge.zip?dl=1'

    @staticmethod
    def _get_levels_from_S(S_df: pd.DataFrame) -> list:
        """
        Split the rows of the summing matrix into aggregation levels.

            Parameters
            ----------
            S_df: pd.DataFrame
                Summing matrix whose rows are ordered level by level, with the
                first row forming the top level on its own.

            Returns
            -------
            levels: list
                One array of row labels per level, in row order.
        """
        # The rows of one complete level add up to the number of columns of S,
        # so a level ends wherever the running total of the row sums reaches a
        # multiple of that number.
        cut_idxs, = np.where(S_df.sum(axis=1).cumsum() % S_df.shape[1] == 0.)
        levels = [
            S_df.iloc[(cut_idxs[i] + 1):(cut_idxs[i + 1] + 1)].index.values
            for i in range(cut_idxs.size - 1)
        ]
        levels = [S_df.iloc[[0]].index.values] + levels
        # Every row must have been assigned to exactly one level.
        assert sum(len(lv) for lv in levels) == S_df.shape[0]
        return levels

    @staticmethod
    def load(directory: str,
             group: str,
             cache: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame, dict]:
        """
        Downloads hierarchical forecasting benchmark datasets.

            Parameters
            ----------
            directory: str
                Directory where data will be downloaded.
            group: str
                Group name.
            cache: bool
                If `True`, loads the processed dataset from a pickle file when
                one exists and saves it there otherwise.

            Returns
            -------
            Y_df: pd.DataFrame
                Target time series with columns ['unique_id', 'ds', 'y'].
                Contains the base time series.
            S_df: pd.DataFrame
                Summing matrix of size (hierarchies, bottom).
            tags: dict
                Maps each level name in the group's `tags_names` to the array
                of `S_df` row labels that belong to that level.

            Raises
            ------
            Exception
                If `group` is unknown, or if the series in `Y_df` are not the
                rows of `S_df` in the same order.
        """
        if group not in HierarchicalInfo.groups:
            raise Exception(f'group not found {group}')

        root = f'{directory}/hierarchical/'
        file_cache = Path(f'{root}/{group}.p')

        if file_cache.is_file() and cache:
            # NOTE: unpickling can execute arbitrary code; this is only safe for
            # cache files written by this function into a trusted directory.
            Y_df, S_df, tags = pd.read_pickle(file_cache)

            return Y_df, S_df, tags

        HierarchicalData.download(directory)
        group_dir = Path(f'{root}/{group}')
        S_df = pd.read_csv(group_dir / 'agg_mat.csv', index_col=0)
        # Transpose then stack to go from a wide table to one (unique_id, ds, y)
        # row per observation.
        Y_df = pd.read_csv(group_dir / 'data.csv', index_col=0).T
        Y_df = Y_df.stack()
        Y_df.name = 'y'
        Y_df.index = Y_df.index.set_names(['unique_id', 'ds'])
        Y_df = Y_df.reset_index()

        if group == 'Labour':
            # for labour we avoid covid periods
            # (`ds` is presumably still a string here, so this compares text -- confirm)
            Y_df = Y_df.query('ds < "2020-01-01"').reset_index(drop=True)

        # `np.array_equal` also returns False when the two have different
        # lengths, so that case raises the mismatch error below instead of an
        # unrelated length-mismatch error from an elementwise comparison.
        if not np.array_equal(Y_df['unique_id'].unique(), S_df.index.values):
            raise Exception('mismatch order between `Y_df` and `S_df`')

        cls_group = HierarchicalInfo[group]
        tags = dict(zip(cls_group.tags_names, HierarchicalData._get_levels_from_S(S_df)))

        if cache:
            pd.to_pickle((Y_df, S_df, tags), file_cache)

        return Y_df, S_df, tags

    @staticmethod
    def download(directory: str) -> None:
        """
        Download Hierarchical Datasets.

            Parameters
            ----------
            directory: str
                Directory path to download dataset.
        """
        path = f'{directory}/hierarchical/'
        # Nothing is fetched when the target folder already exists, so a
        # partially populated folder is not repaired; delete it to force a
        # fresh download.
        if not Path(path).exists():
            download_file(path, HierarchicalData.source_url, decompress=True)
            download_file(path, HierarchicalData.source_url_old_traffic, decompress=True)
            download_file(path, HierarchicalData.source_url_old_tourisml, decompress=True)

In [None]:
#| hide
from fastcore.test import test_close

In [None]:
#| hide
for group, _ in HierarchicalInfo:
    Y_df, S_df, tags = HierarchicalData.load('./data', group)
    level_members = list(tags.values())
    n_bottom = S_df.shape[1]

    # The rows of each level must add up to the number of bottom series, and
    # the levels together must account for every row of S.
    for members in level_members:
        assert S_df.loc[members].values.sum() == n_bottom
    assert len(S_df) == sum(map(len, level_members)), group

    # Weight each row by its 1-based position inside its level, add the weights
    # up per bottom series, and report whether that vector is non-decreasing.
    weighted_blocks = []
    for members in level_members:
        row_numbers = np.arange(1, len(members) + 1).reshape(-1, 1)
        weighted_blocks.append(S_df.loc[members].values * row_numbers)
    S_hiers = np.vstack(weighted_blocks).sum(axis=0)
    is_strictly_hierarchical = np.array_equal(S_hiers, np.sort(S_hiers))
    print(f'Is {group} strictly hierarchical? {is_strictly_hierarchical}')

    # test S recovers Y_df: the total of every aggregated series must match
    # the total of the bottom series that S marks with a 1 in its row
    for hiers in level_members:
        for ts, bottom_ts in S_df.loc[hiers].iterrows():
            actual_bottom_ts = bottom_ts[bottom_ts == 1].index
            aggregated_total = Y_df.query('unique_id == @ts')['y'].sum()
            bottom_total = Y_df.query('unique_id in @actual_bottom_ts')['y'].sum()
            test_close(aggregated_total, bottom_total)

100%|██████████| 1.30M/1.30M [00:00<00:00, 6.15MiB/s]
INFO:datasetsforecast.utils:Successfully downloaded datasets.zip, 1297274, bytes.
INFO:datasetsforecast.utils:Decompressing zip file...
INFO:datasetsforecast.utils:Successfully decompressed data/hierarchical/datasets.zip
100%|██████████| 335k/335k [00:00<00:00, 3.44MiB/s]
INFO:datasetsforecast.utils:Successfully downloaded OldTraffic.zip, 335471, bytes.
INFO:datasetsforecast.utils:Decompressing zip file...
INFO:datasetsforecast.utils:Successfully decompressed data/hierarchical/OldTraffic.zip
100%|██████████| 968k/968k [00:00<00:00, 5.56MiB/s]
INFO:datasetsforecast.utils:Successfully downloaded OldTourismLarge.zip, 967629, bytes.
INFO:datasetsforecast.utils:Decompressing zip file...
INFO:datasetsforecast.utils:Successfully decompressed data/hierarchical/OldTourismLarge.zip


Is Labour strictly hierarchical? True
Is TourismLarge strictly hierarchical? False
Is TourismSmall strictly hierarchical? True
Is Traffic strictly hierarchical? True
Is Wiki2 strictly hierarchical? True
Is OldTraffic strictly hierarchical? True
Is OldTourismLarge strictly hierarchical? False


In [None]:
#| hide
# Meta information
# Every column is declared up front so that none is created by enlargement;
# a column added that way is padded with NaN and turns float (Horizon2 used to
# display as 12.0).
meta_columns = [
    'Dataset', 'Frequency', 'Series', 'Levels', 'Observations per Series',
    'Test Observations per Series', 'Horizon', 'Horizon2',
]
meta_records = []
for group, cls_group in HierarchicalInfo:
    Y_df, _, tags = HierarchicalData.load('./data', group)
    # `.item()` fails unless every series of the group has the same length.
    n_obs = Y_df.groupby('unique_id').size().unique().item()
    meta_records.append({
        'Dataset': group,
        'Frequency': cls_group.freq,
        'Series': Y_df['unique_id'].nunique(),
        'Levels': len(tags),
        'Observations per Series': n_obs,
        # a quarter of each series is reported as test observations
        'Test Observations per Series': n_obs // 4,
        'Horizon': cls_group.horizon,
        'Horizon2': int(cls_group.horizon2),
    })
# Build the frame once from the collected records instead of row by row.
meta = pd.DataFrame(meta_records, columns=meta_columns).set_index('Dataset')
meta

Unnamed: 0_level_0,Frequency,Series,Levels,Observations per Series,Test Observations per Series,Horizon,Horizon2
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Labour,MS,57,4,503,125,8,12.0
TourismLarge,MS,555,8,228,57,12,12.0
TourismSmall,Q,89,4,36,9,4,4.0
Traffic,D,207,4,366,91,14,7.0
Wiki2,D,199,5,366,91,14,7.0
OldTraffic,D,207,4,366,91,1,1.0
OldTourismLarge,MS,555,8,228,57,12,12.0
