In [None]:
#| default_exp utils

# Aggregation/Visualization Utils

> The `HierarchicalForecast` package contains utility functions to wrangle and visualize 
hierarchical series datasets. The `aggregate` function of the module allows you to create
a hierarchy from categorical variables representing the structure levels, also returning
the aggregation constraints matrix $\mathbf{S}$.

In [None]:
#| export
import sys
import timeit
from itertools import chain
from typing import Callable, Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

plt.rcParams['font.family'] = 'serif'

In [None]:
#| hide
from fastcore.test import test_eq
from nbdev.showdoc import add_docs, show_doc

In [None]:
#| exporti
class CodeTimer:
    """Context manager that measures the wall-clock time of a code block.

    **Parameters:**<br>
    `name`: str, optional label included (quoted) in the printed message.<br>
    `verbose`: bool=True, if True the elapsed time is printed on exit.<br>

    After the `with` block finishes, the elapsed seconds are available in
    the `took` attribute.
    """
    def __init__(self, name=None, verbose=True):
        self.name = " '"  + name + "'" if name else ''
        self.verbose = verbose

    def __enter__(self):
        self.start = timeit.default_timer()
        # Return the timer so `with CodeTimer() as timer:` binds the instance
        # and `timer.took` can be read after the block.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing is returned, so exceptions raised inside the block propagate.
        self.took = (timeit.default_timer() - self.start)
        if self.verbose:
            print('Code block' + self.name + \
                  ' took:\t{0:.5f}'.format(self.took) + ' seconds')

In [None]:
#| exporti
def is_strictly_hierarchical(S: np.ndarray, 
                             tags: Dict[str, np.ndarray]):
    """Checks whether `S` describes a strictly hierarchical structure.

    The levels in `tags` are ordered by their number of series and the largest
    one (the bottom level) is discarded. For every remaining level each column
    of `S` is mapped to the position of its largest entry within that level's
    rows. The structure is considered strict when the number of distinct
    paths across those levels equals the number of nodes of the last
    (largest) remaining level.

    **Parameters:**<br>
    `S`: np.ndarray, summing matrix; its rows are selected with the values of `tags`.<br>
    `tags`: Dict[str, np.ndarray], row indexes of `S` for each level.<br>

    **Returns:**<br>
    `bool`: True if the number of paths equals the number of nodes.
    """
    # Levels from the smallest to the largest, without the bottom level
    by_size = dict(sorted(tags.items(), key=lambda item: len(item[1])))
    by_size.popitem()
    # Categorical encoding of S: one row of (1-based) node codes per level
    node_codes = np.vstack(
        [np.argmax(S[rows], axis=0) + 1 for rows in by_size.values()]
    )
    n_paths = np.unique(node_codes, axis=1).shape[1]
    # Nodes of the level right above the bottom one
    _, last_level_rows = by_size.popitem()
    n_nodes = last_level_rows.size
    return n_paths == n_nodes

In [None]:
#| exporti
def cov2corr(cov, return_std=False):
    """ Converts a covariance matrix into a correlation matrix.

    Every entry of `cov` is divided by the product of the standard
    deviations of its row and its column.

    **Parameters:**<br>
    `cov`: array_like, 2d covariance matrix.<br>
    `return_std`: bool=False, if True the standard deviations are returned too.<br>

    **Returns:**<br>
    `corr`: ndarray (subclass) correlation matrix, or the tuple `(corr, std)`
    when `return_std` is True.
    """
    cov = np.asanyarray(cov)
    # Standard deviations are the square roots of the diagonal (variances)
    std_ = np.sqrt(np.diag(cov))
    scale = np.outer(std_, std_)
    corr = cov / scale
    return (corr, std_) if return_std else corr

# <span style="color:DarkBlue"> Aggregate Function </span>

In [None]:
#| exporti
def _to_summing_matrix(S_df: pd.DataFrame):
    """Transforms the DataFrame `S_df` of hierarchies to a summing matrix S.

    Every column of `S_df` is a level and is one-hot encoded; the encodings
    are stacked so that `S` has one row per category of every level and one
    column per row of `S_df`.

    **Parameters:**<br>
    `S_df`: pd.DataFrame, one column per level holding the category of each row.<br>

    **Returns:**<br>
    `S, tags`: tuple with the summing matrix `S` (pd.DataFrame, float32) and
    `tags`, a dict mapping each column of `S_df` to its unique categories.
    """
    categories = [S_df[col].unique() for col in S_df.columns]
    cat_sizes = [len(cats) for cats in categories]
    # The level with most categories provides the column labels of S.
    # NOTE(review): this assumes each row of `S_df` is a distinct category of
    # that level, in order of appearance - confirm against callers.
    idx_bottom = np.argmax(cat_sizes)
    cats_bottom = categories[idx_bottom]
    encoder_kwargs = dict(categories=categories, dtype=np.float32)
    try:
        # scikit-learn >= 1.2 names the dense-output flag `sparse_output`
        encoder = OneHotEncoder(sparse_output=False, **encoder_kwargs)
    except TypeError:
        # older scikit-learn versions only know the `sparse` flag
        encoder = OneHotEncoder(sparse=False, **encoder_kwargs)
    S = encoder.fit_transform(S_df).T
    S = pd.DataFrame(S, index=list(chain(*categories)), columns=cats_bottom)
    tags = dict(zip(S_df.columns, categories))
    return S, tags

In [None]:
#| export
def aggregate(df: pd.DataFrame,
              spec: List[List[str]],
              agg_fn: Callable = np.sum):
    r"""Utils Aggregation Function.

    Aggregates bottom level series contained in the pd.DataFrame `df` according 
    to levels defined in the `spec` list applying the `agg_fn` (sum, mean).

    The level of `spec` with most columns is used as the bottom level. The
    `unique_id` of every series is built by joining its level values with `'/'`.

    **Parameters:**<br>
    `df`: pd.DataFrame with columns `['ds', 'y']` and columns to aggregate.<br>
    `spec`: List of levels. Each element of the list contains a list of columns of `df` to aggregate.<br>
    `agg_fn`: Function used to aggregate `'y'`.<br>

    **Returns:**<br>
    `Y_df, S, tags`: tuple with hierarchically structured series `Y_df` ($\mathbf{y}_{[a,b]}$),
    summing matrix `S`, and hierarchical aggregation indexes `tags`.

    **Raises:**<br>
    `ValueError`: if `spec` is empty.
    """
    if len(spec) == 0:
        raise ValueError('`spec` must contain at least one level (list of columns).')
    # The level with most columns defines the bottom series
    max_len_idx = np.argmax([len(hier) for hier in spec])
    bottom_comb = spec[max_len_idx]
    df_hiers = []
    for hier in spec:
        # One aggregated series per combination of the level's columns
        df_hier = df.groupby(hier + ['ds'])['y'].apply(agg_fn).reset_index()
        df_hier['unique_id'] = df_hier[hier].agg('/'.join, axis=1)
        if hier == bottom_comb:
            bottom_hier = df_hier['unique_id'].unique()
        df_hiers.append(df_hier)
    df_hiers = pd.concat(df_hiers)
    # Columns that a level does not use are missing after the concat;
    # they are marked with the placeholder 'agg'.
    S_df = (
        df_hiers[['unique_id'] + bottom_comb]
        .drop_duplicates()
        .set_index('unique_id')
        .fillna('agg')
    )
    hiers_cols = []
    for hier in spec:
        # For each series, the identifier of its ancestor at this level
        hier_col = '/'.join(hier) 
        S_df[hier_col] = S_df[hier].agg('/'.join, axis=1)
        hiers_cols.append(hier_col)
    Y_df = df_hiers[['unique_id', 'ds', 'y']].set_index('unique_id')
    
    # Aggregations constraints S definition, built from the bottom series only
    S, tags = _to_summing_matrix(S_df.loc[bottom_hier, hiers_cols])
    return Y_df, S, tags

In [None]:
# Show the documentation of `aggregate` with a level-3 title.
show_doc(aggregate, title_level=3)

In [None]:
#| hide
# Test of `aggregate` on the tourism dataset, read from a remote CSV
# (this cell needs network access).
df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/tourism.csv')
# Rename the target and timestamp columns to the `y` / `ds` names used by `aggregate`
df = df.rename({'Trips': 'y', 'Quarter': 'ds'}, axis=1)
# Constant column shared by every row, used as the first column of every level
df.insert(0, 'Country', 'Australia')
# Levels to aggregate; the last one has the most columns
hiers_grouped = [['Country'],
                 ['Country', 'State'], 
                 ['Country', 'Purpose'], 
                 ['Country', 'State', 'Region'], 
                 ['Country', 'State', 'Purpose'], 
                 ['Country', 'State', 'Region', 'Purpose']]

# `hier_df`, `S` and `tags` are reused by the plotting cells below
hier_df, S, tags = aggregate(df=df, spec=hiers_grouped)
# Expected sizes: 34,000 observations, 425 series in total and an S of shape (425, 304)
test_eq(len(hier_df), 34_000)
test_eq(hier_df.index.nunique(), 425)
test_eq(S.shape, (425, 304))
# The series of `hier_df` and the rows of `S` must follow the same order
test_eq(hier_df.index.unique(), S.index)
# One entry in `tags` per level of the spec
test_eq(len(tags), len(hiers_grouped))

# <span style="color:DarkBlue"> Hierarchical Visualization </span>

In [None]:
#| export
class HierarchicalPlot:
    r""" Hierarchical Plot

    This class contains a collection of matplotlib visualization methods, suited for small
    to medium sized hierarchical series.

    **Parameters:**<br>
    `S`: pd.DataFrame with summing matrix of size `(base, bottom)`, see [aggregate function](https://nixtla.github.io/hierarchicalforecast/utils.html#aggregate).<br>
    `tags`: Dict[str, np.ndarray], with hierarchical aggregation indexes, where 
        each key is a level and its value contains tags associated to that level.<br><br>
    """
    def __init__(self,
                 S: pd.DataFrame,
                 tags: Dict[str, np.ndarray]):
        self.S = S
        self.tags = tags

    @staticmethod
    def _point_columns(cols):
        """Returns the columns of `cols` that are not prediction interval bounds.

        Interval bounds are named `<model>-lo-<level>` / `<model>-hi-<level>`.
        The full `-lo-` / `-hi-` markers are matched so that model names that
        merely contain `lo` or `hi` (e.g. `...method-closed`) are kept.
        """
        return [col for col in cols if ('-lo-' not in col and '-hi-' not in col)]

    @staticmethod
    def _palette(n_colors):
        """Returns `n_colors` colors of the `tab10` colormap, cycling after the tenth."""
        # `plt.get_cmap` is used because `plt.cm.get_cmap` was removed from
        # recent matplotlib releases.
        cmap = plt.get_cmap("tab10", 10)
        return [cmap(i % 10) for i in range(n_colors)]

    @staticmethod
    def _fill_intervals(ax, df_plot, cols, level, colors):
        """Shades on `ax` the prediction intervals of `cols` available in `df_plot`."""
        if level is None:
            return
        for lv in level:
            for col in cols:
                if col == 'y':
                    # we dont need intervals
                    # for the actual value
                    continue
                lo_col, hi_col = f'{col}-lo-{lv}', f'{col}-hi-{lv}'
                if lo_col not in df_plot.columns or hi_col not in df_plot.columns:
                    # if model
                    # doesnt have levels
                    continue
                ax.fill_between(
                    df_plot.index, 
                    df_plot[lo_col], 
                    df_plot[hi_col],
                    # higher levels (wider intervals) are more transparent
                    alpha=1 - lv / 100,
                    color=colors[col],
                    label=f'{col}_level_{lv}'
                )

    def plot_summing_matrix(self):
        r""" Summation Constraints plot
        
        This method simply plots the hierarchical aggregation
        constraints matrix $\mathbf{S}$.
        """
        plt.figure(num=1, figsize=(4, 6), dpi=80, facecolor='w')
        plt.spy(self.S)
        plt.show()
        plt.close()

    def plot_series(self,
                    series: str,
                    Y_df: Optional[pd.DataFrame] = None,
                    models: Optional[List[str]] = None,
                    level: Optional[List[int]] = None):
        r""" Single Series plot

        **Parameters:**<br>
        `series`: str, string identifying the `'unique_id'` any-level series to plot.<br>
        `Y_df`: pd.DataFrame, hierarchically structured series ($\mathbf{y}_{[a,b]}$). 
                It is indexed by `'unique_id'` and contains columns `['ds', 'y']`,
                it may have `'models'`.<br>
        `models`: List[str], string identifying filtering model columns.<br>
        `level`: float list 0-100, confidence levels for prediction intervals available in `Y_df`.<br>

        **Returns:**<br>
        Single series plot with filtered models and prediction interval level.<br><br>
        """
        if series not in self.S.index:
            raise Exception(f'time series {series} not found')
        if isinstance(models, str):
            # a single model name is accepted as a one element list
            models = [models]
        fig, ax = plt.subplots(1, 1, figsize = (20, 7))
        # `.loc[[series]]` returns a DataFrame even when the series has one row
        df_plot = Y_df.loc[[series]].set_index('ds')
        cols = models if models is not None else df_plot.columns
        cols_wo_levels = self._point_columns(cols)
        cmap = self._palette(len(cols_wo_levels))
        cmap_dict = dict(zip(cols_wo_levels, cmap))
        df_plot[cols_wo_levels].plot(ax=ax, linewidth=2, color=cmap)
        self._fill_intervals(ax, df_plot, cols_wo_levels, level, cmap_dict)
        ax.set_title(f'{series} Forecast', fontsize=22)
        ax.set_xlabel('Timestamp [t]', fontsize=20)
        ax.legend(prop={'size': 15})
        ax.grid()
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(20)
                    
    def plot_hierarchically_linked_series(self,
                                          bottom_series: str,
                                          Y_df: Optional[pd.DataFrame] = None,
                                          models: Optional[List[str]] = None,
                                          level: Optional[List[int]] = None):
        r""" Hierarchically Linked Series plot

        **Parameters:**<br>
        `bottom_series`: str, string identifying the `'unique_id'` bottom-level series to plot.<br>
        `Y_df`: pd.DataFrame, hierarchically structured series ($\mathbf{y}_{[a,b]}$). 
                It is indexed by `'unique_id'` and contains columns ['ds', 'y'] and models. <br>
        `models`: List[str], string identifying filtering model columns.<br>
        `level`: float list 0-100, confidence levels for prediction intervals available in `Y_df`.<br>

        **Returns:**<br>
        Collection of hierarchically linked series plots associated with the `bottom_series`
        and filtered models and prediction interval level.<br><br>
        """
        if bottom_series not in self.S.columns:
            raise Exception(f'bottom time series {bottom_series} not found')
        if isinstance(models, str):
            # a single model name is accepted as a one element list
            models = [models]
        # Rows of S with a 1 in the column of `bottom_series`
        linked_series = self.S[bottom_series].loc[lambda x: x == 1.].index
        # `squeeze=False` always returns an array of axes, even for one subplot
        fig, axs = plt.subplots(len(linked_series), 1,
                                figsize=(20, 2 * len(linked_series)),
                                squeeze=False)
        axs = axs.ravel()
        cols = models if models is not None else Y_df.drop(['ds'], axis=1).columns
        cols_wo_levels = self._point_columns(cols)
        cmap = self._palette(len(cols_wo_levels))
        cmap_dict = dict(zip(cols_wo_levels, cmap))
        for idx, series in enumerate(linked_series):
            df_plot = Y_df.loc[[series]].set_index('ds')
            df_plot[cols_wo_levels].plot(ax=axs[idx], linewidth=2, color=cmap)
            self._fill_intervals(axs[idx], df_plot, cols_wo_levels, level, cmap_dict)
            axs[idx].set_title(f'{series}', fontsize=10)
            axs[idx].grid()
            axs[idx].get_xaxis().label.set_visible(False)
            # a single legend, shared by all the subplots, is added to the figure below
            axs[idx].legend().set_visible(False)
            for label in (axs[idx].get_xticklabels() + axs[idx].get_yticklabels()):
                label.set_fontsize(10)
        plt.subplots_adjust(hspace=0.4)
        handles, labels = axs[0].get_legend_handles_labels()
        kwargs = dict(loc='lower center', 
                      prop={'size': 10}, 
                      bbox_to_anchor=(0, 0.05, 1, 1))
        # NOTE(review): the Python version presumably acts as a proxy for a
        # matplotlib release that supports the `ncols` keyword - confirm.
        if sys.version_info.minor > 7:
            kwargs['ncols'] = max(2, int(np.ceil(len(labels) / 2)))
        fig.legend(handles, labels, **kwargs)

    def plot_hierarchical_predictions_gap(self,
                                          Y_df: pd.DataFrame,
                                          models: Optional[List[str]] = None,
                                          xlabel: Optional[str] = None,
                                          ylabel: Optional[str] = None,
                                          ):
        r""" Hierarchically Predictions Gap plot

        **Parameters:**<br>
        `Y_df`: pd.DataFrame, hierarchically structured series ($\mathbf{y}_{[a,b]}$). 
                It is indexed by `'unique_id'` and contains the column 'ds', the models
                and optionally 'y'. <br>
        `models`: List[str], string identifying filtering model columns.<br>
        `xlabel`: str, string for the plot's x axis label.<br>
        `ylabel`: str, string for the plot's y axis label.<br>

        **Returns:**<br>
        Plots of aggregated predictions at different levels of the hierarchical structure.
        The aggregation is performed according to the tag levels see 
        [aggregate function](https://nixtla.github.io/hierarchicalforecast/utils.html).<br><br>
        """
        # Parse predictions dataframe
        horizon_dates = Y_df['ds'].unique()
        if models is not None:
            cols = models
        else:
            # 'y' is optional, so it is only dropped when present
            cols = Y_df.drop(columns=['ds', 'y'], errors='ignore').columns
        Y_models = Y_df[cols]
        
        # Plot predictions across tag levels
        fig, ax = plt.subplots(figsize=(8, 5))
        
        if 'y' in Y_df.columns:
            # The row of S with the largest sum is taken as the top series
            idx_top = self.S.sum(axis=1).idxmax()
            y_plot = Y_df.loc[idx_top].y.values
            ax.plot(horizon_dates, y_plot, label='True')

        for tag in self.tags:
            # Sum of the predictions of all the series of the level
            y_plot = sum([Y_models.loc[Y_df.index == idx].values
                          for idx in self.tags[tag]])
            ax.plot(horizon_dates, y_plot, label=f'Level: {tag}')

        ax.set_title('Predictions Accumulated Difference')
        if ylabel is not None:
            ax.set_ylabel(ylabel)
        if xlabel is not None:
            ax.set_xlabel(xlabel)

        ax.legend()
        ax.grid()
        plt.show()

In [None]:
# Show the documentation of the `HierarchicalPlot` class with a level-3 title.
show_doc(HierarchicalPlot, title_level=3)

In [None]:
# Show the documentation of `HierarchicalPlot.plot_summing_matrix`.
show_doc(HierarchicalPlot.plot_summing_matrix, 
         name='plot_summing_matrix', title_level=3)

In [None]:
# Show the documentation of `HierarchicalPlot.plot_series`.
show_doc(HierarchicalPlot.plot_series, 
         name='plot_series', title_level=3)

In [None]:
# Show the documentation of `HierarchicalPlot.plot_hierarchically_linked_series`.
show_doc(HierarchicalPlot.plot_hierarchically_linked_series, 
         name='plot_hierarchically_linked_series', title_level=3)

In [None]:
# Show the documentation of `HierarchicalPlot.plot_hierarchical_predictions_gap`.
show_doc(HierarchicalPlot.plot_hierarchical_predictions_gap,
         name='plot_hierarchical_predictions_gap', title_level=3)

In [None]:
#| hide
# Smoke test: plot the summing matrix returned by the `aggregate` test cell.
# `hplots` is reused by the plotting cells below.
hplots = HierarchicalPlot(S=S, tags=tags)
hplots.plot_summing_matrix()

In [None]:
#| hide
# Synthetic model for the plotting tests: the forecast is 1.1 times the
# actual value. NOTE: the columns are added to `hier_df` in place.
hier_df['Model'] = hier_df['y'] * 1.1
# Synthetic intervals: +/-10% of the forecast for level 80, +/-20% for level 90
hier_df['Model-lo-80'] = hier_df['Model'] - 0.1 * hier_df['Model']
hier_df['Model-hi-80'] = hier_df['Model'] + 0.1 * hier_df['Model']
hier_df['Model-lo-90'] = hier_df['Model'] - 0.2 * hier_df['Model']
hier_df['Model-hi-90'] = hier_df['Model'] + 0.2 * hier_df['Model']
# Smoke test: single series plot with both prediction interval levels
hplots.plot_series(
    series='Australia', 
    Y_df=hier_df,
    level=[80, 90]
)

In [None]:
#| hide
# Smoke test: single series plot without prediction intervals (`level` is not passed)
hplots.plot_series(series='Australia', 
                   Y_df=hier_df)

In [None]:
#| hide
# Smoke test: linked series plot for one bottom series, with both
# prediction interval levels
hplots.plot_hierarchically_linked_series(
    bottom_series='Australia/Western Australia/Experience Perth/Visiting', 
    Y_df=hier_df,
    level=[80, 90]
)

In [None]:
#| hide
# Smoke test: linked series plot without prediction intervals (`level` is not passed)
hplots.plot_hierarchically_linked_series(
    bottom_series='Australia/Western Australia/Experience Perth/Visiting', 
    Y_df=hier_df,
)

In [None]:
#| hide
# Test series with just one value:
# `tail(1)` keeps only the last row of every `unique_id` group.
hplots.plot_hierarchically_linked_series(
    bottom_series='Australia/Western Australia/Experience Perth/Visiting', 
    Y_df=hier_df.groupby('unique_id').tail(1),
)

In [None]:
#| hide
# Smoke test: predictions gap plot when `Y_df` has no 'y' column and the
# model columns are given explicitly.
hplots.plot_hierarchical_predictions_gap(Y_df=hier_df.drop(columns='y'), models=['Model'])

In [None]:
#| eval: false
from statsforecast.core import StatsForecast
from statsforecast.models import AutoARIMA, ETS, Naive
from datasetsforecast.hierarchical import HierarchicalData

# Load the 'Labour' hierarchical dataset (stored under ./data)
Y_df, S, tags = HierarchicalData.load('./data', 'Labour')
Y_df['ds'] = pd.to_datetime(Y_df['ds'])

# Hold out the last 24 observations of every series as the test set
Y_test_df  = Y_df.groupby('unique_id').tail(24)
Y_train_df = Y_df.drop(Y_test_df.index)
Y_test_df  = Y_test_df.set_index('unique_id')
Y_train_df = Y_train_df.set_index('unique_id')

fcst = StatsForecast(
    df=Y_train_df, 
    # Alternative models: [AutoARIMA(season_length=12), Naive()]
    models=[ETS(season_length=12, model='AAZ')],
    freq='MS', 
    n_jobs=-1
)
Y_hat_df = fcst.forecast(h=24)

# Plot prediction difference of different aggregation
# Levels Country, Country/Region, Country/Gender/Region ...
hplots = HierarchicalPlot(S=S, tags=tags)

# `models` is declared as a list of model column names
hplots.plot_hierarchical_predictions_gap(
    Y_df=Y_hat_df, models=['ETS'],
    xlabel='Month', ylabel='Predictions',
)