In [None]:
#| default_exp evaluation

# Hierarchical Evaluation 

> Module for Hierarchical Evaluation.

In [None]:
#| export
from inspect import signature
from typing import Callable, Dict, List, Optional

import numpy as np
import pandas as pd

In [None]:
#| hide
from fastcore.test import test_close, test_fail
from nbdev.showdoc import add_docs, show_doc

In [None]:
#| export
class HierarchicalEvaluation:
    """Hierarchical Evaluation Class.
    [Source code](https://github.com/dluuo/hierarchicalforecast/blob/main/hierarchicalforecast/evaluation.py).

    Applies a list of accuracy metrics to forecasts, once over all series
    and once per hierarchical level.

    **Parameters:**<br>
    `evaluators`: functions with arguments `y`, `y_hat` and, optionally, `y_insample`.<br>
    """
    def __init__(
            self, 
            evaluators: List[Callable] 
        ):
        self.evaluators = evaluators
        
    def evaluate(
            self, 
            Y_h: pd.DataFrame, 
            Y_test: pd.DataFrame, 
            tags: Dict[str, np.ndarray], 
            Y_df: Optional[pd.DataFrame] = None,
            benchmark: Optional[str] = None 
        ):
        """Hierarchical Evaluation Method.
        [Source code](https://github.com/dluuo/hierarchicalforecast/blob/main/hierarchicalforecast/evaluation.py).

        Every evaluator is called with arrays of shape `(n_series, h)`, where `h`
        is the number of rows of the first series in `Y_h`. All series are
        expected to have `h` rows each.

        **Parameters:**<br>
        `Y_h`: Forecasts with columns `['ds']` and models to evaluate, indexed by the series ids listed in `tags`. A `'y'` column, if present, is not treated as a model.<br>
        `Y_test`: True values with columns `['ds', 'y']`, indexed like `Y_h`.<br>
        `tags`: Each key is a level and its value contains tags associated to that level.<br>
        `Y_df`: Training set of base time series with columns `['ds', 'y']` indexed by `unique_id`. Required when an evaluator takes a `y_insample` argument.<br>
        `benchmark`: If passed, evaluators are scaled by the error of this benchmark (a column of `Y_h`) and metric names get the suffix `-scaled`.<br>
        
        **Returns:**<br>
        `evaluation`: pd.DataFrame with accuracy measurements across hierarchical levels, indexed by `(level, metric)` with one column per model.

        **Raises:**<br>
        `Exception`: if an evaluator needs `y_insample` and `Y_df` was not passed.
        """
        drop_cols = ['ds', 'y'] if 'y' in Y_h.columns else ['ds']
        # Horizon = number of rows that belong to the first series. Counting
        # index matches stays correct for h == 1, whereas `len(Y_h.loc[label])`
        # would return the number of columns (a scalar label that matches a
        # single row yields a Series).
        h = int((Y_h.index == Y_h.index[0]).sum())
        model_names = Y_h.drop(columns=drop_cols).columns.to_list()
        fn_names = [fn.__name__ for fn in self.evaluators]
        # Inspect each evaluator's signature once instead of once per level.
        needs_insample = ['y_insample' in signature(fn).parameters for fn in self.evaluators]
        has_y_insample = any(needs_insample)
        if has_y_insample and Y_df is None:
            raise Exception('At least one evaluator needs y insample, please pass `Y_df`')
        if benchmark is not None:
            fn_names = [f'{fn_name}-scaled' for fn_name in fn_names]
        # 'Overall' pools the series of every level; a user-supplied 'Overall'
        # key in `tags` takes precedence over the pooled one.
        tags_ = {'Overall': np.concatenate(list(tags.values())), **tags}
        index = pd.MultiIndex.from_product([list(tags_), fn_names], names=['level', 'metric'])
        evaluation = pd.DataFrame(columns=model_names, index=index)
        # The wide (series x ds) in-sample table does not depend on the level,
        # so it is built a single time.
        insample_wide = Y_df.pivot(columns='ds', values='y') if has_y_insample else None
        eps = np.finfo(float).eps
        for level, cats in tags_.items():
            Y_h_cats = Y_h.loc[cats]
            y_test_cats = Y_test.loc[cats, 'y'].values.reshape(-1, h)
            y_insample = insample_wide.loc[cats].values if has_y_insample else None
            if benchmark is not None:
                y_hat_benchmark = Y_h_cats[benchmark].values.reshape(-1, h)
            for fn, fn_name, with_insample in zip(self.evaluators, fn_names, needs_insample):
                kwargs = {'y_insample': y_insample} if with_insample else {}
                if benchmark is not None:
                    # The benchmark error is the same for every model of this
                    # (level, metric) pair, so it is computed once.
                    benchmark_loss = fn(y_test_cats, y_hat_benchmark, **kwargs)
                for model in model_names:
                    loss = fn(y_test_cats, Y_h_cats[model].values.reshape(-1, h), **kwargs)
                    if benchmark is not None:
                        scale = benchmark_loss
                        if np.isclose(scale, 0., atol=eps):
                            # Avoid dividing by (almost) zero ...
                            scale = scale + eps
                            # ... and, when the model error is also negligible,
                            # report the raw loss instead of a ratio of noise.
                            if np.isclose(scale, loss, atol=1e-8):
                                scale = 1.
                        loss = loss / scale
                    evaluation.loc[(level, fn_name), model] = loss
        return evaluation

In [None]:
#| hide
# Register short summaries for the class and its `evaluate` method.
# NOTE(review): `add_docs` presumably assigns these strings as the docstrings of
# `HierarchicalEvaluation` and `evaluate`, superseding the longer docstrings
# written in the class body — confirm that this is intended.
add_docs(HierarchicalEvaluation, "Evaluate reconciliation methods.",
         evaluate="Evaluate reconciliation methods for distinct levels.")

In [None]:
# Display the documentation of the `HierarchicalEvaluation` class.
show_doc(HierarchicalEvaluation)

In [None]:
# Display the documentation of the `HierarchicalEvaluation.evaluate` method.
show_doc(HierarchicalEvaluation.evaluate)

You can use your own metrics to evaluate the performance of each level in the structure. Each metric receives `y` and `y_hat` as arguments, both numpy arrays of shape `(series, horizon)`. Consider, for example, the function `rmse`, which computes the root mean squared error of each series and averages it across series.

In [None]:
def rmse(y, y_hat):
    """Average across series of the per-series root mean squared error.

    `y` and `y_hat` are 2-D arrays of the same shape; the squared errors are
    averaged along axis 1 before the square root is taken.
    """
    squared_errors = np.square(y - y_hat)
    per_series_rmse = np.sqrt(squared_errors.mean(axis=1))
    return per_series_rmse.mean()

Additionally, you can use metrics that depend on in-sample values, such as `mase` (mean absolute scaled error). In this case your function must also accept the argument `y_insample`, a numpy array of shape `(series, insample_size)`. For example,

In [None]:
def mase(y, y_hat, y_insample, seasonality=4):
    """Mean absolute scaled error, averaged across series.

    The mean absolute forecast error of each series is divided by the mean
    absolute difference between in-sample values `seasonality` steps apart.
    """
    seasonal_diffs = y_insample[:, seasonality:] - y_insample[:, :-seasonality]
    naive_mae = np.abs(seasonal_diffs).mean(axis=1)
    forecast_mae = np.abs(y - y_hat).mean(axis=1)
    return np.mean(forecast_mae / naive_mae)

In [None]:
#| hide
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.methods import BottomUp, MinTrace, ERM
from hierarchicalforecast.utils import aggregate
# Load the tourism dataset and rename its columns to the `ds` / `y` convention.
df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/tourism.csv')
df = df.rename({'Trips': 'y', 'Quarter': 'ds'}, axis=1)
df.insert(0, 'Country', 'Australia')

# non strictly hierarchical structure
hiers_grouped = [
    ['Country'],
    ['Country', 'State'], 
    ['Country', 'Purpose'], 
    ['Country', 'State', 'Region'], 
    ['Country', 'State', 'Purpose'], 
    ['Country', 'State', 'Region', 'Purpose']
]
# strictly hierarchical structure
hiers_strictly = [
    ['Country'],
    ['Country', 'State'], 
    ['Country', 'State', 'Region'], 
]

# getting df
hier_grouped_df, S_grouped, tags_grouped = aggregate(df, hiers_grouped)

# split train/test
hier_grouped_df['y_model'] = hier_grouped_df['y']
# we should be able to recover y using the methods
hier_grouped_df_h = hier_grouped_df.groupby('unique_id').tail(12)
ds_h = hier_grouped_df_h['ds'].unique()
# `.copy()` makes the training frame independent of the filtered selection, so
# the in-place update below cannot trigger a SettingWithCopyWarning.
hier_grouped_df = hier_grouped_df.query('~(ds in @ds_h)').copy()
# adding noise to `y_model` to avoid perfectly fitted values; the generator is
# seeded so that the hidden test is reproducible on every run
rng = np.random.default_rng(0)
hier_grouped_df['y_model'] += rng.uniform(-1, 1, len(hier_grouped_df))

# hierarchical reconciliation
hrec = HierarchicalReconciliation(reconcilers=[
    #these methods should reconstruct the original y
    BottomUp(),
    MinTrace(method='ols'),
    MinTrace(method='wls_struct'),
    MinTrace(method='wls_var'),
    MinTrace(method='mint_shrink'),
    # ERM recovers but needs bigger eps
    ERM(method='reg_bu', lambda_reg=None),
])
reconciled = hrec.reconcile(hier_grouped_df_h, hier_grouped_df, S_grouped, tags_grouped)

In [None]:
#| hide
def mse(y, y_hat):
    """Mean squared error over all entries of `y` and `y_hat`."""
    residuals = y - y_hat
    return np.mean(np.square(residuals))
def rmse(y, y_hat):
    """Root mean squared error: the square root of `mse(y, y_hat)`."""
    mean_squared_error = mse(y, y_hat)
    return np.sqrt(mean_squared_error)
# Score every model column with MSE and RMSE, scaled by the `y_model` benchmark.
evaluator = HierarchicalEvaluation(evaluators=[mse, rmse])
evaluator.evaluate(
    reconciled.drop(columns='y'),
    reconciled[['ds', 'y']],
    tags_grouped,
    benchmark='y_model',
)

In [None]:
#| hide
def mase(y, y_hat, y_insample, seasonality=4):
    """Mean absolute scaled error, averaged across series.

    Each series' mean absolute error is scaled by the mean absolute
    `seasonality`-step difference of its in-sample values.
    """
    abs_forecast_error = np.abs(y - y_hat)
    abs_seasonal_change = np.abs(y_insample[:, seasonality:] - y_insample[:, :-seasonality])
    scaled_errors = abs_forecast_error.mean(axis=1) / abs_seasonal_change.mean(axis=1)
    return np.mean(scaled_errors)
# Score every model column with MASE (needs the training set for `y_insample`),
# scaled by the `y_model` benchmark.
evaluator = HierarchicalEvaluation(evaluators=[mase])
evaluator.evaluate(
    reconciled.drop(columns='y'),
    reconciled[['ds', 'y']],
    tags_grouped,
    Y_df=hier_grouped_df,
    benchmark='y_model',
)