In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp evaluation

# Evaluation
> Model performance evaluation

In [None]:
#| export
import inspect
import re
import reprlib
from typing import Callable, Dict, List, Optional, get_origin

import numpy as np
import pandas as pd

import utilsforecast.processing as ufp
from utilsforecast.compat import DataFrame, pl

In [None]:
#| exporti
def _function_name(f: Callable):
    """Return the `__name__` of `f`, looking through partial-like wrappers.

    Objects that expose a `func` attribute (e.g. `functools.partial`) report the
    name of the callable they wrap instead of their own.
    """
    target = f.func if hasattr(f, 'func') else f
    return target.__name__

def _quantiles_from_levels(level: List[int]) -> np.ndarray:
    """Return the quantiles that bound the prediction intervals in `level`.

    Every level `lv` contributes a lower quantile `(100 - lv) / 200` and an upper
    quantile `1 - (100 - lv) / 200`. The levels are sorted first, so the output is
    laid out as the lower quantiles from the highest level to the lowest, followed
    by the upper quantiles from the lowest level to the highest, regardless of the
    order in which `level` was given.
    """
    ascending = sorted(level)
    lower = [(100 - lv) / 200 for lv in reversed(ascending)]
    upper = [1 - (100 - lv) / 200 for lv in ascending]
    return np.array(lower + upper)

def _models_from_levels(model_name: str, level: List[int]) -> List[str]:
    """Return the prediction interval columns of `model_name` for `level`.

    The columns are named `{model_name}-lo-{lv}` and `{model_name}-hi-{lv}` and are
    ordered as: lower bounds from the highest level to the lowest, then upper bounds
    from the lowest level to the highest.

    `level` is sorted first so that the column order does not depend on the order
    the caller used. `_quantiles_from_levels` sorts its input as well, so the i-th
    column returned here always corresponds to the i-th quantile it produces.
    """
    level = sorted(level)
    cols = [f'{model_name}-lo-{lv}' for lv in reversed(level)]
    cols.extend(f'{model_name}-hi-{lv}' for lv in level)
    return cols

In [None]:
#| export
def evaluate(
    df: DataFrame,
    metrics: List[Callable],
    models: Optional[List[str]] = None,
    train_df: Optional[DataFrame] = None,
    level: Optional[List[int]] = None,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
    target_col: str = 'y',
    reduce_stat: Optional[str] = None,
) -> DataFrame:
    """Evaluate forecast using different metrics.

    Each metric is dispatched according to its signature:

    * a `train_df` argument receives the (sorted) training set,
    * a `q` argument, or `models` annotated as `Dict[str, str]`, is evaluated once
      per interval bound, i.e. twice (lo/hi) for every value in `level`,
    * a `quantiles` argument is evaluated once with all the quantiles implied by
      `level`,
    * a `level` argument is evaluated once per value in `level`,
    * anything else is evaluated once on the point forecasts.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Forecasts to evaluate.
        Must have `id_col`, `time_col`, `target_col` and models' predictions.
    metrics : list of callable
        Functions with arguments `df`, `models`, `id_col`, `target_col` and optionally `train_df`.
    models : list of str, optional (default=None)
        Names of the models to evaluate.
        If `None` will use every column in the dataframe after removing id, time and target
        (columns that look like prediction intervals, e.g. `model-lo-80`, are skipped as well).
    train_df : pandas or polars DataFrame, optional (default=None)
        Training set. Used to evaluate metrics such as `mase`.
    level : list of int, optional (default=None)
        Prediction interval levels. Used to compute losses that rely on quantiles.
    id_col : str (default='unique_id')
        Column that identifies each series.
    time_col : str (default='ds')
        Column that identifies each timestep, its values can be timestamps or integers.
    target_col : str (default='y')
        Column that contains the target.
    reduce_stat : str, optional (default=None)
        Statistic to compute on the scores by id to reduce them to a single number.

    Returns
    -------
    pandas or polars DataFrame
        Metrics with one row per (id, metric) combination and one column per model.
        If `reduce_stat` is not `None`, there is only one row per metric.

    Raises
    ------
    ValueError
        If `level` is set and `df` lacks any of the required interval columns,
        if a metric needs `level` and it wasn't provided,
        if a metric needs `train_df` and it wasn't provided,
        or if `train_df` doesn't contain every series in `df`.
    """
    if models is None:
        model_cols = [
            c for c in df.columns
            if c not in [id_col, time_col, target_col]
            and not re.search(r'-(?:lo|hi)-\d+', c)
        ]
    else:
        model_cols = models

    # inspect every metric once, its signature drives all of the dispatching below
    metrics_params = [inspect.signature(m).parameters for m in metrics]

    # interval cols
    if level is not None:
        expected_cols = {
            f'{m}-{side}-{lvl}'
            for m in model_cols
            for side in ('lo', 'hi')
            for lvl in level
        }
        missing = expected_cols - set(df.columns)
        if missing:
            raise ValueError(
                f"The following columns are required for level={level} "
                f"and are missing: {missing}"
            )
    else:
        # every branch of the dispatch below that iterates over `level` must be
        # caught here, not only the metrics whose `models` is annotated as a dict
        # (e.g. a metric taking `models: List[str]` together with `level: int`)
        requires_level = [
            _function_name(m)
            for m, params in zip(metrics, metrics_params)
            if any(arg in params for arg in ('q', 'quantiles', 'level'))
            or (
                'models' in params
                and get_origin(params['models'].annotation) is dict
            )
        ]
        if requires_level:
            raise ValueError(
                f"The following metrics require setting `level`: {requires_level}"
            )

    # y_train
    # tracked per metric (not per name) so that callables sharing a name,
    # e.g. lambdas or partials of the same function, can't overwrite each other
    requires_y_train = ['train_df' in params for params in metrics_params]
    y_train_metrics = [
        _function_name(m) for m, required in zip(metrics, requires_y_train) if required
    ]
    if y_train_metrics:
        if train_df is None:
            raise ValueError(
                f'The following metrics require y_train: {y_train_metrics}. '
                'Please provide `train_df`.'
            )
        if isinstance(train_df, pd.DataFrame):
            train_df = train_df.sort_values([id_col, time_col])
        else:
            train_df = train_df.sort([id_col, time_col])
        missing_series = set(df[id_col].unique()) - set(train_df[id_col].unique())
        if missing_series:
            raise ValueError(
                f"The following series are missing from the train_df: {reprlib.repr(missing_series)}"
            )

    results_per_metric = []
    for metric, metric_params, needs_y_train in zip(metrics, metrics_params, requires_y_train):
        metric_name = _function_name(metric)
        kwargs = dict(
            df=df,
            models=model_cols,
            id_col=id_col,
            target_col=target_col
        )
        if needs_y_train:
            kwargs['train_df'] = train_df
        models_param = metric_params.get('models')
        # compare with `==`, identity of typing aliases is only an implementation detail
        takes_single_quantile = 'q' in metric_params or (
            models_param is not None and models_param.annotation == Dict[str, str]
        )
        if takes_single_quantile:
            assert level is not None  # guaranteed by the level check above
            for lvl in level:
                quantiles = _quantiles_from_levels([lvl])
                for q, side in zip(quantiles, ['lo', 'hi']):
                    kwargs['models'] = {model: f'{model}-{side}-{lvl}' for model in model_cols}
                    if 'q' in metric_params:
                        # metrics without a `q` argument (e.g. calibration) only need
                        # the predictions for q, not the value itself
                        kwargs['q'] = q
                    result = metric(**kwargs)
                    result = ufp.assign_columns(result, 'metric', f'{metric_name}_q{q}')
                    results_per_metric.append(result)
        elif 'quantiles' in metric_params:
            assert level is not None  # guaranteed by the level check above
            # sort once so that the quantiles and the interval columns of each model
            # are aligned position by position, whatever order `level` came in
            sorted_level = sorted(level)
            kwargs['quantiles'] = _quantiles_from_levels(sorted_level)
            kwargs['models'] = {
                model: _models_from_levels(model, sorted_level) for model in model_cols
            }
            result = metric(**kwargs)
            result = ufp.assign_columns(result, 'metric', metric_name)
            results_per_metric.append(result)
        elif 'level' in metric_params:
            assert level is not None  # guaranteed by the level check above
            for lvl in level:
                kwargs['level'] = lvl
                result = metric(**kwargs)
                result = ufp.assign_columns(result, 'metric', f'{metric_name}_level{lvl}')
                results_per_metric.append(result)
        else:
            result = metric(**kwargs)
            result = ufp.assign_columns(result, 'metric', metric_name)
            results_per_metric.append(result)
    if isinstance(df, pd.DataFrame):
        df = pd.concat(results_per_metric).reset_index(drop=True)
    else:
        df = pl.concat(results_per_metric, how="diagonal")
    # put the identifiers first, followed by one column per model
    id_cols = [id_col, "metric"]
    model_cols = [c for c in df.columns if c not in id_cols]
    df = df[id_cols + model_cols]
    if reduce_stat is not None:
        df = ufp.group_by_agg(
            df,
            by='metric',
            aggs={m: reduce_stat for m in model_cols},
            maintain_order=True,
        )
    return df

In [None]:
#| hide
from nbdev import show_doc

In [None]:
# Render the documentation of `evaluate` (signature and parameter table).
show_doc(evaluate)

---

[source](https://github.com/Nixtla/utilsforecast/blob/main/utilsforecast/evaluation.py#L43){target="_blank" style="float:right; font-size:smaller"}

### evaluate

>      evaluate
>                (df:Union[pandas.core.frame.DataFrame,polars.dataframe.frame.Da
>                taFrame], metrics:List[Callable],
>                models:Optional[List[str]]=None, train_df:Union[pandas.core.fra
>                me.DataFrame,polars.dataframe.frame.DataFrame,NoneType]=None,
>                level:Optional[List[int]]=None, id_col:str='unique_id',
>                time_col:str='ds', target_col:str='y',
>                reduce_stat:Optional[str]=None)

*Evaluate forecast using different metrics.*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df | Union |  | Forecasts to evaluate.<br>Must have `id_col`, `time_col`, `target_col` and models' predictions. |
| metrics | List |  | Functions with arguments `df`, `models`, `id_col`, `target_col` and optionally `train_df`. |
| models | Optional | None | Names of the models to evaluate.<br>If `None` will use every column in the dataframe after removing id, time and target. |
| train_df | Union | None | Training set. Used to evaluate metrics such as `mase`. |
| level | Optional | None | Prediction interval levels. Used to compute losses that rely on quantiles. |
| id_col | str | unique_id | Column that identifies each serie. |
| time_col | str | ds | Column that identifies each timestep, its values can be timestamps or integers. |
| target_col | str | y | Column that contains the target. |
| reduce_stat | Optional | None | Statistic to compute on the scores by id to reduce them to a single number. |
| **Returns** | **Union** |  | **Metrics with one row per (id, metric) combination and one column per model.<br>If `reduce_stat` is not `None`, there is only one row per metric.** |

In [None]:
from functools import partial

import numpy as np
import pandas as pd

from utilsforecast.losses import *
from utilsforecast.data import generate_series

In [None]:
# Sample data with the forecasts of two models and their 80/95 prediction intervals
# (the first argument is presumably the number of series -- confirm in generate_series).
series = generate_series(10, n_models=2, level=[80, 95])

In [None]:
# Store the series identifiers as integers.
series['unique_id'] = series['unique_id'].astype('int')

In [None]:
# Model columns to score and the metrics to compute for each of them.
models = ['model0', 'model1']
metrics = [
    mae,
    mse,
    rmse,
    mape,
    smape,
    # fix the seasonality up front; `evaluate` doesn't forward extra arguments
    partial(mase, seasonality=7),
    quantile_loss,
    mqloss,
    coverage,
    calibration,
    scaled_crps,
]

In [None]:
# Score every model with every metric, one row per (series, metric).
# The same frame doubles as the training set for the metrics that take `train_df`.
evaluation = evaluate(
    series,
    metrics=metrics,
    models=models,
    train_df=series,
    level=[80, 95],
)
evaluation

Unnamed: 0,unique_id,metric,model0,model1
0,0,mae,0.158108,0.163246
1,1,mae,0.160109,0.143805
2,2,mae,0.159815,0.170510
3,3,mae,0.168537,0.161595
4,4,mae,0.170182,0.163329
...,...,...,...,...
175,5,scaled_crps,0.034202,0.035472
176,6,scaled_crps,0.034880,0.033610
177,7,scaled_crps,0.034337,0.034745
178,8,scaled_crps,0.033336,0.032459


In [None]:
# Average each metric across the series to get a single score per model.
summary = evaluation.drop(columns='unique_id').groupby('metric').mean().reset_index()
summary

Unnamed: 0,metric,model0,model1
0,calibration_q0.025,0.0,0.0
1,calibration_q0.1,0.0,0.0
2,calibration_q0.9,0.833993,0.815833
3,calibration_q0.975,0.853991,0.836949
4,coverage_level80,0.833993,0.815833
5,coverage_level95,0.853991,0.836949
6,mae,0.161286,0.162281
7,mape,0.048894,0.049624
8,mase,0.966846,0.975354
9,mqloss,0.056904,0.056216


In [None]:
#| hide
#| polars
import polars.testing

In [None]:
#| hide
#| polars
# Repeat the evaluation on a polars frame; `models` is omitted here, so the
# model columns are inferred from the frame.
series_pl = generate_series(10, n_models=2, level=[80, 95], engine='polars')
pl_evaluation = (
    evaluate(
        series_pl,
        metrics=metrics,
        train_df=series_pl,
        level=[80, 95],
    ).drop('unique_id')
)
pl_summary = ufp.group_by(pl_evaluation, 'metric').mean()
# The per-metric averages must match the pandas summary computed earlier.
pd.testing.assert_frame_equal(
    summary.sort_values('metric'),
    pl_summary.sort('metric').to_pandas(),
)
# `reduce_stat='mean'` must be equivalent to averaging the per-series scores by hand.
pl.testing.assert_frame_equal(
    evaluate(
        series_pl, metrics=metrics, train_df=series_pl, level=[80, 95], reduce_stat='mean'
    ).sort('metric'),
    pl_summary.sort('metric'),
)

In [None]:
#| hide
#| datasets
from datasetsforecast.evaluation import accuracy as ds_evaluate
import datasetsforecast.losses as ds_losses

In [None]:
#| hide
#| datasets
def daily_mase(y, y_hat, y_train):
    """Call `ds_losses.mase` with the seasonality fixed at 7."""
    return ds_losses.mase(y, y_hat, y_train, seasonality=7)

# Check `evaluate` against the datasetsforecast implementation, both for the
# per-series scores (reduce_stat=None) and for the averaged ones ('mean').
for reduce_stat in [None, 'mean']:
    uf_res = evaluate(
        series,
        metrics=metrics,
        models=models,
        train_df=series,
        level=[80, 95],
        reduce_stat=reduce_stat,
    )
    # aggregate by series only when we aren't reducing the scores
    agg_by = None if reduce_stat == 'mean' else ['unique_id']
    ds_res = ds_evaluate(
        series,
        metrics=[
            ds_losses.mae,
            ds_losses.mse,
            ds_losses.rmse,
            ds_losses.mape,
            daily_mase,
            ds_losses.smape,
            ds_losses.quantile_loss,        
            ds_losses.mqloss,
            ds_losses.coverage,        
            ds_losses.calibration,
            ds_losses.scaled_crps,
        ],
        level=[80, 95],
        Y_df=series,
        agg_by=agg_by,
    )
    # rename the reference metrics so they follow the naming used by `evaluate`
    ds_res['metric'] = ds_res['metric'].str.replace('-', '_')
    ds_res['metric'] = ds_res['metric'].str.replace('q_', 'q')
    ds_res['metric'] = ds_res['metric'].str.replace('lv_', 'level')
    ds_res['metric'] = ds_res['metric'].str.replace('daily_mase', 'mase')
    # utils doesn't multiply pct metrics by 100
    ds_res.loc[ds_res['metric'].str.startswith('coverage'), ['model0', 'model1']] /= 100
    ds_res.loc[ds_res['metric'].eq('mape'), ['model0', 'model1']] /= 100
    # we report smape between 0 and 1 instead of 0-200
    ds_res.loc[ds_res['metric'].eq('smape'), ['model0', 'model1']] /= 200

    # same column order and row order on both sides before comparing
    ds_res = ds_res[uf_res.columns]
    if reduce_stat is None:
        ds_res = ds_res.sort_values(['unique_id', 'metric'])
        uf_res = uf_res.sort_values(['unique_id', 'metric'])
    else:
        ds_res = ds_res.sort_values('metric')
        uf_res = uf_res.sort_values('metric')
    
    pd.testing.assert_frame_equal(
        uf_res.reset_index(drop=True),
        ds_res.reset_index(drop=True),
    )