In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp evaluation

# Evaluation
> Model performance evaluation

In [None]:
#| export
import inspect
import re
import reprlib
from typing import Callable, List, Optional

import pandas as pd

from utilsforecast.compat import DataFrame, pl_concat, pl_lit

In [None]:
#| exporti
def _function_name(f: Callable):
    if hasattr(f, 'func'):
        # partial fn
        name = f.func.__name__
    else:
        name = f.__name__
    return name

In [None]:
#| export
def evaluate(
    df: DataFrame,
    metrics: List[Callable],
    models: Optional[List[str]] = None,
    train_df: Optional[DataFrame] = None,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
    target_col: str = 'y',
) -> DataFrame:
    """Evaluate forecast using different metrics.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Forecasts to evaluate.
        Must have `id_col`, `time_col`, `target_col` and models' predictions.
    metrics : list of callable
        Functions that are called with the keyword arguments `df`, `models`,
        `id_col` and `target_col`, and additionally `train_df` when their
        signature declares a `train_df` parameter.
        Each one must return a dataframe; `functools.partial` objects are supported.
    models : list of str, optional (default=None)
        Names of the models to evaluate.
        If `None` will use every column in the dataframe after removing id, time,
        target and the prediction interval columns (`-lo-<level>` / `-hi-<level>`).
    train_df : pandas or polars DataFrame, optional (default=None)
        Training set. Used to evaluate metrics such as `mase`.
        Only required when at least one metric accepts `train_df`.
    id_col : str (default='unique_id')
        Column that identifies each series.
    time_col : str (default='ds')
        Column that identifies each timestep, its values can be timestamps or integers.
    target_col : str (default='y')
        Column that contains the target.

    Returns
    -------
    pandas or polars DataFrame
        Metrics with one row per (id, metric) combination and one column per model.
        `id_col` and `metric` are the first two columns.

    Raises
    ------
    ValueError
        If `metrics` is empty, if a metric needs `train_df` and it wasn't provided,
        or if `train_df` lacks some of the series present in `df`.
    """
    if not metrics:
        # fail early with a clear message instead of an opaque concat error
        raise ValueError('`metrics` must contain at least one metric function.')

    if models is None:
        model_cols = [
            c for c in df.columns
            if c not in [id_col, time_col, target_col]
            and not re.search(r'-(?:lo|hi)-\d+', c)
        ]
    else:
        model_cols = models

    # y_train
    # a metric needs the training set iff its signature has a `train_df` parameter
    metric_requires_y_train = {_function_name(m): 'train_df' in inspect.signature(m).parameters for m in metrics}
    y_train_metrics = [m for m, requires_yt in metric_requires_y_train.items() if requires_yt]
    if y_train_metrics:
        if train_df is None:
            raise ValueError(
                f'The following metrics require y_train: {y_train_metrics}. '
                'Please provide `train_df`.'
            )
        if isinstance(train_df, pd.DataFrame):
            train_df = train_df.sort_values([id_col, time_col])
        else:
            train_df = train_df.sort([id_col, time_col])
        missing_series = set(df[id_col].unique()) - set(train_df[id_col].unique())
        if missing_series:
            raise ValueError(
                f"The following series are missing from the train_df: {reprlib.repr(missing_series)}"
            )

    results_per_metric = []
    for metric in metrics:
        metric_name = _function_name(metric)
        kwargs = dict(
            df=df,
            models=model_cols,
            id_col=id_col,
            target_col=target_col
        )
        if metric_requires_y_train[metric_name]:
            kwargs['train_df'] = train_df
        result = metric(**kwargs)
        if isinstance(result, pd.DataFrame):
            # `assign` returns a new frame, so the metric's own output isn't mutated
            result = result.assign(metric=metric_name)
        else:
            result = result.with_columns(pl_lit(metric_name).alias('metric'))
        results_per_metric.append(result)

    is_pandas = isinstance(df, pd.DataFrame)
    if is_pandas:
        # every metric frame carries its own index, rebuild it to avoid duplicated labels
        out = pd.concat(results_per_metric, ignore_index=True)
    else:
        # diagonal concat fills the columns that a metric didn't produce with nulls
        out = pl_concat(results_per_metric, how='diagonal')
    leading_cols = [id_col, 'metric']
    ordered_cols = leading_cols + [c for c in out.columns if c not in leading_cols]
    if is_pandas:
        return out[ordered_cols]
    return out.select(ordered_cols)

In [None]:
#| hide
from nbdev import show_doc

In [None]:
# Render the API reference for `evaluate` (signature, docstring and parameters table).
show_doc(evaluate)

---

[source](https://github.com/Nixtla/utilsforecast/blob/main/utilsforecast/evaluation.py#L29){target="_blank" style="float:right; font-size:smaller"}

### evaluate

>      evaluate
>                (df:Union[pandas.core.frame.DataFrame,polars.dataframe.frame.Da
>                taFrame], metrics:List[Callable],
>                models:Optional[List[str]]=None, train_df:Union[pandas.core.fra
>                me.DataFrame,polars.dataframe.frame.DataFrame,NoneType]=None,
>                id_col:str='unique_id', time_col:str='ds', target_col:str='y')

Evaluate forecast using different metrics.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| df | Union |  | Forecasts to evaluate.<br>Must have `id_col`, `time_col`, `target_col` and models' predictions. |
| metrics | List |  | Functions with arguments `y`, `y_hat`, and optionally `y_train`. |
| models | Optional | None | Names of the models to evaluate.<br>If `None` will use every column in the dataframe after removing id, time and target. |
| train_df | Union | None | Training set. Used to evaluate metrics such as `mase`.  |
| id_col | str | unique_id | Column that identifies each serie. |
| time_col | str | ds | Column that identifies each timestep, its values can be timestamps or integers. |
| target_col | str | y | Column that contains the target. |
| **Returns** | **Union** |  | **Metrics with one row per (id, metric) combination and one column per model.** |

In [None]:
from functools import partial

# `np` is used by the cells below (seeded RNG, quantile arrays), so import it
# explicitly instead of relying on it leaking from another module.
import numpy as np
import pandas as pd
import polars as pl

from utilsforecast.losses import *
from utilsforecast.data import generate_series

In [None]:
series = generate_series(10, static_as_categorical=False)
# Two independent seeded generators: `rng` drives the point forecasts and
# `interval_rng` the interval bounds, so these draws no longer depend on the
# unseeded global numpy state.
rng = np.random.RandomState(0)
interval_rng = np.random.RandomState(1)
models = ['model1', 'model2']
n_rows = series.shape[0]
for model in models:
    series[model] = series['y'] * rng.rand(n_rows)
    # lower bound shrinks the forecast, upper bound inflates it
    series[f'{model}-lo-80'] = series[model] * interval_rng.rand(n_rows)
    series[f'{model}-hi-80'] = series[model] * (1 + interval_rng.rand(n_rows))
series_pl = pl.from_pandas(series)

In [None]:
# Losses that work with their default arguments are passed as-is;
# the ones that need extra arguments are bound with `partial`.
metrics = [mae, mse, rmse, mape, smape]
metrics.append(partial(mase, seasonality=7))
metrics.append(partial(rmae, baseline_models=models[::-1]))
metrics.append(quantile_loss)
metrics.append(partial(mqloss, quantiles=np.array([0.3, 0.5])))
metrics.extend(partial(level_metric, level=80) for level_metric in (coverage, calibration))
metrics.append(partial(scaled_crps, quantiles=np.array([0.3, 0.5])))

In [None]:
# Score every series with pandas inputs, then average each metric across series.
pd_scores = evaluate(series, metrics=metrics, models=models, train_df=series)
pd_evaluation = (
    pd_scores
    .drop(columns='unique_id')
    .groupby('metric')
    .mean()
    .reset_index()
)
pd_evaluation

Unnamed: 0,metric,model1,model2,model1_div_model2,model2_div_model1
0,calibration,0.309941,0.308847,,
1,coverage,0.309941,0.308847,,
2,mae,1.601594,1.621353,,
3,mape,0.496531,0.500453,,
4,mase,9.624164,9.71139,,
5,mqloss,0.640637,0.648541,,
6,mse,4.720874,4.790493,,
7,quantile_loss,0.800797,0.810676,,
8,rmae,,,0.992867,1.01428
9,rmse,2.171034,2.185081,,


In [None]:
# Same evaluation with polars inputs. `models` is intentionally omitted so that
# the model columns are inferred from the dataframe.
pl_evaluation = (
    evaluate(
        series_pl,
        metrics=metrics,
        train_df=series_pl,
    )
    # positional column name: newer polars releases don't accept the `columns=` keyword
    .drop('unique_id')
    .group_by('metric')
    .mean()
)

In [None]:
# Both backends must produce the same per-metric averages.
pd_sorted = pd_evaluation.sort_values('metric')
pl_sorted = pl_evaluation.sort('metric').to_pandas()
pd.testing.assert_frame_equal(pd_sorted, pl_sorted)