In [None]:
#|default_exp utils

# Utils

In [None]:
#|export
import inspect
import random
import reprlib
import warnings
from functools import wraps
from itertools import chain
from math import ceil, log10
from typing import Optional, Union

import numpy as np
import pandas as pd

In [None]:
from fastcore.test import test_eq, test_fail, test_warns

In [None]:
#|export
def generate_daily_series(
    n_series: int, 
    min_length: int = 50,
    max_length: int = 500,
    n_static_features: int = 0,
    equal_ends: bool = False,
    static_as_categorical: bool = True,
    with_trend: bool = False,
    seed: int = 0,
) -> pd.DataFrame:
    """Generate `n_series` synthetic daily series with lengths in [`min_length`, `max_length`].

    The target is the row position modulo 7 plus uniform noise in [0, 0.5), so it
    repeats every seven rows.

    Parameters
    ----------
    n_series : int
        Number of series to generate. Must be at least 1.
    min_length : int (default=50)
        Minimum number of rows of a series (inclusive).
    max_length : int (default=500)
        Maximum number of rows of a series (inclusive).
    n_static_features : int (default=0)
        Number of static features. Each series gets one random integer in [0, 100)
        per feature. Only the first one (`static_0`) is used to scale the target.
    equal_ends : bool (default=False)
        If True, all series end at the same date, otherwise they all start at the same date.
    static_as_categorical : bool (default=True)
        Store the static features with categorical dtype.
    with_trend : bool (default=False)
        Add a linear trend with a random per-series slope to the target.
    seed : int (default=0)
        Seed for the random number generators.

    Returns
    -------
    pd.DataFrame
        Columns `unique_id` (ordered categorical), `ds`, `y` and `static_{i}` for each static feature.

    Raises
    ------
    ValueError
        If `n_series` is lower than 1.
    """
    if n_series < 1:
        raise ValueError('n_series must be a positive integer.')
    rng = np.random.RandomState(seed)
    # Kept for backward compatibility: this reseeds the global `random` module as a side effect.
    random.seed(seed)
    series_lengths = rng.randint(min_length, max_length + 1, n_series)
    total_length = series_lengths.sum()
    n_digits = ceil(log10(n_series))

    dates = pd.date_range('2000-01-01', periods=max_length, freq='D').values
    ids = [f'id_{i:0{n_digits}}' for i in range(n_series)]
    if equal_ends:
        # Use an explicit start: `dates[-0:]` would return every date for an empty series.
        ds = [dates[max_length - serie_length:] for serie_length in series_lengths]
    else:
        ds = [dates[:serie_length] for serie_length in series_lengths]
    y = np.arange(total_length) % 7 + rng.rand(total_length) * 0.5
    series = pd.DataFrame(
        {
            'unique_id': np.repeat(ids, series_lengths),
            'ds': np.concatenate(ds),
            'y': y,
        }
    )
    for i in range(n_static_features):
        static_col = f'static_{i}'
        static_values = np.repeat(rng.randint(0, 100, n_series), series_lengths)
        series[static_col] = static_values
        if static_as_categorical:
            series[static_col] = series[static_col].astype('category')
        if i == 0:
            # only the first static feature influences the target
            series['y'] = series['y'] * 0.1 * (1 + static_values)
    series['unique_id'] = series['unique_id'].astype('category')
    series['unique_id'] = series['unique_id'].cat.as_ordered()
    if with_trend:
        # Rows are laid out series by series, so the slopes can be matched by position
        # instead of relying on index alignment.
        coefs = np.repeat(rng.rand(n_series), series_lengths)
        steps = series.groupby('unique_id', observed=True).cumcount().to_numpy()
        series['y'] += coefs * steps
    return series

Generate 20 series with lengths between 100 and 1,000.

In [None]:
# Bounds reused by the checks in the following cells.
n_series, min_length, max_length = 20, 100, 1000

series = generate_daily_series(
    n_series=n_series,
    min_length=min_length,
    max_length=max_length,
)
series

Unnamed: 0,unique_id,ds,y
0,id_00,2000-01-01,0.395863
1,id_00,2000-01-02,1.264447
2,id_00,2000-01-03,2.284022
3,id_00,2000-01-04,3.462798
4,id_00,2000-01-05,4.035518
...,...,...,...
12446,id_19,2002-03-11,0.309275
12447,id_19,2002-03-12,1.189464
12448,id_19,2002-03-13,2.325032
12449,id_19,2002-03-14,3.333198


In [None]:
# `unique_id` is categorical; observed=True matches what `backtest_splits` does
# and avoids relying on the deprecated default for categorical groupers.
series_sizes = series.groupby('unique_id', observed=True).size()
assert series_sizes.size == n_series
assert series_sizes.min() >= min_length
assert series_sizes.max() <= max_length

We can also add static features to each series (these can be things like product_id or store_id). Only the first static feature (`static_0`) is relevant to the target.

In [None]:
n_static_features = 2

# Same bounds as before, now with two static columns per series.
series_with_statics = generate_daily_series(
    n_series=n_series,
    min_length=min_length,
    max_length=max_length,
    n_static_features=n_static_features,
)
series_with_statics

Unnamed: 0,unique_id,ds,y,static_0,static_1
0,id_00,2000-01-01,0.752139,18,10
1,id_00,2000-01-02,2.402450,18,10
2,id_00,2000-01-03,4.339642,18,10
3,id_00,2000-01-04,6.579317,18,10
4,id_00,2000-01-05,7.667484,18,10
...,...,...,...,...,...
12446,id_19,2002-03-11,2.783477,89,42
12447,id_19,2002-03-12,10.705175,89,42
12448,id_19,2002-03-13,20.925285,89,42
12449,id_19,2002-03-14,29.998780,89,42


In [None]:
# Every static feature must take a single value within each series.
for i in range(n_static_features):
    values_per_series = series_with_statics.groupby('unique_id', observed=True)[f'static_{i}'].nunique()
    assert values_per_series.eq(1).all()

If `equal_ends=False` (the default), the series can end on different dates.

In [None]:
# More than one distinct last date means the series do not all end together.
assert series_with_statics.groupby('unique_id', observed=True)['ds'].max().nunique() > 1

We can have all of them end at the same date by specifying `equal_ends=True`.

In [None]:
series_equal_ends = generate_daily_series(n_series, min_length, max_length, equal_ends=True)

# A single distinct last date across all series.
assert series_equal_ends.groupby('unique_id', observed=True)['ds'].max().nunique() == 1

In [None]:
#|export
def generate_prices_for_series(series: pd.DataFrame, horizon: int = 7, seed: int = 0) -> pd.DataFrame:
    """Generate a random daily price for every series, extended `horizon` days into the future.

    Parameters
    ----------
    series : pd.DataFrame
        Frame with at least the columns `unique_id` and `ds`. All series must end at the same date.
    horizon : int (default=7)
        Number of days after the last date of each series to generate prices for.
    seed : int (default=0)
        Seed for the random number generator.

    Returns
    -------
    pd.DataFrame
        Columns `ds`, `unique_id` and `price`, with one row per day between the first
        date of each series and its last date plus `horizon` days. Prices are uniform in [0, 1).

    Raises
    ------
    ValueError
        If the series don't all end at the same date.
    """
    rng = np.random.RandomState(seed)
    # String aggregations instead of the `min`/`max` builtins (passing builtins is deprecated);
    # observed=True skips categories that have no rows.
    starts_ends = series.groupby('unique_id', observed=True)['ds'].agg(['min', 'max'])
    if starts_ends['max'].nunique() > 1:
        raise ValueError('series must have equal ends.')
    future = horizon * pd.offsets.Day()
    dfs = []
    for uid, start, end in starts_ends.itertuples(name=None):
        index = pd.date_range(start, end + future, name='ds')
        product_df = pd.DataFrame(
            {
                'unique_id': uid,
                'price': rng.rand(len(index)),
            },
            index=index,
        )
        dfs.append(product_df)
    prices_catalog = pd.concat(dfs).reset_index()
    return prices_catalog

In [None]:
# Chained rename instead of `inplace=True`, so re-running the cell always starts from a fresh frame.
series_for_prices = (
    generate_daily_series(20, n_static_features=2, equal_ends=True)
    .rename(columns={'static_1': 'product_id'})
)
prices_catalog = generate_prices_for_series(series_for_prices, horizon=7)
prices_catalog

Unnamed: 0,ds,unique_id,price
0,2000-10-05,id_00,0.548814
1,2000-10-06,id_00,0.715189
2,2000-10-07,id_00,0.602763
3,2000-10-08,id_00,0.544883
4,2000-10-09,id_00,0.423655
...,...,...,...
5009,2001-05-17,id_19,0.288027
5010,2001-05-18,id_19,0.846305
5011,2001-05-19,id_19,0.791284
5012,2001-05-20,id_19,0.578636


In [None]:
# The catalog must cover exactly the ids present in the input.
priced_ids = set(prices_catalog['unique_id'])
expected_ids = set(series_for_prices['unique_id'])
test_eq(priced_ids, expected_ids)
# `series` was generated with the default `equal_ends=False`, so it must be rejected.
test_fail(lambda: generate_prices_for_series(series), contains='equal ends')

In [None]:
#| exporti
def single_split(
    df: pd.DataFrame,
    i_window: int,    
    n_windows: int,
    h: int,
    id_col: str,
    time_col: str,
    freq: Union[pd.offsets.BaseOffset, int],
    max_dates: pd.Series,  
    step_size: Optional[int] = None,
    input_size: Optional[int] = None,
):
    """Build the train/validation row masks for one backtesting window.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the `id_col` and `time_col` columns.
    i_window : int
        Zero-based index of the window.
    n_windows : int
        Total number of windows.
    h : int
        Number of `freq` steps in each validation set.
    id_col : str
        Column that identifies each series.
    time_col : str
        Column with the timestamps.
    freq : pd.offsets.BaseOffset or int
        Step between consecutive timestamps.
    max_dates : pd.Series
        Last timestamp of the series each row belongs to (one value per row of `df`).
    step_size : int, optional (default=None)
        Steps between consecutive windows. Defaults to `h`.
    input_size : int, optional (default=None)
        If set, keep only the last `input_size` steps before the cutoff in the training mask.

    Returns
    -------
    tuple
        `(cutoffs, train_mask, valid_mask)`: the last training timestamp per series
        (indexed by id, named 'cutoff') and two boolean masks over the rows of `df`.

    Raises
    ------
    ValueError
        If any series ends up with no training rows.
    """
    step = h if step_size is None else step_size
    # steps between the end of the training set and the last timestamp of each series
    steps_back = h + step * (n_windows - 1) - i_window * step
    times = df[time_col]
    train_ends = max_dates - steps_back * freq
    valid_ends = train_ends + h * freq

    train_mask = times.le(train_ends)
    if input_size is not None:
        train_mask = train_mask & times.gt(train_ends - input_size * freq)

    train_sizes = train_mask.groupby(df[id_col], observed=True).sum()
    without_train = train_sizes.eq(0)
    if without_train.any():
        ids = reprlib.repr(train_sizes[without_train].index.tolist())
        raise ValueError(f'The following series are too short for the window: {ids}')

    valid_mask = times.gt(train_ends) & times.le(valid_ends)
    # every row of a series shares the same train end, so keep the first one per id
    ends_by_id = train_ends.set_axis(df[id_col])
    cutoffs = ends_by_id.groupby(id_col, observed=True).head(1).rename('cutoff')
    return cutoffs, train_mask, valid_mask

In [None]:
#|export
def backtest_splits(
    df: pd.DataFrame,
    n_windows: int,
    h: int,
    id_col: str,
    time_col: str,
    freq: Union[pd.offsets.BaseOffset, int],
    step_size: Optional[int] = None,
    input_size: Optional[int] = None,
):
    """Lazily yield `(cutoffs, train, valid)` for each of the `n_windows` backtesting windows.

    `train` and `valid` are the rows of `df` selected by the masks that `single_split`
    returns for the window; `cutoffs` is passed through unchanged. The remaining
    arguments are forwarded to `single_split` as they are.
    """
    # last timestamp of each series, repeated on every row of that series
    last_times = df.groupby(id_col, observed=True)[time_col].transform('max')    
    shared_kwargs = dict(
        n_windows=n_windows,
        h=h,
        id_col=id_col,
        time_col=time_col,
        freq=freq,
        max_dates=last_times,
        step_size=step_size,
        input_size=input_size,
    )
    for window in range(n_windows):
        cutoffs, in_train, in_valid = single_split(df, i_window=window, **shared_kwargs)
        yield cutoffs, df[in_train], df[in_valid]

In [None]:
#| hide
# min_length defaults to 50, so with max_length=50 every series has exactly 50 rows.
short_series = generate_daily_series(100, max_length=50)
# h=49 leaves a single training row per series, which is still valid.
backtest_results = list(
    backtest_splits(
        short_series,
        n_windows=1,
        h=49,
        id_col='unique_id',
        time_col='ds',
        freq=pd.offsets.Day(),
    )
)[0]
# h=50 leaves no training rows at all.
test_fail(
    lambda: list(
        backtest_splits(
            short_series,
            n_windows=1,
            h=50,
            id_col='unique_id',
            time_col='ds',
            freq=pd.offsets.Day(),
        )
    ),
    contains='The following series are too short'
)
# Same data with integer timestamps (0, 1, 2, ... within each series) and an integer freq.
short_series_int = short_series.copy()
short_series_int['ds'] = short_series.groupby('unique_id', observed=True).cumcount()
backtest_int_results = list(
    backtest_splits(
        short_series_int,
        n_windows=1,
        h=40,
        id_col='unique_id',
        time_col='ds',
        freq=1
    )
)[0]

In [None]:
#| hide
# Module-level helpers left in place in case other cells reference them;
# `test_backtest_splits` no longer depends on them.
max_dates = series.groupby('unique_id', observed=True)['ds'].max()
day_offset = pd.offsets.Day()

def test_backtest_splits(df, n_windows, h, step_size, input_size):
    """Check every window produced by `backtest_splits` on the daily frame `df`.

    For each window this verifies the train/validation date boundaries, that the
    cutoffs match the last training dates, that the last window ends at the last
    date of each series, and that shuffling the rows of `df` selects the same rows.
    """
    one_day = pd.offsets.Day()
    # derived from `df` itself so the check is valid for whatever frame is passed in
    last_dates = df.groupby('unique_id', observed=True)['ds'].max()
    common_kwargs = dict(
        n_windows=n_windows,
        h=h,
        id_col='unique_id',
        time_col='ds',
        freq=pd.offsets.Day(), 
        step_size=step_size,
        input_size=input_size,        
    )
    permuted_df = df.sample(frac=1.0)
    splits = backtest_splits(df, **common_kwargs)
    splits_on_permuted = list(backtest_splits(permuted_df, **common_kwargs))
    if step_size is None:
        step_size = h
    test_size = h + step_size * (n_windows - 1)
    for window, (cutoffs, train, valid) in enumerate(splits):
        offset = test_size - window * step_size
        expected_max_train_dates = last_dates - one_day * offset
        max_train_dates = train.groupby('unique_id', observed=True)['ds'].max()
        pd.testing.assert_series_equal(max_train_dates, expected_max_train_dates)
        pd.testing.assert_series_equal(cutoffs, max_train_dates.rename('cutoff'))
        
        if input_size is not None:
            expected_min_train_dates = expected_max_train_dates - one_day * (input_size - 1)
            min_train_dates = train.groupby('unique_id', observed=True)['ds'].min()
            pd.testing.assert_series_equal(min_train_dates, expected_min_train_dates)

        expected_min_valid_dates = expected_max_train_dates + one_day
        min_valid_dates = valid.groupby('unique_id', observed=True)['ds'].min()
        pd.testing.assert_series_equal(min_valid_dates, expected_min_valid_dates)

        expected_max_valid_dates = expected_max_train_dates + one_day * h
        max_valid_dates = valid.groupby('unique_id', observed=True)['ds'].max()
        pd.testing.assert_series_equal(max_valid_dates, expected_max_valid_dates)

        if window == n_windows - 1:
            pd.testing.assert_series_equal(max_valid_dates, last_dates)

        # Both comparisons must run inside the loop so every window is checked,
        # not only the last one.
        _, permuted_train, permuted_valid = splits_on_permuted[window]            
        pd.testing.assert_frame_equal(train, permuted_train.sort_values(['unique_id', 'ds']))
        pd.testing.assert_frame_equal(valid, permuted_valid.sort_values(['unique_id', 'ds']))

for step_size in (None, 1, 2):
    for input_size in (None, 4):
        test_backtest_splits(series, n_windows=3, h=14, step_size=step_size, input_size=input_size)

In [None]:
#| exporti
def old_kw_to_pos(old_names, new_positions):
    """Decorator that redirects deprecated keyword arguments to their new positions.

    Parameters
    ----------
    old_names : list of str
        Deprecated keyword names.
    new_positions : list of int
        For each old name, the index (within the positional parameters of the decorated
        function, counting `self` for methods) of the parameter that replaces it.

    Returns
    -------
    callable
        A decorator. The wrapped function issues a `DeprecationWarning` whenever one of
        the old names is used and forwards its value to the new parameter, overriding a
        positional value at that position if there is one. Parameters that come before
        the new position and weren't provided keep their defaults.

    Raises
    ------
    TypeError
        At call time, if both the old keyword and its replacement are provided as keywords.
    """
    def decorator(f):
        # the signature doesn't change between calls, so inspect it once
        arg_names = inspect.getfullargspec(f).args

        @wraps(f)
        def inner(*args, **kwargs):
            new_args = list(args)
            for old_name, pos in zip(old_names, new_positions):
                if old_name not in kwargs:
                    continue
                new_name = arg_names[pos]
                warnings.warn(
                    f'`{old_name}` has been deprecated, please use `{new_name}` instead.',
                    DeprecationWarning,
                    stacklevel=2,  # point at the caller instead of this wrapper
                )
                # always remove the old keyword so it's never forwarded twice
                value = kwargs.pop(old_name)
                if pos < len(new_args):
                    new_args[pos] = value
                elif pos == len(new_args):
                    new_args.append(value)
                else:
                    # There's a gap between the positional arguments and the target.
                    # Passing by keyword leaves the parameters in between untouched,
                    # whether they were given as keywords or rely on their defaults.
                    if new_name in kwargs:
                        raise TypeError(
                            f'{f.__name__}() got values for both `{old_name}` and `{new_name}`'
                        )
                    kwargs[new_name] = value
            return f(*new_args, **kwargs)
        return inner
    return decorator

In [None]:
#| hide
@old_kw_to_pos(['d', 'e'], [0, 2])
def f(a, b, c, *, d=None, e=None):
    return a + b + c

test_eq(f(1, 2, 3), 6)
test_eq(f(a=1, b=2, c=3), 6)
f1 = lambda: f(1, b=2, e=3)
f2 = lambda: f(d=1, b=2, e=3)
with warnings.catch_warnings(record=True) as issued_warnings:
    warnings.simplefilter('always', DeprecationWarning)
    f1()
    f1()
# One warning per call; without this the `all` below would pass on an empty list.
assert len(issued_warnings) == 2
assert all('`e` has been deprecated, please use `c` instead' in str(w.message) for w in issued_warnings)
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    test_eq(f1(), 6)
    test_eq(f2(), 6)

In [None]:
#|export
class PredictionIntervals:
    """Holds the settings used to compute prediction intervals."""

    @old_kw_to_pos(['window_size'], [2])
    def __init__(
        self,
        n_windows: int = 2,
        h: int = 1,
        method: str = 'conformal_distribution',
        window_size: Optional[int] = None,  # noqa: ARG002
    ):
        """Validate and store the configuration.

        `n_windows` must be at least 2 and `method` must be either 'conformal_error'
        or 'conformal_distribution', otherwise a `ValueError` is raised.
        `window_size` is a deprecated keyword that the decorator redirects to `h`.
        """
        if n_windows < 2:
            raise ValueError('You need at least two windows to compute conformal intervals')
        allowed_methods = ['conformal_error', 'conformal_distribution']
        if method not in allowed_methods:
            raise ValueError(f'method must be one of {allowed_methods}')
        self.n_windows, self.h, self.method = n_windows, h, method

    def __repr__(self):
        settings = f"n_windows={self.n_windows}, h={self.h}, method='{self.method}'"
        return f"PredictionIntervals({settings})"

In [None]:
#| exporti
def _ensure_shallow_copy(df: pd.DataFrame) -> pd.DataFrame:
    from packaging.version import Version
    
    if Version(pd.__version__) < Version("1.4"):
        # https://github.com/pandas-dev/pandas/pull/43406
        df = df.copy()
    return df