In [None]:
#|default_exp utils

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

# Utils

In [None]:
#|export
import random
from itertools import chain
from math import ceil, log10
from typing import Tuple, Union

import numpy as np
import pandas as pd

In [None]:
import dask.dataframe as dd

from fastcore.test import test_eq, test_fail

In [None]:
#|export
def generate_daily_series(
    n_series: int,
    min_length: int = 50,
    max_length: int = 500,
    n_static_features: int = 0,
    equal_ends: bool = False,
    static_as_categorical: bool = True,
    seed: int = 0,
) -> pd.DataFrame:
    """Generate `n_series` synthetic daily series with lengths in [`min_length`, `max_length`].

    The target `y` is a repeating 0..6 ramp plus uniform noise in [0, 0.5). The ramp runs
    over the concatenation of all series, so it is not restarted at the start of each one.

    Parameters
    ----------
    n_series : number of series to generate; 0 yields an empty frame.
    min_length, max_length : inclusive bounds for the randomly drawn length of each series.
    n_static_features : number of `static_{i}` columns to add. Each one holds a single random
        integer in [0, 100) per series. Only `static_0` affects the target: `y` is multiplied
        by `1 + static_0`.
    equal_ends : if True all series end at the same date, otherwise they all start at
        2000-01-01.
    static_as_categorical : if True the static columns are cast to `category`.
    seed : seed for the local numpy generator. It is also used to seed the global `random`
        module, which is a side effect of calling this function.

    Returns
    -------
    DataFrame indexed by an ordered categorical `unique_id`, with columns `ds`, `y` and the
    requested `static_{i}` columns.
    """
    rng = np.random.RandomState(seed)
    random.seed(seed)
    series_lengths = rng.randint(min_length, max_length + 1, n_series)
    total_length = series_lengths.sum()
    # ids go from 0 to n_series - 1; log10 is undefined at 0, so guard the empty case
    n_digits = ceil(log10(n_series)) if n_series > 0 else 0

    dates = pd.date_range('2000-01-01', periods=max_length, freq='D').values
    uids = [
        [f'id_{i:0{n_digits}}'] * serie_length
        for i, serie_length in enumerate(series_lengths)
    ]
    if equal_ends:
        # slice from an explicit start position: `dates[-0:]` would return every date
        # instead of none when a series has length 0
        ds = [dates[max_length - serie_length:] for serie_length in series_lengths]
    else:
        ds = [dates[:serie_length] for serie_length in series_lengths]
    y = np.arange(total_length) % 7 + rng.rand(total_length) * 0.5
    series = pd.DataFrame(
        {
            'unique_id': list(chain.from_iterable(uids)),
            'ds': list(chain.from_iterable(ds)),
            'y': y,
        }
    )
    for i in range(n_static_features):
        # one value per series, repeated over all of its rows
        static_values = np.repeat(rng.randint(0, 100, n_series), series_lengths)
        series[f'static_{i}'] = static_values
        if static_as_categorical:
            series[f'static_{i}'] = series[f'static_{i}'].astype('category')
        if i == 0:
            # only the first static feature is related to the target
            series['y'] = series['y'] * (1 + static_values)
    series['unique_id'] = series['unique_id'].astype('category')
    series['unique_id'] = series['unique_id'].cat.as_ordered()
    series = series.set_index('unique_id')
    return series


Generate 20 series with lengths between 100 and 1,000.

In [None]:
# parameters shared by the examples below
n_series = 20
min_length = 100
max_length = 1000

series = generate_daily_series(
    n_series=n_series,
    min_length=min_length,
    max_length=max_length,
)
series

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
id_00,2000-01-01,0.395863
id_00,2000-01-02,1.264447
id_00,2000-01-03,2.284022
id_00,2000-01-04,3.462798
id_00,2000-01-05,4.035518
...,...,...
id_19,2002-03-11,0.309275
id_19,2002-03-12,1.189464
id_19,2002-03-13,2.325032
id_19,2002-03-14,3.333198


In [None]:
# number of rows generated for each series id
series_sizes = series.groupby('unique_id').size()
assert len(series_sizes) == n_series
assert min_length <= series_sizes.min()
assert max_length >= series_sizes.max()

We can also add static features to each series (these can be things like product_id or store_id). Only the first static feature (`static_0`) is relevant to the target.

In [None]:
n_static_features = 2

series_with_statics = generate_daily_series(
    n_series,
    min_length,
    max_length,
    n_static_features=n_static_features,
)
series_with_statics

Unnamed: 0_level_0,ds,y,static_0,static_1
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id_00,2000-01-01,7.521388,18,10
id_00,2000-01-02,24.024502,18,10
id_00,2000-01-03,43.396423,18,10
id_00,2000-01-04,65.793168,18,10
id_00,2000-01-05,76.674843,18,10
...,...,...,...,...
id_19,2002-03-11,27.834771,89,42
id_19,2002-03-12,107.051746,89,42
id_19,2002-03-13,209.252845,89,42
id_19,2002-03-14,299.987801,89,42


In [None]:
# each static feature must take exactly one value within a series
for i in range(n_static_features):
    values_per_series = series_with_statics.groupby('unique_id')[f'static_{i}'].nunique()
    assert values_per_series.eq(1).all()

If `equal_ends=False` (the default), every series starts on the same date, so series of different lengths end on different dates.

In [None]:
# more than one distinct end date across the series
assert series_with_statics.groupby('unique_id')['ds'].agg('max').nunique() > 1

We can have all of them end at the same date by specifying `equal_ends=True`.

In [None]:
series_equal_ends = generate_daily_series(
    n_series,
    min_length,
    max_length,
    equal_ends=True,
)

# a single distinct end date shared by all the series
assert series_equal_ends.groupby('unique_id')['ds'].agg('max').nunique() == 1

In [None]:
#|export
def generate_prices_for_series(series: pd.DataFrame, horizon: int = 7, seed: int = 0) -> pd.DataFrame:
    """Generate a catalog of random daily prices for every product in `series`.

    For each `product_id` the catalog spans from the product's first date in `series`
    up to `horizon` days after its last date, with prices drawn uniformly from [0, 1).

    Parameters
    ----------
    series : frame with `unique_id` as index (or column) and `ds` and `product_id` columns.
        All series must end at the same date.
    horizon : number of days after the last date for which prices are also generated.
    seed : seed for the random number generator used to draw the prices.

    Returns
    -------
    DataFrame with columns `ds`, `product_id` and `price`.

    Raises
    ------
    ValueError : if the series do not share the same last date or if `series` has no
        `product_id` column.
    """
    # honor the `seed` argument (it used to be ignored in favor of a hard-coded 0)
    rng = np.random.RandomState(seed)
    unique_last_dates = series.groupby('unique_id')['ds'].max().nunique()
    if unique_last_dates > 1:
        raise ValueError('series must have equal ends.')
    if 'product_id' not in series:
        raise ValueError('series must have a product_id column.')
    day_offset = pd.offsets.Day()
    # observed=True skips categories with no rows, which would otherwise produce NaT bounds;
    # string aggregation names avoid the deprecated builtin-callable path in recent pandas
    starts_ends = series.groupby('product_id', observed=True)['ds'].agg(['min', 'max'])
    dfs = []
    for idx, (start, end) in starts_ends.iterrows():
        dates = pd.date_range(start, end + horizon * day_offset, name='ds')
        product_df = pd.DataFrame(
            {
                'product_id': idx,
                'price': rng.rand(len(dates)),
            },
            index=dates,
        )
        dfs.append(product_df)
    prices_catalog = pd.concat(dfs).reset_index()
    return prices_catalog


In [None]:
# reuse the second static feature as the product identifier
series_for_prices = generate_daily_series(
    20, n_static_features=2, equal_ends=True
).rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(series_for_prices, horizon=7)
prices_catalog

Unnamed: 0,ds,product_id,price
0,2000-05-07,9,0.548814
1,2000-05-08,9,0.715189
2,2000-05-09,9,0.602763
3,2000-05-10,9,0.544883
4,2000-05-11,9,0.423655
...,...,...,...
4263,2001-05-17,93,0.800781
4264,2001-05-18,93,0.909013
4265,2001-05-19,93,0.904419
4266,2001-05-20,93,0.327888


In [None]:
# the catalog covers exactly the products present in the input
catalog_products = set(prices_catalog['product_id'])
expected_products = set(series_for_prices['product_id'])
test_eq(catalog_products, expected_products)
# a frame without a product_id column is rejected
test_fail(lambda: generate_prices_for_series(series_equal_ends), contains='product_id')
# series that end on different dates are rejected
test_fail(lambda: generate_prices_for_series(series), contains='equal ends')

In [None]:
#|export
def data_indptr_from_sorted_df(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """Return the `y` values of `df` and the boundaries of each series within them.

    `indptr` starts at 0 and is followed by the cumulative sizes of the `unique_id` groups,
    so `data[indptr[i]:indptr[i + 1]]` are the values of the i-th group when the rows of
    `df` are already ordered by `unique_id` (as the function name requires).
    """
    series_sizes = df.groupby('unique_id').size().values
    boundaries = np.concatenate(([0], np.cumsum(series_sizes)))
    return df['y'].values, boundaries

def ensure_sorted(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df` ordered by its index and then by `ds`.

    `ds` is temporarily appended to the index so the ordering can be checked (and fixed
    only when needed) in one go; it is moved back to a column before returning.
    """
    indexed = df.set_index('ds', append=True)
    already_sorted = indexed.index.is_monotonic_increasing
    ordered = indexed if already_sorted else indexed.sort_index()
    return ordered.reset_index('ds')

In [None]:
#|exporti
def _split_info(data: pd.DataFrame, offset: int, window_size: int, freq: Union[pd.offsets.BaseOffset, int]):
    """Compute, for every row of `data`, the end of the training set and a validation flag.

    For each `unique_id` the training set ends `offset` periods (of size `freq`) before the
    series' last `ds`. A row belongs to the validation set when its `ds` falls within the
    `window_size` periods that follow that training end.

    Returns a frame aligned with `data` holding the columns `train_end` and `is_valid`.
    """
    # TODO: try computing this once and passing it to this fn
    series_ends = data.groupby('unique_id')['ds'].transform('max')
    train_cutoffs = series_ends - offset * freq
    valid_cutoffs = train_cutoffs + window_size * freq
    after_train = data['ds'].gt(train_cutoffs)
    within_window = data['ds'].le(valid_cutoffs)
    return pd.DataFrame({'train_end': train_cutoffs, 'is_valid': after_train & within_window})

In [None]:
#|export
def backtest_splits(data, n_windows: int, window_size: int, freq: Union[pd.offsets.BaseOffset, int]):
    """Yield `n_windows` consecutive train/validation splits, from the oldest to the newest.

    Window `i` ends its training set `(n_windows - i) * window_size` periods before the last
    date of each series and validates on the following `window_size` periods.

    `data` is either a pandas DataFrame or an object exposing `head` and `map_partitions`
    (e.g. a dask DataFrame), in which case the split information is computed per partition.

    Each yielded item is a tuple `(train_ends, train, valid)` where `train_ends` holds the
    training cutoff of every validation row.
    """
    is_pandas = isinstance(data, pd.DataFrame)
    for windows_left in range(n_windows, 0, -1):
        offset = windows_left * window_size
        if is_pandas:
            splits = _split_info(data, offset, window_size, freq)
        else:
            # run on a small sample first to describe the output structure to map_partitions
            meta = _split_info(data.head(), offset, window_size, freq)
            splits = data.map_partitions(
                _split_info,
                offset=offset,
                window_size=window_size,
                freq=freq,
                meta=meta,
            )
        in_valid = splits['is_valid']
        in_train = data['ds'].le(splits['train_end'])
        train = data[in_train]
        valid = data[in_valid]
        yield splits.loc[in_valid, 'train_end'], train, valid

In [None]:
#| hide
n_windows = 3
window_size = 14
max_dates = series.groupby('unique_id')['ds'].max()
day_offset = pd.offsets.Day()
series_ddf = dd.from_pandas(series, npartitions=2)

for df in (series, series_ddf):
    if isinstance(df, pd.DataFrame):
        permuted_df = df.sample(frac=1.)
    else:
        permuted_df = df.map_partitions(lambda part: part.sample(frac=1.), meta=df)    
    splits = backtest_splits(df, n_windows, window_size, pd.offsets.Day())
    splits_on_permuted = list(backtest_splits(permuted_df, n_windows, window_size, pd.offsets.Day()))
    for window, (_, train, valid) in enumerate(splits):
        expected_max_train_dates = max_dates - day_offset * (n_windows - window) * window_size
        max_train_dates = train.groupby('unique_id')['ds'].max()
        if not isinstance(df, pd.DataFrame):
            max_train_dates = max_train_dates.compute()
        pd.testing.assert_series_equal(max_train_dates, expected_max_train_dates)

        expected_min_valid_dates = expected_max_train_dates + day_offset
        min_valid_dates = valid.groupby('unique_id')['ds'].min()
        if not isinstance(df, pd.DataFrame):
            min_valid_dates = min_valid_dates.compute()
        pd.testing.assert_series_equal(min_valid_dates, expected_min_valid_dates)

        expected_max_valid_dates = expected_max_train_dates + day_offset * window_size
        max_valid_dates = valid.groupby('unique_id')['ds'].max()
        if not isinstance(df, pd.DataFrame):
            max_valid_dates = max_valid_dates.compute()
        pd.testing.assert_series_equal(max_valid_dates, expected_max_valid_dates)

        if window == n_windows - 1:
            pd.testing.assert_series_equal(max_valid_dates, max_dates)
            
        _, permuted_train, permuted_valid = splits_on_permuted[window]            
        if not isinstance(df, pd.DataFrame):
            train = train.compute()
            valid = valid.compute()
            permuted_train = permuted_train.compute()
            permuted_valid = permuted_valid.compute()
        pd.testing.assert_frame_equal(train, permuted_train.sort_values(['unique_id', 'ds']))
        pd.testing.assert_frame_equal(valid, permuted_valid.sort_values(['unique_id', 'ds']))