In [None]:
#default_exp utils

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Utils

In [None]:
#export
import random
from itertools import chain
from math import ceil, log10
from typing import Tuple

import numpy as np
import pandas as pd
from numba import njit


In [None]:
import dask.dataframe as dd

In [None]:
#export
def generate_daily_series(
    n_series: int,
    min_length: int = 50,
    max_length: int = 500,
    n_static_features: int = 0,
    equal_ends: bool = False,
    seed: int = 0,
) -> pd.DataFrame:
    """Generates `n_series` synthetic daily series of different lengths.

    Each length is drawn uniformly from the interval [`min_length`, `max_length`].
    The calendar starts at 2000-01-01 and spans `max_length` days.

    Parameters
    ----------
    n_series : number of series to generate (must be at least 1).
    min_length : minimum number of observations per series.
    max_length : maximum number of observations per series.
    n_static_features : number of `static_{i}` columns to add. Each series gets a
        single random integer in [0, 100] per feature, stored as a category.
        The first feature (`static_0`) scales the target.
    equal_ends : if True every series ends on the last calendar date,
        otherwise every series starts on the first calendar date.
    seed : seed for the random generators.

    Returns
    -------
    A dataframe indexed by `unique_id` (ordered categorical) with the columns
    `ds`, `y` and one `static_{i}` column per static feature.

    Raises
    ------
    ValueError : if `n_series` is smaller than 1.

    Notes
    -----
    This function reseeds the global `random` module with `seed`."""
    if n_series < 1:
        raise ValueError(f'n_series must be at least 1, got {n_series}.')
    rng = np.random.RandomState(seed)
    random.seed(seed)
    series_lengths = rng.randint(min_length, max_length + 1, n_series)
    total_length = series_lengths.sum()
    # zero-padding width so that the ids sort lexicographically
    n_digits = ceil(log10(n_series))

    dates = pd.date_range('2000-01-01', periods=max_length, freq='D').values
    uids = [
        [f'id_{i:0{n_digits}}'] * serie_length
        for i, serie_length in enumerate(series_lengths)
    ]
    if equal_ends:
        # Use an explicit start offset instead of a negative one:
        # `dates[-0:]` would return the whole calendar for an empty series.
        ds = [dates[max_length - serie_length:] for serie_length in series_lengths]
    else:
        ds = [dates[:serie_length] for serie_length in series_lengths]
    # weekly pattern over the row position of the stacked frame plus noise in [0, 0.5)
    y = np.arange(total_length) % 7 + rng.rand(total_length) * 0.5
    series = pd.DataFrame(
        {
            'unique_id': list(chain.from_iterable(uids)),
            'ds': list(chain.from_iterable(ds)),
            'y': y,
        }
    )
    for i in range(n_static_features):
        # one random value per series, repeated for each of its rows
        static_values = [
            [random.randint(0, 100)] * serie_length for serie_length in series_lengths
        ]
        series[f'static_{i}'] = list(chain.from_iterable(static_values))
        series[f'static_{i}'] = series[f'static_{i}'].astype('category')
        if i == 0:
            # only the first static feature influences the target
            series['y'] = series['y'] * (1 + series[f'static_{i}'].cat.codes)
    series['unique_id'] = series['unique_id'].astype('category')
    series['unique_id'] = series['unique_id'].cat.as_ordered()
    series = series.set_index('unique_id')
    return series


Generate 20 series with lengths between 100 and 1,000.

In [None]:
# Number of series and the allowed range for their lengths.
n_series, min_length, max_length = 20, 100, 1000

series = generate_daily_series(
    n_series=n_series,
    min_length=min_length,
    max_length=max_length,
)
series

In [None]:
# Number of rows held by each series.
series_sizes = series.groupby('unique_id').size()
assert len(series_sizes) == n_series
# every length must fall inside the requested (inclusive) interval
assert series_sizes.between(min_length, max_length).all()

We can also add static features to each series (for example a product_id or a store_id). Only the first static feature (`static_0`) is relevant to the target.

In [None]:
n_static_features = 2

# Same sizes as before, now with static feature columns attached.
series_with_statics = generate_daily_series(
    n_series,
    min_length,
    max_length,
    n_static_features=n_static_features,
)
series_with_statics

In [None]:
# A static feature must hold exactly one distinct value within each series.
statics_by_serie = series_with_statics.groupby('unique_id')
for i in range(n_static_features):
    n_distinct = statics_by_serie[f'static_{i}'].nunique()
    assert (n_distinct == 1).all()

If `equal_ends=False` (the default), all series start on the same date and run for their own length, so they generally end on different dates.

In [None]:
# Last observed date of every series; more than one distinct value means unequal ends.
last_dates = series_with_statics.groupby('unique_id')['ds'].max()
assert last_dates.nunique() > 1

We can have all of them end at the same date by specifying `equal_ends=True`.

In [None]:
series_equal_ends = generate_daily_series(
    n_series,
    min_length,
    max_length,
    equal_ends=True,
)

# All series must share a single final date.
equal_end_dates = series_equal_ends.groupby('unique_id')['ds'].max()
assert equal_end_dates.nunique() == 1

In [None]:
#exporti
@njit
def _get_last_n_mask(x: np.ndarray, n: int) -> np.ndarray:
    """Boolean mask with one entry per element of `x`, True only for the last `n` of them.

    When `n` is at least `x.size` every entry is True."""
    size = x.size
    # first position to keep, clipped at 0 when `n` exceeds the number of samples
    keep_from = max(0, size - n)
    mask = np.zeros(size, dtype=np.bool_)
    mask[keep_from:] = True
    return mask


In [None]:
#hide
sample = np.array([1, 2, 3])
# n smaller than the array: only the last n entries are flagged
np.testing.assert_equal(_get_last_n_mask(sample, 2), np.array([False, True, True]))
# n larger than the array: every entry is flagged
np.testing.assert_equal(_get_last_n_mask(sample, 4), np.array([True, True, True]))

In [None]:
#export
def data_indptr_from_sorted_df(df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
    """Extract the target values of `df` together with the boundaries of each serie.

    Returns a tuple `(data, indptr)` where `data` holds the values of the `y` column
    and `indptr` holds the cumulative group sizes of `unique_id`, starting at 0.
    As the name says, `df` is expected to be sorted by `unique_id`, in which case
    serie `i` is `data[indptr[i]:indptr[i + 1]]`."""
    data = df['y'].values
    sizes = df.groupby('unique_id').size().values
    # prepend 0 so that consecutive entries delimit each serie
    indptr = np.concatenate(([0], sizes.cumsum()))
    return data, indptr


In [None]:
#exporti
@njit
def _get_mask(data: np.ndarray, indptr: np.ndarray, n: int) -> np.ndarray:
    """Flag the last `n` samples of every serie stored in `data`.

    `indptr` holds the serie boundaries: serie `i` is `data[indptr[i]:indptr[i + 1]]`.
    Returns a boolean array with the same shape as `data`."""
    # Allocate the mask as boolean instead of inheriting the dtype of `data`,
    # which would store the flags as (much larger) target-typed values.
    mask = np.empty_like(data, np.bool_)
    for start, end in zip(indptr[:-1], indptr[1:]):
        mask[start:end] = _get_last_n_mask(data[start:end], n)
    return mask


def _get_dataframe_mask(df, n) -> np.ndarray:
    """Build a boolean row mask for `df` that selects the last `n` rows of each serie.

    The serie boundaries come from `data_indptr_from_sorted_df`, so `df` is expected
    to be sorted by `unique_id`. The result is a numpy array (not a pandas Series)
    with one entry per row of `df`."""
    data, indptr = data_indptr_from_sorted_df(df)
    mask = _get_mask(data, indptr, n)
    # make sure the caller always receives a boolean array suitable for indexing
    return mask.astype(bool)


def _split_frame(data, n_windows, window, valid_size):
    """Split `data` into the train and valid frames of backtest window `window`.

    The last `(n_windows - window) * valid_size` rows of each serie are held out of
    train. Valid is the first `valid_size` rows of that held-out tail. `data` is
    either a pandas dataframe or an object exposing `map_partitions`
    (e.g. a dask dataframe), in which case the masks are built per partition."""
    held_out_size = (n_windows - window) * valid_size
    later_windows_size = held_out_size - valid_size

    if isinstance(data, pd.DataFrame):
        def last_n_mask(n):
            return _get_dataframe_mask(data, n)
    else:
        def last_n_mask(n):
            return data.map_partitions(_get_dataframe_mask, n, meta=bool)

    is_held_out = last_n_mask(held_out_size)
    is_later_window = last_n_mask(later_windows_size)
    is_train = ~is_held_out
    # valid = held-out rows that do not belong to the windows that come afterwards
    is_valid = is_held_out & ~is_later_window
    return data[is_train], data[is_valid]


In [None]:
#export
def backtest_splits(data, n_windows, window_size):
    """Lazily yields `n_windows` pairs of (train, valid) frames built from `data`.

    Every valid frame is cut with `window_size` samples per serie and the
    windows are produced in order, from the earliest to the latest."""
    yield from (
        _split_frame(data, n_windows, window, window_size)
        for window in range(n_windows)
    )


In [None]:
n_windows = 3
window_size = 14
max_dates = series.groupby('unique_id')['ds'].max()
# `pd.offsets` is the documented public location of the `Day` offset.
day_offset = pd.offsets.Day()
series_ddf = dd.from_pandas(series, npartitions=2)


def _ds_by_serie(frame, agg):
    """Per-serie `agg` ('min' or 'max') of `ds`, materialized when `frame` is not a pandas frame."""
    result = getattr(frame.groupby('unique_id')['ds'], agg)()
    if not isinstance(frame, pd.DataFrame):
        # dask results are lazy and must be computed before they can be compared
        result = result.compute()
    return result


# The same expectations must hold for the pandas frame and for its dask counterpart.
for df in (series, series_ddf):
    splits = backtest_splits(df, n_windows, window_size)
    for window, (train, valid) in enumerate(splits):
        # train stops right before the windows that are still held out
        expected_max_train_dates = max_dates - day_offset * (n_windows - window) * window_size
        max_train_dates = _ds_by_serie(train, 'max')
        assert max_train_dates.equals(expected_max_train_dates)

        # valid starts the day after train ends
        expected_min_valid_dates = expected_max_train_dates + day_offset
        min_valid_dates = _ds_by_serie(valid, 'min')
        assert min_valid_dates.equals(expected_min_valid_dates)

        # ... and spans exactly `window_size` days
        expected_max_valid_dates = expected_max_train_dates + day_offset * window_size
        max_valid_dates = _ds_by_serie(valid, 'max')
        assert max_valid_dates.equals(expected_max_valid_dates)

        if window == n_windows - 1:
            # the last window reaches the end of every serie
            assert max_valid_dates.equals(max_dates)