In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp processing

In [None]:
#| export
import re
import reprlib
import warnings
from typing import Any, Dict, Generator, List, NamedTuple, Optional, Tuple, Union

import numpy as np
import pandas as pd
from pandas.tseries.offsets import BaseOffset

from utilsforecast.compat import DataFrame, Series, pl, pl_DataFrame, pl_Series
from utilsforecast.validation import (
    _is_dt_dtype,
    _is_int_dtype,
    ensure_shallow_copy,
    validate_format,
)

In [None]:
import datetime
from datetime import datetime as dt

from fastcore.test import test_eq, test_fail
from nbdev import show_doc

from utilsforecast.compat import POLARS_INSTALLED
from utilsforecast.data import generate_series

In [None]:
#| polars
import polars.testing

In [None]:
#| exporti
def _polars_categorical_to_numerical(serie: pl_Series) -> pl_Series:
    """Replace a polars categorical series with its physical representation.

    Series with any other dtype are returned unchanged.
    """
    is_categorical = serie.dtype == pl.Categorical
    return serie.to_physical() if is_categorical else serie

In [None]:
#| export
def to_numpy(df: DataFrame) -> np.ndarray:
    """Convert a dataframe to a numpy array, encoding categorical columns numerically.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.

    Returns
    -------
    numpy ndarray
        Values of the dataframe, with pandas categoricals replaced by their codes
        and polars categoricals replaced by their physical representation.
    """
    if not isinstance(df, pd.DataFrame):
        # fall back to `map` when `map_batches` isn't available
        try:
            to_physical = pl.all().map_batches(_polars_categorical_to_numerical)
        except AttributeError:
            to_physical = pl.all().map(_polars_categorical_to_numerical)
        return df.select(to_physical).to_numpy(order='c')
    categoricals = [
        name for name, dtype in df.dtypes.items() if isinstance(dtype, pd.CategoricalDtype)
    ]
    if categoricals:
        # replace the columns on a shallow copy instead of on the caller's dataframe
        df = ensure_shallow_copy(df.copy(deep=False))
        for name in categoricals:
            df[name] = df[name].cat.codes
    return df.to_numpy()

In [None]:
#| export
def counts_by_id(df: DataFrame, id_col: str) -> DataFrame:
    """Count the number of rows of each id, sorted by id.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.
    id_col : str
        Name of the column that identifies each series.

    Returns
    -------
    pandas or polars DataFrame
        Dataframe with the columns `id_col` and 'counts'.
    """
    if not isinstance(df, pd.DataFrame):
        counts = df[id_col].value_counts().sort(id_col)
    else:
        counts = df[id_col].value_counts(sort=False, dropna=False)
        sort_keys = counts.index
        if isinstance(sort_keys.dtype, pd.CategoricalDtype):
            # value_counts has no `observed` argument, so unseen categories
            # can show up with a count of zero and have to be removed
            counts = counts[counts > 0]
            # categoricals are ordered by their codes
            sort_keys = counts.index.codes
        order = sort_keys.argsort()
        counts = counts.iloc[order].reset_index()
    counts.columns = [id_col, 'counts']
    return counts

In [None]:
#| export
def maybe_compute_sort_indices(
    df: DataFrame, id_col: str, time_col: str
) -> Optional[np.ndarray]:
    """Compute indices that would sort the dataframe by id and time

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe with id, times and target values.
    id_col : str
        Name of the column that identifies each series.
    time_col : str
        Name of the column that holds the times.

    Returns
    -------
    numpy array or None
        Array with indices to sort the dataframe or None if it's already sorted.
    """
    is_pandas = isinstance(df, pd.DataFrame)
    ids = df[id_col]
    times = df[time_col]
    if is_pandas:
        if isinstance(ids.dtype, pd.CategoricalDtype):
            # categoricals are sorted by their codes, same as in counts_by_id
            ids = ids.cat.codes
        # comparing pandas series aligns them, which is slow, so use numpy arrays
        ids = ids.to_numpy()
        times = times.to_numpy()
    if (ids[:-1] <= ids[1:]).all():
        # within the same id the times must be strictly increasing
        ascending_or_new_id = (times[:-1] < times[1:]) | (ids[:-1] != ids[1:])
        if ascending_or_new_id.all():
            return None
    if not is_pandas:
        return df.select(pl.arg_sort_by([id_col, time_col])).to_series(0).to_numpy()
    if pd.api.types.is_object_dtype(df.dtypes[id_col]):
        # MultiIndex.argsort is faster than lexsort for strings
        return pd.MultiIndex.from_frame(df[[id_col, time_col]]).argsort()
    return np.lexsort((times, ids))

In [None]:
#| export
def assign_columns(df: DataFrame, names: Union[str, List[str]], values: Union[np.ndarray, pd.Series, pl_Series, List[float]]) -> DataFrame:
    """Add or replace one or more columns of a dataframe.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Dataframe to assign the columns to. Pandas dataframes are modified in place.
    names : str or list of str
        Name(s) of the column(s) to assign.
    values : scalar, numpy array, pandas Series, polars Series or list
        Values to assign. Lists are only supported for a single column and
        must have as many elements as the dataframe has rows.

    Returns
    -------
    pandas or polars DataFrame
        Dataframe with the assigned columns.

    Raises
    ------
    ValueError
        If `values` is a list that doesn't match the above restrictions or
        if its type can't be assigned to a polars dataframe.
    """
    if isinstance(values, list) and (len(values) != df.shape[0] or not isinstance(names, str)):
        raise ValueError('Only single column assignment is supported for lists.')
    if isinstance(df, pd.DataFrame):
        df[names] = values
        return df
    # strings have a length but are assigned as a constant column
    is_scalar = isinstance(values, str) or not hasattr(values, '__len__')
    vals: Union[pl_DataFrame, pl_Series, pl.Expr]
    if is_scalar:
        assert isinstance(names, str)
        vals = pl.lit(values).alias(names)
    elif isinstance(values, pl_Series):
        assert isinstance(names, str)
        vals = values.alias(names)
    elif isinstance(values, np.ndarray):
        if isinstance(names, str):
            names = [names]
        vals = pl.from_numpy(values, schema=names, orient='row')
    elif isinstance(values, list):
        assert isinstance(names, str)
        vals = pl_Series(name=names, values=values)
    else:
        # previously this fell through and failed with an UnboundLocalError
        raise ValueError(
            f"Cannot assign values of type '{type(values)}' to a polars dataframe."
        )
    return df.with_columns(vals)

In [None]:
# dataframe engines exercised by the test cells; polars is only added when it's installed
engines = ['pandas']
if POLARS_INSTALLED:
    engines.append('polars')

In [None]:
# assign_columns: single array, 2d array (two columns), numeric scalar and string scalar
for engine in engines:
    series = generate_series(2, engine=engine)
    x = np.random.rand(series.shape[0])
    series = assign_columns(series, 'x', x)
    series = assign_columns(series, ['y', 'z'], np.vstack([x, x]).T)
    series = assign_columns(series, 'ones', 1)
    series = assign_columns(series, 'zeros', np.zeros(series.shape[0]))
    series = assign_columns(series, 'as', 'a')
    np.testing.assert_allclose(
        series[['x', 'y', 'z']],
        np.vstack([x, x, x]).T
    )
    np.testing.assert_equal(series['ones'], np.ones(series.shape[0]))
    np.testing.assert_equal(series['as'], np.full(series.shape[0], 'a'))

In [None]:
#| export
def drop_columns(df: DataFrame, columns: Union[str, List[str]]) -> DataFrame:
    """Return the dataframe without the specified column(s)."""
    if isinstance(df, pd.DataFrame):
        return df.drop(columns=columns)
    return df.drop(columns)

In [None]:
#| export
def take_rows(df: Union[DataFrame, Series, np.ndarray], idxs: np.ndarray) -> DataFrame:
    """Select the rows at the given positions.

    Pandas objects are indexed positionally through `iloc`, anything else
    is indexed directly with `idxs`.
    """
    if isinstance(df, (pd.DataFrame, pd.Series)):
        return df.iloc[idxs]
    return df[idxs]

In [None]:
# take_rows returns exactly the requested number of rows for every engine
for engine in engines:
    series = generate_series(2, engine=engine)
    subset = take_rows(series, np.array([0, 2]))
    assert subset.shape[0] == 2

In [None]:
#| export
def filter_with_mask(
    df: Union[Series, DataFrame, pd.Index, np.ndarray],
    mask: Union[np.ndarray, pd.Series, pl_Series]
) -> DataFrame:
    """Keep only the entries where `mask` is True.

    Pandas objects and numpy arrays are filtered through boolean indexing,
    anything else through its `filter` method.
    """
    supports_bool_indexing = isinstance(df, (pd.DataFrame, pd.Series, pd.Index, np.ndarray))
    if not supports_bool_indexing:
        return df.filter(mask)  # type: ignore
    return df[mask]

In [None]:
#| export
def is_nan(s: Series) -> Series:
    """Build a boolean mask of the NaN entries of a series.

    Pandas series use `isna`, anything else uses `is_nan`.
    """
    if isinstance(s, pd.Series):
        return s.isna()
    return s.is_nan()

In [None]:
# pandas flags both NaN and None, polars flags only NaN and keeps the null as null
np.testing.assert_equal(
    is_nan(pd.Series([np.nan, 1.0, None])).to_numpy(),
    np.array([True, False, True]),
)
if POLARS_INSTALLED:
    np.testing.assert_equal(
        is_nan(pl.Series([np.nan, 1.0, None])).to_numpy(),
        np.array([True, False, None]),
    )

In [None]:
#| export
def is_none(s: Series) -> Series:
    """Build a boolean mask of the null entries of a series.

    For pandas series this is the same mask that `is_nan` produces,
    anything else uses `is_null`.
    """
    if not isinstance(s, pd.Series):
        return s.is_null()
    return is_nan(s)

In [None]:
# pandas flags both NaN and None, polars flags only the null entry
np.testing.assert_equal(
    is_none(pd.Series([np.nan, 1.0, None])).to_numpy(),
    np.array([True, False, True]),
)
if POLARS_INSTALLED:
    np.testing.assert_equal(
        is_none(pl.Series([np.nan, 1.0, None])).to_numpy(),
        np.array([False, False, True]),
    )

In [None]:
#| export
def is_nan_or_none(s: Series) -> Series:
    """Build a boolean mask of the entries that are either NaN or null."""
    nan_mask = is_nan(s)
    none_mask = is_none(s)
    return nan_mask | none_mask

In [None]:
# both engines flag the NaN and the None entries
np.testing.assert_equal(
    is_nan_or_none(pd.Series([np.nan, 1.0, None])).to_numpy(),
    np.array([True, False, True]),
)
if POLARS_INSTALLED:
    np.testing.assert_equal(
        is_nan_or_none(pl.Series([np.nan, 1.0, None])).to_numpy(),
        np.array([True, False, True]),
    )

In [None]:
#| export
def match_if_categorical(s1: Union[Series, pd.Index], s2: Series) -> Tuple[Series, Series]:
    """Make two series share the same categories when the first one is categorical.

    Parameters
    ----------
    s1 : pandas Series, pandas Index or polars Series
        Original values. Its categories are kept first.
    s2 : pandas or polars Series
        New values, may or may not be categorical.

    Returns
    -------
    tuple of series
        `s1` and `s2`. If `s1` isn't categorical both are returned unchanged.
        For pandas they're also returned unchanged when `s2` has no values
        outside the categories of `s1`.
    """
    if isinstance(s1.dtype, pd.CategoricalDtype):
        if isinstance(s1, pd.Index):
            cat1 = s1.categories
        else:
            cat1 = s1.cat.categories
        if isinstance(s2.dtype, pd.CategoricalDtype):
            cat2 = s2.cat.categories
        else:
            # use the distinct values of s2 as its categories
            cat2 = s2.unique().astype(cat1.dtype)
        missing = set(cat2) - set(cat1)
        if missing:
            # we assume the original is s1, so we extend its categories
            new_dtype = pd.CategoricalDtype(categories=cat1.tolist() + sorted(missing))
            s1 = s1.astype(new_dtype)
            s2 = s2.astype(new_dtype)
    elif isinstance(s1, pl_Series) and s1.dtype == pl.Categorical:
        # the casts below have to happen under the same string cache
        with pl.StringCache():
            cat1 = s1.cat.get_categories()
            if s2.dtype == pl.Categorical:
                cat2 = s2.cat.get_categories()
            else:
                cat2 = s2.unique().sort().cast(cat1.dtype)
            # populate cache, keep original categories first
            pl.concat([cat1, cat2]).cast(pl.Categorical)
            # re-encode both series through strings using the populated cache
            s1 = s1.cast(pl.Utf8).cast(pl.Categorical)
            s2 = s2.cast(pl.Utf8).cast(pl.Categorical)
    return s1, s2

In [None]:
#| export
def vertical_concat(
    dfs: List[Union[DataFrame, Series]],
    match_categories: bool = True
) -> Union[DataFrame, Series]:
    """Stack dataframes or series on top of each other.

    Parameters
    ----------
    dfs : list of pandas or polars DataFrame or Series
        Objects to concatenate. The type of the first one determines
        how the rest are handled.
    match_categories : bool (default=True)
        Unify the categories of the categorical columns before concatenating.
        This only applies to dataframes and is supported for at most two of them.

    Returns
    -------
    pandas or polars DataFrame or Series
        Concatenated object. Pandas results get a fresh integer index.

    Raises
    ------
    ValueError
        If `dfs` is empty.
    NotImplementedError
        If categories have to be matched between more than two dataframes.
    """
    if not dfs:
        raise ValueError("Can't concatenate empty list.")
    first = dfs[0]
    if isinstance(first, pd.Series):
        return pd.concat(dfs).reset_index(drop=True)
    if isinstance(first, pd.DataFrame):
        cat_cols = [c for c, dtype in first.dtypes.items() if isinstance(dtype, pd.CategoricalDtype)]
        # a single dataframe has nothing to be matched against
        if match_categories and cat_cols and len(dfs) > 1:
            if len(dfs) > 2:
                raise NotImplementedError('Categorical replacement for more than two dataframes')
            df1, df2 = dfs
            # shallow copies so the columns of the inputs aren't replaced
            df1 = df1.copy(deep=False)
            df2 = df2.copy(deep=False)
            for col in cat_cols:
                s1, s2 = match_if_categorical(df1[col], df2[col])
                df1[col] = s1
                df2[col] = s2
            dfs = [df1, df2]
        return pd.concat(dfs).reset_index(drop=True)
    if isinstance(first, pl_Series):
        return pl.concat(dfs)
    # anything else is treated as a polars dataframe
    all_cols = first.columns
    cat_cols = [all_cols[i] for i, dtype in enumerate(first.dtypes) if dtype == pl.Categorical]
    if match_categories and cat_cols and len(dfs) > 1:
        if len(dfs) > 2:
            raise NotImplementedError('Categorical replacement for more than two dataframes')
        df1, df2 = dfs
        for col in cat_cols:
            s1, s2 = match_if_categorical(df1[col], df2[col])
            df1 = df1.with_columns(s1)
            df2 = df2.with_columns(s2)
        dfs = [df1, df2]
    return pl.concat(dfs)

In [None]:
# the new category ('f') is appended after the categories of the first dataframe
df1 = pd.DataFrame({'x': ['a', 'b', 'c']}, dtype='category')
df2 = pd.DataFrame({'x': ['f', 'b', 'a']}, dtype='category')
pd.testing.assert_series_equal(
    vertical_concat([df1,df2])['x'],
    pd.Series(['a', 'b', 'c', 'f', 'b', 'a'], name='x', dtype=pd.CategoricalDtype(categories=['a', 'b', 'c', 'f']))
)

In [None]:
#| polars
# same check for polars: values, physical codes and category order
df1 = pl.DataFrame({'x': ['a', 'b', 'c']}, schema={'x': pl.Categorical})
df2 = pl.DataFrame({'x': ['f', 'b', 'a']}, schema={'x': pl.Categorical})
out = vertical_concat([df1,df2])['x']
assert out.equals(pl.Series('x', ['a', 'b', 'c', 'f', 'b', 'a']))
assert out.to_physical().equals(pl.Series('x', [0, 1, 2, 3, 1, 0]))
assert out.cat.get_categories().equals(
    pl.Series('x', ['a', 'b', 'c', 'f'])
)

In [None]:
# concatenating a dataframe with itself doubles the number of rows
for engine in engines:
    series = generate_series(2, engine=engine)
    doubled = vertical_concat([series, series])
    assert doubled.shape[0] == 2 * series.shape[0]

In [None]:
#| export
def horizontal_concat(dfs: List[DataFrame]) -> DataFrame:
    """Place dataframes side by side.

    Parameters
    ----------
    dfs : list of pandas or polars DataFrame
        Dataframes to concatenate. The type of the first one selects the engine.

    Returns
    -------
    pandas or polars DataFrame
        Dataframe with the columns of all the inputs.

    Raises
    ------
    ValueError
        If `dfs` is empty or its first element isn't a pandas or polars dataframe.
    """
    if not dfs:
        raise ValueError("Can't concatenate empty list.")
    first = dfs[0]
    if isinstance(first, pd.DataFrame):
        return pd.concat(dfs, axis=1)
    if isinstance(first, pl_DataFrame):
        return pl.concat(dfs, how='horizontal')
    raise ValueError(f'Got list of unexpected types: {type(dfs[0])}.')

In [None]:
# concatenating with a renamed copy doubles the number of columns
for engine in engines:
    series = generate_series(2, engine=engine)
    renamer = {c: f'{c}_2' for c in series.columns}
    if engine == 'pandas':
        series2 = series.rename(columns=renamer)
    else:
        series2 = series.rename(renamer)
    doubled = horizontal_concat([series, series2])
    assert doubled.shape[1] == 2 * series.shape[1]

In [None]:
#| export
def copy_if_pandas(df: DataFrame, deep: bool = False) -> DataFrame:
    """Copy pandas dataframes (shallow by default), return anything else as is."""
    return df.copy(deep=deep) if isinstance(df, pd.DataFrame) else df

In [None]:
#| export
def join(
    df1: Union[DataFrame, Series],
    df2: Union[DataFrame, Series],
    on: Union[str, List[str]],
    how: str = 'inner'
) -> DataFrame:
    """Join two dataframes or series on the given column(s).

    Parameters
    ----------
    df1 : pandas or polars DataFrame or Series
        Left side of the join. Series are converted to dataframes first.
    df2 : pandas or polars DataFrame or Series
        Right side of the join. Series are converted to dataframes first.
    on : str or list of str
        Name(s) of the column(s) to join on.
    how : str (default='inner')
        Type of join, forwarded to `merge` (pandas) or `join` (polars).

    Returns
    -------
    pandas or polars DataFrame
        Joined dataframe.
    """
    series_types = (pd.Series, pl_Series)
    left = df1.to_frame() if isinstance(df1, series_types) else df1
    right = df2.to_frame() if isinstance(df2, series_types) else df2
    if isinstance(left, pd.DataFrame):
        return left.merge(right, on=on, how=how)
    return left.join(right, on=on, how=how)  # type: ignore

In [None]:
#| export
def drop_index_if_pandas(df: DataFrame) -> DataFrame:
    """Replace the index of pandas dataframes with a fresh integer one, return anything else as is."""
    if not isinstance(df, pd.DataFrame):
        return df
    return df.reset_index(drop=True)

In [None]:
#| export
def rename(df: DataFrame, mapping: Dict[str, str]) -> DataFrame:
    """Rename the columns of a dataframe.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.
    mapping : dict
        Mapping from current column names to new column names.

    Returns
    -------
    pandas or polars DataFrame
        Dataframe with the renamed columns.
    """
    if isinstance(df, pd.DataFrame):
        return df.rename(columns=mapping, copy=False)
    return df.rename(mapping)

In [None]:
#| export
def sort(df: DataFrame, by: Optional[Union[str, List[str]]] = None) -> DataFrame:
    """Sort a dataframe, series or index in ascending order.

    Parameters
    ----------
    df : pandas or polars DataFrame or Series, or pandas Index
        Object to sort.
    by : str or list of str, optional (default=None)
        Column(s) to sort by. Only used for dataframes.

    Returns
    -------
    Sorted object. Pandas dataframes and series get a fresh integer index.
    """
    if isinstance(df, pd.DataFrame):
        return df.sort_values(by).reset_index(drop=True)
    if isinstance(df, pd.Series):
        return df.sort_values().reset_index(drop=True)
    if isinstance(df, pd.Index):
        return df.sort_values()
    if isinstance(df, pl_DataFrame):
        return df.sort(by)
    return df.sort()

In [None]:
# sort works on pandas dataframes (by a name or a list of names), series and indexes
pd.testing.assert_frame_equal(
    sort(pd.DataFrame({'x': [3, 1, 2]}), 'x'),
    pd.DataFrame({'x': [1, 2, 3]})
)
pd.testing.assert_frame_equal(
    sort(pd.DataFrame({'x': [3, 1, 2]}), ['x']),
    pd.DataFrame({'x': [1, 2, 3]})
)
pd.testing.assert_series_equal(
    sort(pd.Series([3, 1, 2])),
    pd.Series([1, 2, 3])
)
pd.testing.assert_index_equal(
    sort(pd.Index([3, 1, 2])),
    pd.Index([1, 2, 3])
)

In [None]:
#| polars
# sort works on polars dataframes (by a name or a list of names) and series
pl.testing.assert_frame_equal(
    sort(pl.DataFrame({'x': [3, 1, 2]}), 'x'),
    pl.DataFrame({'x': [1, 2, 3]}),
)
pl.testing.assert_frame_equal(
    sort(pl.DataFrame({'x': [3, 1, 2]}), ['x']),
    pl.DataFrame({'x': [1, 2, 3]}),
)
pl.testing.assert_series_equal(
    sort(pl.Series('x', [3, 1, 2])),
    pl.Series('x', [1, 2, 3])
)

In [None]:
#| exporti
def _multiply_pl_freq(freq: str, n: Union[int, Series]) -> str:
    freq_n, freq_offset = re.findall(r'(\d+)(\w+)', freq)[0]
    freq_n = int(freq_n)
    if isinstance(n, int):
        total_n = freq_n * n
        out = f'{total_n}{freq_offset}'
    else:
        try:
            is_int = n.dtype.is_integer()
        except AttributeError:
            is_int = n.is_integer()
        if not is_int:
            raise ValueError('`n` must be an integer or a polars series of integers.')
        out = (n * freq_n).cast(pl.Utf8) + freq_offset
    return out

In [None]:
#| polars
# integer multipliers return a string, series multipliers return a series of strings
test_eq(_multiply_pl_freq('1d', 4), '4d')
test_eq(_multiply_pl_freq('2d', 4), '8d')
pl.testing.assert_series_equal(
    _multiply_pl_freq('1d', pl_Series([1, 2])),
    pl_Series(['1d', '2d']),
)
pl.testing.assert_series_equal(
    _multiply_pl_freq('4m', pl_Series([2, 4])),
    pl_Series(['8m', '16m']),
)

In [None]:
#| exporti
def _ensure_month_ends(
    times: pl_Series,
    orig_times: pl_Series,
    freq: Union[str, int, BaseOffset]
) -> pl_Series:
    if not isinstance(freq, str) or 'mo' not in freq:
        return times
    next_days = orig_times.dt.offset_by('1d')
    month_ends = (next_days.dt.month() != orig_times.dt.month()).all()
    if month_ends:
        times = times.dt.month_end()
    return times

In [None]:
#| export
def offset_times(
    times: Union[Series, pd.Index],
    freq: Union[int, str, BaseOffset],
    n: Union[int, np.ndarray],
) -> Union[Series, pd.Index]:
    """Shift times by `n` periods of the given frequency.

    Parameters
    ----------
    times : pandas Series, pandas Index or polars Series
        Times to shift.
    freq : int, str or pandas offset
        Frequency of the times. Pandas accepts integers (for integer times) and
        offsets or their aliases (for datetimes), polars accepts integers and strings.
    n : int or array of ints
        Number of periods to shift by.

    Returns
    -------
    pandas Series, pandas Index or polars Series
        Shifted times.

    Raises
    ------
    ValueError
        If the type of `times` and the type of `freq` can't be combined.
    """
    if isinstance(times, (pd.Series, pd.Index)):
        if isinstance(freq, str):
            freq = pd.tseries.frequencies.to_offset(freq)
        int_times_int_freq = _is_int_dtype(times) and isinstance(freq, int)
        dt_times_offset_freq = _is_dt_dtype(times) and isinstance(freq, BaseOffset)
        if not (int_times_int_freq or dt_times_offset_freq):
            raise ValueError(
                f"Cannot offset times with data type: '{times.dtype}' "
                f"using a frequency of type: '{type(freq)}'."
            )
        return times + n * freq
    if isinstance(times, pl_Series):
        if isinstance(freq, int):
            return times + n * freq
        if isinstance(freq, str):
            shifted = times.dt.offset_by(_multiply_pl_freq(freq, n))
            return _ensure_month_ends(shifted, times, freq)
    raise ValueError(
        f"Cannot offset times of type: '{type(times)}' "
        f"using a frequency of type: '{type(freq)}'."
    )

In [None]:
# month end and month begin offsets keep the dates on the month boundaries
pd.testing.assert_index_equal(
    offset_times(pd.to_datetime(['2020-01-31', '2020-02-29', '2020-03-31']), pd.offsets.MonthEnd(), 1),
    pd.Index(pd.to_datetime(['2020-02-29', '2020-03-31', '2020-04-30'])),
)
pd.testing.assert_index_equal(
    offset_times(pd.to_datetime(['2020-01-01', '2020-02-01', '2020-03-01']), pd.offsets.MonthBegin(), 1),
    pd.Index(pd.to_datetime(['2020-02-01', '2020-03-01', '2020-04-01'])),
)

In [None]:
#| polars
# results are only moved to month ends when all the inputs are month ends (2020-02-28 isn't)
pl.testing.assert_series_equal(
    offset_times(pl_Series([dt(2020, 1, 31), dt(2020, 2, 28), dt(2020, 3, 31)]), '1mo', 1),
    pl_Series([dt(2020, 2, 29), dt(2020, 3, 28), dt(2020, 4, 30)]),
)
pl.testing.assert_series_equal(
    offset_times(pl_Series([dt(2020, 1, 31), dt(2020, 2, 29), dt(2020, 3, 31)]), '1mo', 1),
    pl_Series([dt(2020, 2, 29), dt(2020, 3, 31), dt(2020, 4, 30)]),
)

In [None]:
#| export
def offset_dates(
    dates: Union[Series, pd.Index],
    freq: Union[int, str, BaseOffset],
    n: Union[int, Series],
) -> Union[Series, pd.Index]:
    """Deprecated alias of `offset_times`.

    Emits a `DeprecationWarning` and forwards all the arguments to `offset_times`.
    """
    warnings.warn(
        "`offset_dates` has been renamed to `offset_times`",
        category=DeprecationWarning,
        # attribute the warning to the caller instead of to this function
        stacklevel=2,
    )
    return offset_times(dates, freq, n)

In [None]:
#| export
def time_ranges(
    starts: Union[Series, pd.Index],
    freq: Union[int, str, BaseOffset],
    periods: int,
) -> Series:
    """Generate `periods` consecutive times for each start.

    Parameters
    ----------
    starts : pandas Series, pandas Index or polars Series
        First time of each range. Must be integers or timestamps.
    freq : int, str or pandas offset
        Step between consecutive times.
    periods : int
        Number of times to generate for each start.

    Returns
    -------
    pandas or polars Series
        All the ranges in a single series, where the times that belong
        to the same start are contiguous.

    Raises
    ------
    ValueError
        If `starts` is a pandas object that holds neither integers nor timestamps.
    """
    if isinstance(starts, pd.Series):
        starts = pd.Index(starts)
    if not isinstance(starts, pd.Index):
        # fall back to the series method when the dtype doesn't provide `is_integer`
        try:
            is_int = starts.dtype.is_integer()
        except AttributeError:
            is_int = starts.is_integer()
        if is_int:
            ends = starts + freq * periods
            ranges = pl.int_ranges(starts, ends, freq, eager=True).explode()
        else:
            ends = offset_times(starts, freq, periods - 1)
            ranges_fn = pl.date_ranges if starts.dtype == pl.Date else pl.datetime_ranges
            ranges = ranges_fn(starts, ends, interval=freq, eager=True).explode()
            ranges = _ensure_month_ends(ranges, starts, freq)
        return ranges.alias(starts.name)
    if _is_int_dtype(starts):
        starts_np = starts.to_numpy(copy=False)  # may be pyarrow
        values = np.hstack(
            [
                np.arange(start, start + freq * periods, freq, dtype=starts_np.dtype)
                for start in starts_np
            ]
        )
    elif _is_dt_dtype(starts):
        if isinstance(freq, str):
            freq = pd.tseries.frequencies.to_offset(freq)
        # one row per step, each with the times of all the starts at that step
        steps = [[starts + i * freq] for i in range(periods)]
        # pyarrow timestamps don't seem to work with offsets yet, keeping np.vstack
        # the column-major flattening groups the times by start
        values = np.vstack(steps).ravel(order='F')
    else:
        raise ValueError(f"`starts` must be integers or timestamps, got '{starts.dtype}'.")
    return pd.Series(values, dtype=starts.dtype)

In [None]:
# time_ranges with pandas inputs: string aliases, offset objects, timezones and integers
# datetimes
dates = pd.to_datetime(['2000-01-01', '2010-10-10'])
pd.testing.assert_series_equal(
    time_ranges(dates, freq='D', periods=3),
    pd.Series(pd.to_datetime(['2000-01-01', '2000-01-02', '2000-01-03', '2010-10-10', '2010-10-11', '2010-10-12']))
)
pd.testing.assert_series_equal(
    time_ranges(dates, freq='2D', periods=3),
    pd.Series(pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-05', '2010-10-10', '2010-10-12', '2010-10-14']))
)
pd.testing.assert_series_equal(
    time_ranges(dates, freq='4D', periods=3),
    pd.Series(pd.to_datetime(['2000-01-01', '2000-01-05', '2000-01-09', '2010-10-10', '2010-10-14', '2010-10-18']))
)
pd.testing.assert_series_equal(
    time_ranges(pd.to_datetime(['2000-01-01', '2010-10-01']), freq=2 * pd.offsets.MonthBegin(), periods=2),
    pd.Series(pd.to_datetime(['2000-01-01', '2000-03-01', '2010-10-01', '2010-12-01']))
)
pd.testing.assert_series_equal(
    time_ranges(pd.to_datetime(['2000-01-01', '2010-01-01']).tz_localize('US/Eastern'), freq=2 * pd.offsets.YearBegin(), periods=2),
    pd.Series(pd.to_datetime(['2000-01-01', '2002-01-01', '2010-01-01', '2012-01-01']).tz_localize('US/Eastern'))
)
pd.testing.assert_series_equal(
    time_ranges(pd.to_datetime(['2000-12-31', '2010-12-31']), freq=2 * pd.offsets.YearEnd(), periods=2),
    pd.Series(pd.to_datetime(['2000-12-31', '2002-12-31', '2010-12-31', '2012-12-31']))
)
# ints
dates = pd.Series([1, 10])
pd.testing.assert_series_equal(
    time_ranges(dates, freq=1, periods=3),
    pd.Series([1, 2, 3, 10, 11, 12])
)
pd.testing.assert_series_equal(
    time_ranges(dates, freq=2, periods=3),
    pd.Series([1, 3, 5, 10, 12, 14])
)
pd.testing.assert_series_equal(
    time_ranges(dates, freq=4, periods=3),
    pd.Series([1, 5, 9, 10, 14, 18])
)

In [None]:
#| polars
# time_ranges with polars inputs: datetimes (including month ends), dates and integers
# datetimes
dates = pl.Series([dt(2000, 1, 1), dt(2010, 10, 10)])
pl.testing.assert_series_equal(
    time_ranges(dates, freq='1d', periods=3),
    pl.Series([dt(2000, 1, 1), dt(2000, 1, 2), dt(2000, 1, 3), dt(2010, 10, 10), dt(2010, 10, 11), dt(2010, 10, 12)])
)
pl.testing.assert_series_equal(
    time_ranges(dates, freq='2d', periods=3),
    pl.Series([dt(2000, 1, 1), dt(2000, 1, 3), dt(2000, 1, 5), dt(2010, 10, 10), dt(2010, 10, 12), dt(2010, 10, 14)])
)
pl.testing.assert_series_equal(
    time_ranges(dates, freq='4d', periods=3),
    pl.Series([dt(2000, 1, 1), dt(2000, 1, 5), dt(2000, 1, 9), dt(2010, 10, 10), dt(2010, 10, 14), dt(2010, 10, 18)])
)
pl.testing.assert_series_equal(
    time_ranges(pl.Series([dt(2010, 2, 28), dt(2000, 1, 31)]), '1mo', 3),
    pl.Series([dt(2010, 2, 28), dt(2010, 3, 31), dt(2010, 4, 30), dt(2000, 1, 31), dt(2000, 2, 29), dt(2000, 3, 31)])
)
# dates
dates = pl.Series([datetime.date(2000, 1, 1), datetime.date(2010, 10, 10)])
pl.testing.assert_series_equal(
    time_ranges(dates, freq='1d', periods=2),
    pl.Series([
        datetime.date(2000, 1, 1), datetime.date(2000, 1, 2),
        datetime.date(2010, 10, 10), datetime.date(2010, 10, 11),
    ])
)
# ints
dates = pl.Series([1, 10])
pl.testing.assert_series_equal(
    time_ranges(dates, freq=1, periods=3),
    pl.Series([1, 2, 3, 10, 11, 12]),
)
pl.testing.assert_series_equal(
    time_ranges(dates, freq=2, periods=3),
    pl.Series([1, 3, 5, 10, 12, 14]),
)
pl.testing.assert_series_equal(
    time_ranges(dates, freq=4, periods=3),
    pl.Series([1, 5, 9, 10, 14, 18]),
)

In [None]:
#| export
def repeat(
    s: Union[Series, pd.Index, np.ndarray],
    n: Union[int, np.ndarray, Series]
) -> Union[Series, pd.Index, np.ndarray]:
    """Repeat each element of `s` `n` times, keeping equal elements together.

    Parameters
    ----------
    s : pandas Series, pandas Index, numpy array or polars Series
        Values to repeat.
    n : int, numpy array or series
        Number of repetitions, either a single value or one per element.

    Returns
    -------
    Same type as `s`
        Repeated values. Pandas series get a fresh integer index
        and polars series keep their name.
    """
    if not isinstance(s, pl_Series):
        repeated = np.repeat(s, n)
        if isinstance(repeated, pd.Series):
            repeated = repeated.reset_index(drop=True)
        return repeated
    if isinstance(n, np.ndarray):
        n = pl_Series(n)
    # build one list per element and flatten them afterwards
    lists = pl.DataFrame(s.alias('x')).select(pl.col('x').repeat_by(n))['x']
    return lists.explode().alias(s.name)

In [None]:
# repeat with pandas indexes and series and with numpy arrays, using scalar and per-element counts
pd.testing.assert_index_equal(
    repeat(pd.CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c']), 2),
    pd.CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'c'], categories=['a', 'b', 'c'])
)
pd.testing.assert_series_equal(
    repeat(pd.Series([1, 2]), 2),
    pd.Series([1, 1, 2, 2])
)
pd.testing.assert_series_equal(
    repeat(pd.Series([1, 2]), pd.Series([2, 3])),
    pd.Series([1, 1, 2, 2, 2]),
)
np.testing.assert_array_equal(
    repeat(np.array([np.datetime64('2000-01-01'), np.datetime64('2010-10-10')]), 2),
    np.array([
        np.datetime64('2000-01-01'), np.datetime64('2000-01-01'),
        np.datetime64('2010-10-10'), np.datetime64('2010-10-10')
    ])
)
np.testing.assert_array_equal(
    repeat(np.array([1, 2]), np.array([2, 3])),
    np.array([1, 1, 2, 2, 2]),
)

In [None]:
#| polars
# repeat with polars series: categoricals, scalar counts and per-element counts
s = pl.Series(['a', 'b', 'c'], dtype=pl.Categorical)
pl.testing.assert_series_equal(
    repeat(s, 2),
    pl.concat([s, s]).sort()
)
pl.testing.assert_series_equal(
    repeat(pl.Series([2, 4]), 2),
    pl.Series([2, 2, 4, 4])
)
pl.testing.assert_series_equal(
    repeat(pl.Series([1, 2]), np.array([2, 3])),
    pl.Series([1, 1, 2, 2, 2]),
)

In [None]:
#| export
def cv_times(
    times: np.ndarray,
    uids: Union[Series, pd.Index],
    indptr: np.ndarray,
    h: int,
    test_size: int,
    step_size: int,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
) -> DataFrame:
    """Compute the ids, times and cutoffs of each cross validation window.

    Parameters
    ----------
    times : numpy array
        Times of all the series.
    uids : pandas Series, pandas Index or polars Series
        Identifier of each series.
    indptr : numpy array
        Positions in `times`, where consecutive entries mark the start
        and the end of each series.
    h : int
        Number of times in each window.
    test_size : int
        Number of times at the end of each series covered by all the windows.
    step_size : int
        Number of times between the starts of consecutive windows.
    id_col : str (default='unique_id')
        Name of the id column in the result.
    time_col : str (default='ds')
        Name of the time column in the result.

    Returns
    -------
    pandas or polars DataFrame
        Dataframe with the id, time and 'cutoff' (last time before the window)
        of every row of every window. Series that are too short for a window
        are left out of that window.

    Raises
    ------
    ValueError
        If `test_size` is less than `h`, `step_size` isn't positive
        or `test_size - h` isn't a multiple of `step_size`.
    """
    if test_size < h:
        raise ValueError('`test_size` should be greater or equal to `h`.')
    if step_size < 1:
        # otherwise this fails with a ZeroDivisionError or produces no windows
        raise ValueError('`step_size` should be a positive integer.')
    n, resid = divmod(test_size - h, step_size)
    if resid != 0:
        raise ValueError('`test_size - h` should be a multiple `step_size`')
    n_windows = n + 1
    if isinstance(uids, pl_Series):
        df_constructor = pl_DataFrame
    else:
        df_constructor = pd.DataFrame
    sizes = np.diff(indptr)
    out_times = []
    out_cutoffs = []
    out_ids = []
    for i in range(n_windows):
        # distance from the end of the series to the cutoff of this window
        offset = test_size - i * step_size + 1
        # only the series that are long enough to have a cutoff take part in the window
        use_series = sizes >= offset
        cutoff_idxs = indptr[1:][use_series] - offset
        # positions of the h times that follow each cutoff
        valid_idxs = np.repeat(cutoff_idxs + 1, h) + np.tile(np.arange(h), cutoff_idxs.size)
        out_times.append(times[valid_idxs])
        out_cutoffs.append(np.repeat(times[cutoff_idxs], h))
        if isinstance(uids, pl_Series):
            use_series = pl_Series(use_series)
        out_ids.append(repeat(filter_with_mask(uids, use_series), h))
    return df_constructor(
        {
            id_col: vertical_concat(out_ids),
            time_col: np.hstack(out_times),
            'cutoff': np.hstack(out_cutoffs)
        }
    )

In [None]:
# cv_times on a single series with 51 integer times
# step_size=1
times = np.arange(51, dtype=np.int64)
uids = pd.Series(['id_0'])
indptr = np.array([0, 51])
h = 3
test_size = 5
actual = cv_times(
    times=times,
    uids=uids,
    indptr=indptr,
    h=h,
    test_size=test_size,
    step_size=1,
)
expected = pd.DataFrame({
    'unique_id': 9 * ['id_0'],
    'ds': np.hstack([
        [46, 47, 48],
        [47, 48, 49],
        [48, 49, 50]
    ], dtype=np.int64),
    'cutoff': np.repeat(np.array([45, 46, 47], dtype=np.int64), h),
})
pd.testing.assert_frame_equal(actual, expected)

# step_size=2
actual = cv_times(
    times=times,
    uids=uids,
    indptr=indptr,
    h=h,
    test_size=test_size,
    step_size=2,
)
expected = pd.DataFrame({
    'unique_id': 6 * ['id_0'],
    'ds': np.hstack([
        [46, 47, 48],
        [48, 49, 50]
    ], dtype=np.int64),
    'cutoff': np.repeat(np.array([45, 47], dtype=np.int64), h)
})
pd.testing.assert_frame_equal(actual, expected)

In [None]:
#| export
def group_by(df: Union[Series, DataFrame], by, maintain_order=False):
    """Group a dataframe or series by the given key(s).

    Parameters
    ----------
    df : pandas or polars DataFrame or Series
        Object to group. Polars series are converted to dataframes first.
    by
        Key(s) to group by.
    maintain_order : bool (default=False)
        Keep the groups in their order of appearance.
        For pandas this disables the sorting of the group keys.

    Returns
    -------
    Group by object of the corresponding library.
    """
    if isinstance(df, (pd.Series, pd.DataFrame)):
        return df.groupby(by, observed=True, sort=not maintain_order)
    frame = df.to_frame() if isinstance(df, pl_Series) else df
    # fall back to `groupby` when `group_by` isn't available
    try:
        return frame.group_by(by, maintain_order=maintain_order)
    except AttributeError:
        return frame.groupby(by, maintain_order=maintain_order)

In [None]:
#| export
def group_by_agg(df: DataFrame, by, aggs, maintain_order=False) -> DataFrame:
    """Group a dataframe and aggregate its columns.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Dataframe to aggregate.
    by
        Key(s) to group by.
    aggs : dict
        Mapping from column name to the name of the aggregation to apply to it.
    maintain_order : bool (default=False)
        Keep the groups in their order of appearance.

    Returns
    -------
    pandas or polars DataFrame
        Dataframe with the group keys and the aggregated columns.
    """
    grouped = group_by(df, by, maintain_order)
    if isinstance(df, pd.DataFrame):
        return grouped.agg(aggs).reset_index()
    # build one expression per column by calling the aggregation by its name
    exprs = [getattr(pl.col(col), agg_name)() for col, agg_name in aggs.items()]
    return grouped.agg(*exprs)

In [None]:
# sum of y within each value of x (pandas)
pd.testing.assert_frame_equal(
    group_by_agg(pd.DataFrame({'x': [1, 1, 2], 'y': [1, 1, 1]}), 'x', {'y': 'sum'}),
    pd.DataFrame({'x': [1, 2], 'y': [2, 1]})
)

In [None]:
#| polars
# sum of y within each value of x (polars), compared through pandas
pd.testing.assert_frame_equal(
    group_by_agg(pl.DataFrame({'x': [1, 1, 2], 'y': [1, 1, 1]}), 'x', {'y': 'sum'}, maintain_order=True).to_pandas(),
    pd.DataFrame({'x': [1, 2], 'y': [2, 1]})
)

In [None]:
#| export
def is_in(s: Series, collection) -> Series:
    """Build a boolean mask of the elements of `s` that are contained in `collection`."""
    if not isinstance(s, pl_Series):
        return s.isin(collection)
    return s.is_in(collection)

In [None]:
# only the first element is in the collection (pandas)
np.testing.assert_equal(is_in(pd.Series([1, 2, 3]), [1]), np.array([True, False, False]))

In [None]:
#| polars
# only the first element is in the collection (polars)
np.testing.assert_equal(is_in(pl.Series([1, 2, 3]), [1]), np.array([True, False, False]))

In [None]:
#| export
def between(s: Series, lower: Series, upper: Series) -> Series:
    """Build a boolean mask of the elements of `s` that fall between `lower` and `upper`.

    Pandas series use `between`, anything else uses `is_between`.
    """
    if isinstance(s, pd.Series):
        return s.between(lower, upper)
    return s.is_between(lower, upper)

In [None]:
# element-wise bounds: only the first element lies within its bounds (pandas)
np.testing.assert_equal(
    between(pd.Series([1, 2, 3]), pd.Series([0, 1, 4]), pd.Series([4, 1, 2])),
    np.array([True, False, False]),
)

In [None]:
#| polars
# element-wise bounds: only the first element lies within its bounds (polars)
np.testing.assert_equal(
    between(pl.Series([1, 2, 3]), pl.Series([0, 1, 4]), pl.Series([4, 1, 2])),
    np.array([True, False, False]),
)

In [None]:
#| export
def fill_null(df: DataFrame, mapping: Dict[str, Any]) -> DataFrame:
    """Fill the missing values of a dataframe with a different value for each column.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.
    mapping : dict
        Mapping from column name to the value that replaces its missing entries.

    Returns
    -------
    pandas or polars DataFrame
        Dataframe with the filled columns.
    """
    if isinstance(df, pd.DataFrame):
        return df.fillna(mapping)
    fill_exprs = [pl.col(name).fill_null(value) for name, value in mapping.items()]
    return df.with_columns(*fill_exprs)

In [None]:
# each column is filled with its own value (pandas)
pd.testing.assert_frame_equal(
    fill_null(pd.DataFrame({'x': [1, np.nan], 'y': [np.nan, 2]}), {'x': 2, 'y': 1}),
    pd.DataFrame({'x': [1, 2], 'y': [1, 2]}, dtype='float64')
)

In [None]:
#| polars
# each column is filled with its own value (polars)
pl.testing.assert_frame_equal(
    fill_null(pl.DataFrame({'x': [1, None], 'y': [None, 2]}), {'x': 2, 'y': 1}),
    pl.DataFrame({'x': [1, 2], 'y': [1, 2]})
)

In [None]:
#| export
def cast(s: Series, dtype: type) -> Series:
    """Convert a series to the given data type.

    Pandas series use `astype`, anything else uses `cast`.
    """
    if isinstance(s, pd.Series):
        return s.astype(dtype)
    return s.cast(dtype)

In [None]:
# cast to a smaller integer type (pandas)
pd.testing.assert_series_equal(
    cast(pd.Series([1, 2, 3]), 'int16'),
    pd.Series([1, 2, 3], dtype='int16')
)

In [None]:
#| polars
# cast to a smaller integer type (polars), compared through pandas
pd.testing.assert_series_equal(
    cast(pl.Series('x', [1, 2, 3]), pl.Int16).to_pandas(),
    pd.Series([1, 2, 3], name='x', dtype='int16')
)

In [None]:
#| export
def value_cols_to_numpy(
    df: DataFrame, id_col: str, time_col: str, target_col: Optional[str]
) -> np.ndarray:
    """Extract the target and the remaining value columns as a float array.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.
    id_col : str
        Name of the id column, which is excluded from the result.
    time_col : str
        Name of the time column, which is excluded from the result.
    target_col : str, optional
        Name of the target column. If provided it's placed first.

    Returns
    -------
    numpy ndarray
        Array with the target (if any) followed by the other columns in their original
        order. Anything that isn't float32 or float64 is converted to float32.
    """
    if target_col is None:
        reserved = [id_col, time_col]
    else:
        reserved = [id_col, time_col, target_col]
    features = [name for name in df.columns if name not in reserved]
    ordered = features if target_col is None else [target_col, *features]
    data = to_numpy(df[ordered])
    is_float = data.dtype in (np.float32, np.float64)
    return data if is_float else data.astype(np.float32)

In [None]:
#| export
def make_future_dataframe(
    uids: Series,
    last_times: Union[Series, pd.Index],
    freq: Union[int, str, BaseOffset],
    h: int,
    id_col: str = 'unique_id',
    time_col: str = 'ds'
) -> DataFrame:
    """Build the dataframe with the ids and times of the next `h` periods.

    Parameters
    ----------
    uids : pandas or polars Serie
        Identifier of each serie.
    last_times : pandas or polars Serie, or pandas Index
        Last time of each serie.
    freq : int, str or pandas offset
        Step between consecutive times.
    h : int
        Number of future periods to generate for each serie.
    id_col : str (default='unique_id')
        Name of the id column in the result.
    time_col : str (default='ds')
        Name of the time column in the result.

    Returns
    -------
    pandas or polars DataFrame
        Dataframe of the same kind as `uids` with each id repeated `h` times
        next to the times that follow its last one.
    """
    first_future_times = offset_times(last_times, freq, 1)
    columns = {
        id_col: repeat(uids, h),
        time_col: time_ranges(first_future_times, freq=freq, periods=h),
    }
    if isinstance(uids, pl_Series):
        return pl_DataFrame(columns)
    return pd.DataFrame(columns)

In [None]:
# each id gets h=2 rows that start on the day after its last timestamp
pd.testing.assert_frame_equal(
    make_future_dataframe(
        pd.Series([1, 2]), pd.to_datetime(['2000-01-01', '2010-10-10']), freq='D', h=2
    ),
    pd.DataFrame({
        'unique_id': [1, 1, 2, 2],
        'ds': pd.to_datetime(['2000-01-02', '2000-01-03', '2010-10-11', '2010-10-12'])
    })
)

In [None]:
#| polars
# polars counterpart, which also exercises custom names for the id and time columns
pl.testing.assert_frame_equal(
    make_future_dataframe(
        pl.Series([1, 2]),
        pl.Series([dt(2000, 1, 1), dt(2010, 10, 10)]),
        freq='1d',
        h=2,
        id_col='uid',
        time_col='dates',
    ),
    pl.DataFrame({
        'uid': [1, 1, 2, 2],
        'dates': [dt(2000, 1, 2), dt(2000, 1, 3), dt(2010, 10, 11), dt(2010, 10, 12)]
    })
)

In [None]:
#| export
def anti_join(df1: DataFrame, df2: DataFrame, on: Union[str, List[str]]) -> DataFrame:
    """Keep the rows of `df1` whose keys don't appear in `df2`.

    Parameters
    ----------
    df1 : pandas or polars DataFrame
        Dataframe to filter.
    df2 : pandas or polars DataFrame
        Dataframe with the keys to exclude. Must be of the same kind as `df1`.
    on : str or list of str
        Name(s) of the key column(s), present in both dataframes.

    Returns
    -------
    pandas or polars DataFrame
        Rows of `df1` without a match in `df2`. Only the columns of `df1`
        are returned and, for pandas, the index is reset.

    Raises
    ------
    ValueError
        If the inputs aren't both pandas or both polars dataframes.
    """
    if isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame):
        key_cols = [on] if isinstance(on, str) else list(on)
        # only the keys of df2 are relevant. Merging against the full frame would
        # leak its other columns (as NaN or with _x/_y suffixes) into the result,
        # and repeated keys would needlessly inflate the intermediate merge
        right_keys = df2[key_cols].drop_duplicates()
        out = df1.merge(right_keys, on=key_cols, how='left', indicator=True)
        out = out[out['_merge'] == 'left_only'].drop(columns='_merge')
        out = out.reset_index(drop=True)
    elif isinstance(df1, pl_DataFrame) and isinstance(df2, pl_DataFrame):
        out = join(df1, df2, on=on, how='anti')
    else:
        raise ValueError(
            'df1 and df2 must be pandas or polars dataframes of the same type. '
            f"Got type(df1): '{type(df1)}', type(df2): '{type(df2)}'"
        )
    return out

In [None]:
# only the row whose key is missing from the second frame survives
pd.testing.assert_frame_equal(
    anti_join(pd.DataFrame({'x': [1, 2]}), pd.DataFrame({'x': [1]}), on='x'),
    pd.DataFrame({'x': [2]})
)
# when every key is matched the result is empty
test_eq(
    anti_join(pd.DataFrame({'x': [1]}), pd.DataFrame({'x': [1]}), on='x').shape[0],
    0,
)

In [None]:
#| polars
# only the row whose key is missing from the second frame survives
pl.testing.assert_frame_equal(
    anti_join(pl_DataFrame({'x': [1, 2]}), pl_DataFrame({'x': [1]}), on='x'),
    pl_DataFrame({'x': [2]})
)
# when every key is matched the result is empty
test_eq(
    anti_join(pl_DataFrame({'x': [1]}), pl_DataFrame({'x': [1]}), on='x').shape[0],
    0,
)

In [None]:
#| export
def ensure_sorted(df: DataFrame, id_col: str, time_col: str) -> DataFrame:
    """Reorder the rows of the dataframe if sort indices are computed for it.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.
    id_col : str
        Name of the id column.
    time_col : str
        Name of the time column.

    Returns
    -------
    pandas or polars DataFrame
        The input itself when `maybe_compute_sort_indices` returns `None`,
        otherwise its rows taken in the order given by those indices.
    """
    order = maybe_compute_sort_indices(df=df, id_col=id_col, time_col=time_col)
    if order is None:
        return df
    return take_rows(df=df, idxs=order)

In [None]:
#| exporti
class _ProcessedDF(NamedTuple):
    """Components extracted from a dataframe by `process_df`."""
    # ids of the series, taken from the id column of the per-id counts
    uids: Series
    # last time of each serie
    times: np.ndarray
    # values of the target (if any) and feature columns, ordered by id and time
    data: np.ndarray
    # cumulative sizes of the series with a leading zero (one more entry than series)
    indptr: np.ndarray
    # indices that sort the original rows, None if no sorting was required
    sort_idxs: Optional[np.ndarray]

In [None]:
#| export
def process_df(
    df: DataFrame,
    id_col: str,
    time_col: str,
    target_col: Optional[str],
) -> _ProcessedDF:
    """Split a dataframe into its ids, last times, values and serie boundaries

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe with id, times and target values.
    id_col : str
        Name of the column that identifies each serie.
    time_col : str
        Name of the column that holds the times.
    target_col : str, optional
        Name of the column that holds the target.
        If `None` all the remaining columns are treated as features.

    Returns
    -------
    uids : pandas or polars Serie
        serie with the sorted unique ids present in the data.
    times : numpy array
        array with the last time for each serie.
    data : numpy ndarray
        2d array with target plus features values.
    indptr : numpy ndarray
        1d array with indices to the start and end of each serie.
    sort_idxs : numpy array or None
        array with the indices that would sort the original data.
        If the data is already sorted this is `None`.
    """
    validate_format(df, id_col, time_col, target_col)

    # per-serie sizes define where each serie starts and ends in the sorted data
    counts = counts_by_id(df, id_col)
    series_ids = counts[id_col]
    series_sizes = counts['counts'].to_numpy()
    boundaries = np.append(0, series_sizes.cumsum()).astype(np.int32)
    end_positions = boundaries[1:] - 1

    values = value_cols_to_numpy(df, id_col, time_col, target_col)

    order = maybe_compute_sort_indices(df, id_col, time_col)
    if order is None:
        last_rows = end_positions
    else:
        values = values[order]
        # translate positions in the sorted data to rows of the original frame
        last_rows = order[end_positions]
    last_times = df[time_col].to_numpy()[last_rows]
    return _ProcessedDF(series_ids, last_times, values, boundaries, order)

In [None]:
#| hide
# cv_times is expected to return one row per serie, window and horizon step,
# where with step_size=1 there are test_size - horizon + 1 windows
horizon = 3
test_size = 5
for equal_ends in [True, False]:
    n_series = 2
    series = generate_series(n_series, equal_ends=equal_ends)
    freq = pd.tseries.frequencies.to_offset('D')
    uids, last_times, data, indptr, _ = process_df(series, 'unique_id', 'ds', 'y')
    times = series['ds'].to_numpy()
    df_dates = cv_times(
        times=times,
        uids=uids,
        indptr=indptr,
        h=horizon,
        test_size=test_size,
        step_size=1
    )
    test_eq(len(df_dates), n_series * horizon * (test_size - horizon + 1))

In [None]:
#| export
class DataFrameProcessor:
    """Remembers the column names of a dataset and runs `process_df` with them.

    Parameters
    ----------
    id_col : str (default='unique_id')
        Name of the column that identifies each serie.
    time_col : str (default='ds')
        Name of the column that holds the times.
    target_col : str (default='y')
        Name of the column that holds the target.
    """

    def __init__(
        self,
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        target_col: str = 'y',
    ):
        self.id_col, self.time_col, self.target_col = id_col, time_col, target_col

    def process(
        self,
        df: DataFrame
    ) -> Tuple[Series, np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]]:
        """Apply `process_df` to `df` using the stored column names."""
        return process_df(
            df,
            id_col=self.id_col,
            time_col=self.time_col,
            target_col=self.target_col,
        )

In [None]:
# names of the static feature columns selected by the DataFrameProcessor tests below
static_features = ['static_0', 'static_1']

In [None]:
# process a shuffled copy of the data and compare every component against the original frame
for n_static_features in [0, 2]:
    series_pd = generate_series(1_000, n_static_features=n_static_features, equal_ends=False, engine='pandas')
    for i in range(n_static_features):
        series_pd[f'static_{i}'] = series_pd[f'static_{i}'].map(lambda x: f'x_{x}').astype('category')
    scrambled_series_pd = series_pd.sample(frac=1.0)
    dfp = DataFrameProcessor('unique_id', 'ds', 'y')
    uids, times, data, indptr, _ = dfp.process(scrambled_series_pd)
    test_eq(times, series_pd.groupby('unique_id', observed=True)['ds'].max().values)
    test_eq(uids, np.sort(series_pd['unique_id'].unique()))
    # categorical features are expected to come out as their integer codes
    for i in range(n_static_features):
        series_pd[f'static_{i}'] = series_pd[f'static_{i}'].cat.codes
    test_eq(data, series_pd[['y'] + static_features[:n_static_features]].to_numpy())
    test_eq(np.diff(indptr), series_pd.groupby('unique_id', observed=True).size().values)

In [None]:
#| hide
# test process_df with target_col=None:
# every column other than the id and time ones must end up in the data array
series_pd = generate_series(10, n_static_features=2, equal_ends=False, engine='pandas')
series_pd = series_pd.rename(columns={'y': 'exog_0'})
_, _, data, indptr, _ = process_df(series_pd, 'unique_id', 'ds', None)
np.testing.assert_allclose(
    data,
    to_numpy(series_pd.drop(columns=['unique_id', 'ds'])),
)

In [None]:
#| polars
# polars counterpart: process a shuffled copy and compare against the original frame
for n_static_features in [0, 2]:
    series_pl = generate_series(1_000, n_static_features=n_static_features, equal_ends=False, engine='polars')
    scrambled_series_pl = series_pl.sample(fraction=1.0, shuffle=True)
    dfp = DataFrameProcessor('unique_id', 'ds', 'y')
    uids, times, data, indptr, _ = dfp.process(scrambled_series_pl)
    grouped = group_by(series_pl, 'unique_id')
    test_eq(times, grouped.agg(pl.col('ds').max()).sort('unique_id')['ds'].to_numpy())
    test_eq(uids, series_pl['unique_id'].unique().sort())
    # columns are compared through their physical representation
    test_eq(data, series_pl.select(pl.col(c).map_batches(lambda s: s.to_physical()) for c in ['y'] + static_features[:n_static_features]).to_numpy())
    test_eq(np.diff(indptr), grouped.count().sort('unique_id')['count'].to_numpy())

In [None]:
#| exporti
def _single_split(
    df: DataFrame,
    i_window: int,    
    n_windows: int,
    h: int,
    id_col: str,
    time_col: str,
    freq: Union[int, str, pd.offsets.BaseOffset],
    max_dates: Series,  
    step_size: Optional[int] = None,
    input_size: Optional[int] = None,
) -> Tuple[DataFrame, Series, Series]:
    """Compute the cutoffs and the row masks of a single cross validation window.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.
    i_window : int
        Zero-based position of the window.
    n_windows : int
        Total number of windows.
    h : int
        Number of periods in each validation set.
    id_col : str
        Name of the id column.
    time_col : str
        Name of the time column.
    freq : int, str or pandas offset
        Step between consecutive times, forwarded to `offset_times`.
    max_dates : pandas or polars Serie
        Last time of each serie. It is compared element-wise against
        `df[time_col]`, so it's expected to be aligned with the rows of `df`.
    step_size : int, optional
        Periods between the start of consecutive windows. Defaults to `h`.
    input_size : int, optional
        If provided, only the last `input_size` periods before the cutoff
        are kept in the training mask.

    Returns
    -------
    cutoffs : pandas or polars DataFrame
        Dataframe with the id column and a 'cutoff' column that holds
        the end of the training set of each serie.
    train_mask : pandas or polars Serie
        Boolean mask of the rows of `df` that belong to the training set.
    valid_mask : pandas or polars Serie
        Boolean mask of the rows of `df` that belong to the validation set.

    Raises
    ------
    ValueError
        If none of the series has training rows left for this window.
    """
    if step_size is None:
        step_size = h
    # number of trailing periods covered by all the validation windows together
    test_size = h + step_size * (n_windows - 1)
    # periods between the end of the training set of this window and the last time
    offset = test_size - i_window * step_size
    train_ends = offset_times(max_dates, freq, -offset)
    valid_ends = offset_times(train_ends, freq, h)
    train_mask = df[time_col].le(train_ends)
    valid_mask = df[time_col].gt(train_ends) & df[time_col].le(valid_ends)    
    if input_size is not None:
        # restrict the training set to the times in (train_starts, train_ends]
        train_starts = offset_times(train_ends, freq, -input_size)
        train_mask &= df[time_col].gt(train_starts)
    # count the training rows of each serie, the counts are stored under time_col
    if isinstance(train_mask, pd.Series):
        train_sizes = train_mask.groupby(df[id_col], observed=True, sort=False).sum()
        train_sizes = train_sizes.reset_index()
    else:
        tmp_df = pl.DataFrame({id_col: df[id_col], time_col: train_mask})
        train_sizes = group_by_agg(tmp_df, id_col, {time_col: 'sum'}, maintain_order=True)
    zeros_mask = train_sizes[time_col].eq(0)
    if zeros_mask.all():
        raise ValueError(
            'All series are too short for the cross validation settings, '
            f'at least {offset + 1} samples are required.\n'
            'Please reduce `n_windows` or `h`.'
        )
    elif zeros_mask.any():
        # series without training rows are also removed from the validation set
        ids = filter_with_mask(train_sizes[id_col], zeros_mask)
        warnings.warn(
            'The following series are too short for the window '
            f'and will be dropped: {reprlib.repr(list(ids))}'
        )
        dropped_ids = is_in(df[id_col], ids)
        valid_mask &= ~dropped_ids
    # keep a single train end per serie as its cutoff
    if isinstance(train_ends, pd.Series):
        cutoffs: DataFrame = (
            train_ends
            .set_axis(df[id_col])
            .groupby(id_col, observed=True)
            .head(1)
            .rename("cutoff")
            .reset_index()
        )
    else:
        cutoffs = train_ends.to_frame().with_columns(df[id_col])
        cutoffs = (
            group_by(cutoffs, id_col)
            .agg(pl.col(time_col).head(1))
            .explode(pl.col(time_col))
            .rename({time_col: 'cutoff'})
        )
    return cutoffs, train_mask, valid_mask

In [None]:
#|export
def backtest_splits(
    df: DataFrame,
    n_windows: int,
    h: int,
    id_col: str,
    time_col: str,
    freq: Union[int, str, pd.offsets.BaseOffset],
    step_size: Optional[int] = None,
    input_size: Optional[int] = None,
) -> Generator[Tuple[DataFrame, DataFrame, DataFrame], None, None]:
    """Generate the train and validation sets of each cross validation window.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe.
    n_windows : int
        Number of windows to generate.
    h : int
        Number of periods in each validation set.
    id_col : str
        Name of the id column.
    time_col : str
        Name of the time column.
    freq : int, str or pandas offset
        Step between consecutive times.
    step_size : int, optional
        Periods between the start of consecutive windows. Defaults to `h`.
    input_size : int, optional
        Maximum number of periods to keep in each training set.

    Yields
    ------
    cutoffs : pandas or polars DataFrame
        End of the training set of each serie for the window.
    train : pandas or polars DataFrame
        Rows of `df` selected by the training mask of the window.
    valid : pandas or polars DataFrame
        Rows of `df` selected by the validation mask of the window.
    """
    # last time of each serie broadcast to every row, shared by all the windows
    if isinstance(df, pd.DataFrame):
        last_dates = df.groupby(id_col, observed=True)[time_col].transform('max')
    else:
        last_dates = df.select(pl.col(time_col).max().over(id_col))[time_col]
    split_kwargs = dict(
        n_windows=n_windows,
        h=h,
        id_col=id_col,
        time_col=time_col,
        freq=freq,
        max_dates=last_dates,
        step_size=step_size,
        input_size=input_size,
    )
    for window in range(n_windows):
        cutoffs, in_train, in_valid = _single_split(df, i_window=window, **split_kwargs)
        train = filter_with_mask(df, in_train)
        valid = filter_with_mask(df, in_valid)
        yield cutoffs, train, valid

In [None]:
#| hide
# the call with h=49 is expected to succeed
# (presumably every serie has 50 samples here, verify against generate_series)
short_series = generate_series(100, max_length=50)
backtest_results = list(
    backtest_splits(
        short_series,
        n_windows=1,
        h=49,
        id_col='unique_id',
        time_col='ds',
        freq=pd.offsets.Day(),
    )
)[0]
# with h=50 no serie has training rows left, which must raise
test_fail(
    lambda: list(
        backtest_splits(
            short_series,
            n_windows=1,
            h=50,
            id_col='unique_id',
            time_col='ds',
            freq=pd.offsets.Day(),
        )
    ),
    contains='at least 51 samples are required'
)
# when only some of the series are too short a warning is issued instead
some_short_series = generate_series(100, min_length=20, max_length=100)
with warnings.catch_warnings(record=True) as issued_warnings:
    warnings.simplefilter('always', UserWarning)
    splits = list(
        backtest_splits(
            some_short_series,
            n_windows=1,
            h=50,
            id_col='unique_id',
            time_col='ds',
            freq=pd.offsets.Day(),
        )
    )
    assert any('will be dropped' in str(w.message) for w in issued_warnings)
# integer times with an integer frequency must work as well
short_series_int = short_series.copy()
short_series_int['ds'] = short_series.groupby('unique_id', observed=True).transform('cumcount')
backtest_int_results = list(
    backtest_splits(
        short_series_int,
        n_windows=1,
        h=40,
        id_col='unique_id',
        time_col='ds',
        freq=1
    )
)[0]

In [None]:
#| hide
def test_backtest_splits(df, n_windows, h, step_size, input_size):
    """Check the windows produced by `backtest_splits` on daily pandas data.

    For every window this verifies the first and last dates of the train and
    validation sets of each serie, the reported cutoffs, and that splitting a
    shuffled copy of `df` selects the same rows.

    Parameters
    ----------
    df : pandas DataFrame
        Series with 'unique_id' and 'ds' columns, sorted by both.
    n_windows : int
        Number of windows to generate.
    h : int
        Number of periods in each validation set.
    step_size : int, optional
        Periods between consecutive windows. Defaults to `h`.
    input_size : int, optional
        Maximum number of periods in each training set.
    """
    max_dates = df.groupby('unique_id', observed=True)['ds'].max()
    day_offset = pd.offsets.Day()
    common_kwargs = dict(
        n_windows=n_windows,
        h=h,
        id_col='unique_id',
        time_col='ds',
        freq=pd.offsets.Day(),
        step_size=step_size,
        input_size=input_size,
    )
    permuted_df = df.sample(frac=1.0)
    splits = backtest_splits(df, **common_kwargs)
    splits_on_permuted = list(backtest_splits(permuted_df, **common_kwargs))
    if step_size is None:
        step_size = h
    test_size = h + step_size * (n_windows - 1)
    for window, (cutoffs, train, valid) in enumerate(splits):
        offset = test_size - window * step_size
        expected_max_train_dates = max_dates - day_offset * offset
        max_train_dates = train.groupby('unique_id', observed=True)['ds'].max()
        pd.testing.assert_series_equal(max_train_dates, expected_max_train_dates)
        pd.testing.assert_frame_equal(cutoffs, max_train_dates.rename('cutoff').reset_index())

        if input_size is not None:
            expected_min_train_dates = expected_max_train_dates - day_offset * (input_size - 1)
            min_train_dates = train.groupby('unique_id', observed=True)['ds'].min()
            pd.testing.assert_series_equal(min_train_dates, expected_min_train_dates)

        expected_min_valid_dates = expected_max_train_dates + day_offset
        min_valid_dates = valid.groupby('unique_id', observed=True)['ds'].min()
        pd.testing.assert_series_equal(min_valid_dates, expected_min_valid_dates)

        expected_max_valid_dates = expected_max_train_dates + day_offset * h
        max_valid_dates = valid.groupby('unique_id', observed=True)['ds'].max()
        pd.testing.assert_series_equal(max_valid_dates, expected_max_valid_dates)

        # the last window must end exactly at the end of each serie
        if window == n_windows - 1:
            pd.testing.assert_series_equal(max_valid_dates, max_dates)

        # both sets must be independent of the row order, for every window
        _, permuted_train, permuted_valid = splits_on_permuted[window]
        pd.testing.assert_frame_equal(train, permuted_train.sort_values(['unique_id', 'ds']))
        pd.testing.assert_frame_equal(valid, permuted_valid.sort_values(['unique_id', 'ds']))

# run the checks for every combination of step size and input size;
# these settings are reused by the polars cell that follows
n_series = 20
min_length = 100
max_length = 1000
series = generate_series(n_series, freq='D', min_length=min_length, max_length=max_length)

for step_size in (None, 1, 2):
    for input_size in (None, 4):
        test_backtest_splits(series, n_windows=3, h=14, step_size=step_size, input_size=input_size)

In [None]:
#| hide
#| polars
# each validation set must start one period after the end of its training set and span h periods
h = 10
series_pl = generate_series(n_series, freq='D', min_length=min_length, max_length=max_length, engine='polars')
splits = backtest_splits(series_pl, n_windows=3, h=h, id_col='unique_id', time_col='ds', freq='1d')
for cutoffs, train, valid in splits:
    train_ends = train.group_by('unique_id', maintain_order=True).agg(pl.col('ds').max())
    valid_starts = valid.group_by('unique_id', maintain_order=True).agg(pl.col('ds').min())
    valid_ends = valid.group_by('unique_id', maintain_order=True).agg(pl.col('ds').max())
    expected_valid_starts = offset_times(train_ends['ds'], '1d', 1)
    expected_valid_ends = offset_times(train_ends['ds'], '1d', h)
    pl.testing.assert_series_equal(valid_starts['ds'], expected_valid_starts)
    pl.testing.assert_series_equal(valid_ends['ds'], expected_valid_ends)

In [None]:
#| export
def add_insample_levels(
    df: DataFrame,
    models: List[str],
    level: List[Union[int, float]],
    id_col: str = 'unique_id',
    target_col: str = 'y',
) -> DataFrame:
    """Add prediction interval columns built from the errors of each model.

    For every model the standard deviation of its errors (prediction minus
    target) is computed per serie and scaled by the normal quantile of each
    level to get the half width of the interval around the prediction.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Dataframe with the target and the predictions of the models.
    models : list of str
        Names of the columns that hold the predictions.
    level : list of int or float
        Confidence levels of the intervals, expressed as percentages.
    id_col : str (default='unique_id')
        Name of the column that identifies each serie.
    target_col : str (default='y')
        Name of the column that holds the target.

    Returns
    -------
    pandas or polars DataFrame
        Input dataframe with the additional columns '{model}-lo-{level}'
        and '{model}-hi-{level}' for every model and level.
    """
    # local imports: scipy is only needed by this function
    import operator

    from scipy.stats import norm

    df = copy_if_pandas(df, deep=False)
    # two sided interval: a level of 80 uses the 0.9 quantile
    cuts = norm.ppf(0.5 + np.asarray(level) / 200).reshape(1, -1)
    if isinstance(df, pd.DataFrame):
        errors = df[models].sub(df[target_col], axis=0)
        stds = errors.groupby(df[id_col], observed=True).transform('std')
    else:
        exprs = (pl.col(m).sub(pl.col(target_col)).std().over(id_col) for m in models)
        stds = df.select(exprs)
    stds = to_numpy(stds)
    preds = to_numpy(df[models])
    # the bounds are real valued, so a non floating buffer (e.g. when all the
    # predictions are integers) would silently truncate them on assignment
    out_dtype = preds.dtype if preds.dtype.kind == 'f' else np.float64
    vals = np.empty_like(
        preds,
        dtype=out_dtype,
        shape=(preds.shape[0], len(models) * 2 * len(level)),
    )
    cols = []
    k = 0
    for i, model in enumerate(models):
        # one column per level with the half width of the interval
        widths = cuts * stds[:, [i]]
        for side, op in {'lo': operator.sub, 'hi': operator.add}.items():
            for j, lvl in enumerate(level):
                cols.append(f'{model}-{side}-{lvl}')
                vals[:, k] = op(preds[:, i], widths[:, j])
                k += 1
    return assign_columns(df, cols, vals)

In [None]:
#| scipy
# the lower bound must be strictly below the upper bound for every model and level
series = generate_series(100, n_models=2)
models = ['model0', 'model1']
levels = [80, 95]
with_levels = add_insample_levels(series, models, levels)
for model in models:
    for lvl in levels:
        assert with_levels[f'{model}-lo-{lvl}'].lt(with_levels[f'{model}-hi-{lvl}']).all()

In [None]:
#| polars
#| scipy
# the polars result must match the pandas one computed in the previous cell
# (the id column is left out of the comparison, presumably because its dtype differs between engines)
series_pl = generate_series(100, n_models=2, engine='polars')
with_levels_pl = add_insample_levels(series_pl, ['model0', 'model1'], [80, 95])
pd.testing.assert_frame_equal(
    with_levels.drop(columns='unique_id'),
    with_levels_pl.to_pandas().drop(columns='unique_id')
)