In [None]:
#|default_exp core

# Core

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

In [None]:
#|export
import copy
import inspect
import reprlib
import warnings
from collections import Counter, OrderedDict
from contextlib import contextmanager
from pathlib import Path
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Mapping,
    Optional,
    Tuple,
    Union,
)

import cloudpickle
import fsspec
import numpy as np
import pandas as pd
import utilsforecast.processing as ufp
from sklearn.base import BaseEstimator, clone
from sklearn.pipeline import Pipeline
from utilsforecast.compat import (
    DFType,
    DataFrame,
    pl,
    pl_DataFrame,
    pl_Series,
)
from utilsforecast.validation import validate_format, validate_freq

from mlforecast.grouped_array import GroupedArray
from mlforecast.lag_transforms import _BaseLagTransform, Lag
from mlforecast.target_transforms import (
    _BaseGroupedArrayTargetTransform,
    BaseTargetTransform,
)
from mlforecast.utils import _ShortSeriesException, _ensure_shallow_copy

In [None]:
import datetime

import tempfile
from nbdev import show_doc
from fastcore.test import test_eq, test_fail, test_warns
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean
from window_ops.shift import shift_array

from mlforecast.callbacks import SaveFeatures
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences, LocalStandardScaler
from mlforecast.utils import generate_daily_series, generate_prices_for_series

## Data format

The required input format is a dataframe with at least the following columns:
* `unique_id` with a unique identifier for each time series,
* `ds` with the datestamp, and
* `y` with the values of the series.

Every other column is considered a static feature unless stated otherwise through the `static_features` argument of `TimeSeries.fit_transform`.

In [None]:
# 20 synthetic daily series, each one with two static features
series = generate_daily_series(20, n_static_features=2)
series

Unnamed: 0,unique_id,ds,y,static_0,static_1
0,id_00,2000-01-01,7.404529,27,53
1,id_00,2000-01-02,35.952624,27,53
2,id_00,2000-01-03,68.958353,27,53
3,id_00,2000-01-04,84.994505,27,53
4,id_00,2000-01-05,113.219810,27,53
...,...,...,...,...,...
4869,id_19,2000-03-25,400.606807,97,45
4870,id_19,2000-03-26,538.794824,97,45
4871,id_19,2000-03-27,620.202104,97,45
4872,id_19,2000-03-28,20.625426,97,45


For simplicity we'll just take one time series here.

In [None]:
# keep only the rows that belong to the first id
uids = series['unique_id'].unique()
serie = series[series['unique_id'].eq(uids[0])]
serie

Unnamed: 0,unique_id,ds,y,static_0,static_1
0,id_00,2000-01-01,7.404529,27,53
1,id_00,2000-01-02,35.952624,27,53
2,id_00,2000-01-03,68.958353,27,53
3,id_00,2000-01-04,84.994505,27,53
4,id_00,2000-01-05,113.219810,27,53
...,...,...,...,...,...
217,id_00,2000-08-05,13.263188,27,53
218,id_00,2000-08-06,38.231981,27,53
219,id_00,2000-08-07,59.555183,27,53
220,id_00,2000-08-08,86.986368,27,53


In [None]:
#|exporti
# Compact unsigned integer dtype for each supported date feature name.
# `TimeSeries._compute_date_feature` looks the feature up in this mapping and,
# when the computed values are a pandas Index/Series and an entry exists,
# casts them to the dtype listed here.
date_features_dtypes = {
    "year": np.uint16,
    "month": np.uint8,
    "day": np.uint8,
    "hour": np.uint8,
    "minute": np.uint8,
    "second": np.uint8,
    "dayofyear": np.uint16,
    "day_of_year": np.uint16,
    "weekofyear": np.uint8,
    "week": np.uint8,
    "dayofweek": np.uint8,
    "day_of_week": np.uint8,
    "weekday": np.uint8,
    "quarter": np.uint8,
    "daysinmonth": np.uint8,
    # boolean flags are stored as 0/1
    "is_month_start": np.uint8,
    "is_month_end": np.uint8,
    "is_quarter_start": np.uint8,
    "is_quarter_end": np.uint8,
    "is_year_start": np.uint8,
    "is_year_end": np.uint8,
}

In [None]:
#|exporti
def _build_function_transform_name(tfm: Callable, lag: int, *args) -> str:
    """Creates a name for a transformation based on `lag`, the name of the function and its arguments."""
    tfm_name = f"{tfm.__name__}_lag{lag}"
    func_params = inspect.signature(tfm).parameters
    func_args = list(func_params.items())[1:]  # remove input array argument
    changed_params = [
        f"{name}{value}"
        for value, (name, arg) in zip(args, func_args)
        if arg.default != value
    ]
    if changed_params:
        tfm_name += "_" + "_".join(changed_params)
    return tfm_name

In [None]:
#|hide
# without extra arguments the name is just the function name and the lag
test_eq(_build_function_transform_name(expanding_mean, 1), 'expanding_mean_lag1')
# the positional argument 7 shows up in the name next to its parameter name
test_eq(_build_function_transform_name(rolling_mean, 2, 7), 'rolling_mean_lag2_window_size7')

In [None]:
#| exporti
def _build_lag_transform_name(tfm: _BaseLagTransform, lag: int) -> str:
    """Return the feature name that the lag transform `tfm` reports for `lag`."""
    name = tfm._get_name(lag)
    return name

In [None]:
#| hide
# the lag transform classes are expected to yield the same names as the equivalent functions
test_eq(_build_lag_transform_name(ExpandingMean(), 1), 'expanding_mean_lag1')
test_eq(_build_lag_transform_name(RollingMean(7), 2), 'rolling_mean_lag2_window_size7')

In [None]:
#| exporti
def _build_transform_name(
    tfm: Union[Callable, _BaseLagTransform], lag: int, *args
) -> str:
    """Default namer for lag transformations.

    Callables are named from their function name, the lag and `args`;
    anything else is asked for its own name for the lag."""
    if not callable(tfm):
        return _build_lag_transform_name(tfm, lag)
    return _build_function_transform_name(tfm, lag, *args)

In [None]:
#|exporti
def _get_model_name(model) -> str:
    """Class name of `model`. For a `Pipeline` the name of its last step is used."""
    # unwrap (possibly nested) pipelines until reaching the final estimator
    while isinstance(model, Pipeline):
        model = model.steps[-1][1]
    return model.__class__.__name__

def _name_models(current_names):
    ctr = Counter(current_names)
    if not ctr:
        return []
    if max(ctr.values()) < 2:
        return current_names
    names = current_names.copy()
    for i, x in enumerate(reversed(current_names), start=1):
        count = ctr[x]
        if count > 1:
            name = f"{x}{count}"
            ctr[x] -= 1
        else:
            name = x
        names[-i] = name
    return names

In [None]:
#|hide
# one duplicate: the repeated name gets a numeric suffix on its second occurrence
names = ['a', 'b', 'a', 'c']
expected = ['a', 'b', 'a2', 'c']
actual = _name_models(names)
assert actual == expected

# no duplicates: the names come back unchanged
names = ['a', 'b', 'c']
actual = _name_models(names)
assert actual == names

In [None]:
#| exporti
def _as_tuple(x):
    """Return a tuple from the input."""
    if isinstance(x, tuple):
        return x
    return (x,)

In [None]:
#| exporti
# frequency of the series, given as an integer or a string
Freq = Union[int, str]
# lags of the target to use as features
Lags = Iterable[int]
# a function, or a tuple with the function followed by its extra argument
LagTransform = Union[Callable, Tuple[Callable, Any]]
# maps each lag to the transformations that are applied over it
LagTransforms = Dict[int, List[LagTransform]]
# attribute name of the dates or a function that receives the dates
DateFeature = Union[str, Callable]
# a single model, a list of models or a mapping from name to model
Models = Union[BaseEstimator, List[BaseEstimator], Dict[str, BaseEstimator]]
TargetTransform = Union[BaseTargetTransform, _BaseGroupedArrayTargetTransform]
# output of `_parse_transforms`: feature name -> (lag, function, *args) or lag transform object
Transforms = Dict[str, Union[Tuple[Any, ...], _BaseLagTransform]]

In [None]:
#| exporti
def _parse_transforms(
    lags: Lags,
    lag_transforms: LagTransforms,
    namer: Optional[Callable] = None,
) -> Transforms:
    """Build the ordered mapping from feature name to transformation.

    The plain lags come first (named `lag<lag>`), followed by the lag transformations
    in the order they're defined. `namer` produces the names of the lag transformations
    and defaults to `_build_transform_name`. Lag transform objects are cloned and bound
    to their lag; functions are stored as `(lag, function, *args)` tuples."""
    build_name = _build_transform_name if namer is None else namer
    parsed: Transforms = OrderedDict()
    for lag in lags:
        parsed[f'lag{lag}'] = Lag(lag)
    for lag, lag_tfms in lag_transforms.items():
        for spec in lag_tfms:
            if isinstance(spec, _BaseLagTransform):
                feat_name = build_name(spec, lag)
                parsed[feat_name] = clone(spec)._set_core_tfm(lag)
                continue
            # either a bare function or a tuple of (function, *arguments)
            func, *func_args = _as_tuple(spec)
            assert callable(func)
            feat_name = build_name(func, lag, *func_args)
            parsed[feat_name] = (lag, func, *func_args)
    return parsed

In [None]:
#| export
class TimeSeries:
    """Utility class for storing and transforming time series data."""

    def __init__(
        self,
        freq: Freq,
        lags: Optional[Lags] = None,
        lag_transforms: Optional[LagTransforms] = None,
        date_features: Optional[Iterable[DateFeature]] = None,
        num_threads: int = 1,
        target_transforms: Optional[List[TargetTransform]] = None,
        lag_transforms_namer: Optional[Callable] = None,
    ):
        """Validate and store the configuration of the features.

        `freq` is the frequency of the series.
        `lags` are the lags of the target to use as features.
        `lag_transforms` maps each lag to the transformations to apply over it.
        `date_features` are attribute names of the dates or functions (not lambdas,
        because the function name becomes the feature name).
        `num_threads` is the number of threads used to compute the transformations;
        invalid values produce a warning and are replaced by 1.
        `target_transforms` are applied to the target before computing the features.
        `lag_transforms_namer` overrides the function that names the lag transformations.

        Raises a ValueError if a lag or a key of `lag_transforms` isn't a positive
        integer or if a lambda is provided as a date feature."""
        self.freq = freq
        if not isinstance(num_threads, int) or num_threads < 1:
            warnings.warn('Setting num_threads to 1.')
            num_threads = 1
        self.lags = [] if lags is None else list(lags)
        for lag in self.lags:
            # check the type first, otherwise comparing e.g. a string against 0
            # raises a TypeError instead of the intended ValueError
            if not isinstance(lag, int) or lag <= 0:
                raise ValueError('lags must be positive integers.')
        self.lag_transforms = {} if lag_transforms is None else lag_transforms
        for lag in self.lag_transforms.keys():
            if not isinstance(lag, int) or lag <= 0:
                raise ValueError('keys of lag_transforms must be positive integers.')
        self.date_features = [] if date_features is None else list(date_features)
        self.num_threads = num_threads
        self.target_transforms = target_transforms
        if self.target_transforms is not None:
            for tfm in self.target_transforms:
                if isinstance(tfm, _BaseGroupedArrayTargetTransform):
                    tfm.set_num_threads(num_threads)
        for feature in self.date_features:
            if callable(feature) and feature.__name__ == '<lambda>':
                raise ValueError(
                    "Can't use a lambda as a date feature because the function name gets used as the feature name."
                )
        self.lag_transforms_namer = lag_transforms_namer
        self.transforms = _parse_transforms(
            lags=self.lags, lag_transforms=self.lag_transforms, namer=lag_transforms_namer
        )
        # declared here, assigned when the series are processed in `_fit`
        self.ga: GroupedArray

    def _get_core_lag_tfms(self) -> Dict[str, _BaseLagTransform]:
        """Subset of `self.transforms` whose values are `_BaseLagTransform` instances."""
        core_tfms = {}
        for name, tfm in self.transforms.items():
            if isinstance(tfm, _BaseLagTransform):
                core_tfms[name] = tfm
        return core_tfms

    @property
    def _date_feature_names(self):
        """Names of the date features. Functions contribute their `__name__`."""
        names = []
        for feat in self.date_features:
            names.append(feat.__name__ if callable(feat) else feat)
        return names
        
    @property
    def features(self) -> List[str]:
        """Names of all computed features: lags and lag transformations first, then date features."""
        return [*self.transforms.keys(), *self._date_feature_names]

    def __repr__(self):
        """Summary with the frequency, feature names and number of threads."""
        parts = [
            f"TimeSeries(freq={self.freq}, ",
            f"transforms={list(self.transforms.keys())}, ",
            f"date_features={self._date_feature_names}, ",
            f"num_threads={self.num_threads})",
        ]
        return "".join(parts)

    def _fit(
        self,
        df: DataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        keep_last_n: Optional[int] = None,
    ) -> 'TimeSeries':
        """Save the series values, ids and last dates.

        Validates `df`, stores the column names, builds the `GroupedArray` with the
        target values (after applying the target transformations), extracts the static
        features and defines the order of the features for the models.

        If `static_features` is None every column other than the time and the target is
        treated as static. `keep_last_n` is only stored here.

        Raises a ValueError if the target has null values, if a series is too short for
        a target transformation or if a static feature takes different values on the
        first and last rows of a series."""
        validate_format(df, id_col, time_col, target_col)
        validate_freq(df[time_col], self.freq)
        if ufp.is_nan_or_none(df[target_col]).any():
            raise ValueError(f'{target_col} column contains null values.')
        self.id_col = id_col
        self.target_col = target_col
        self.time_col = time_col
        self.keep_last_n = keep_last_n
        # stored as provided by the user, before the adjustments made below
        self.static_features = static_features
        # only the id, time and target are needed to build the grouped array
        sorted_df = df[[id_col, time_col, target_col]]
        sorted_df = ufp.copy_if_pandas(sorted_df, deep=False)
        uids, times, data, indptr, self._sort_idxs = ufp.process_df(
            df=sorted_df,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
        )
        if data.ndim == 2:
            # keep the first column of a 2d result
            data = data[:, 0]
        ga = GroupedArray(data, indptr)
        if isinstance(df, pd.DataFrame):
            self.uids = pd.Index(uids)
            self.last_dates = pd.Index(times)
        else:
            self.uids = uids
            self.last_dates = pl_Series(times)
        if self._sort_idxs is not None:
            # NOTE(review): presumably `_sort_idxs` is only returned when `df` wasn't sorted -- confirm in `ufp.process_df`
            # build the inverse permutation to take sorted values back to the order of `df`
            self._restore_idxs: Optional[np.ndarray] = np.empty(df.shape[0], dtype=np.int32)
            self._restore_idxs[self._sort_idxs] = np.arange(df.shape[0])
            sorted_df = ufp.take_rows(sorted_df, self._sort_idxs)
        else:
            self._restore_idxs = None
        if self.target_transforms is not None:
            # the transformations are applied sequentially, keeping `ga` and `sorted_df` in sync
            for tfm in self.target_transforms:
                if isinstance(tfm, _BaseGroupedArrayTargetTransform):
                    try:
                        ga = tfm.fit_transform(ga)
                    except _ShortSeriesException as exc:
                        tfm_name = tfm.__class__.__name__
                        # the exception arguments are used to look up the offending ids
                        uids = reprlib.repr(list(self.uids[exc.args]))
                        raise ValueError(
                            f"The following series are too short for the '{tfm_name}' transformation: {uids}."
                        ) from None
                    sorted_df = ufp.assign_columns(sorted_df, target_col, ga.data)
                else:
                    tfm.set_column_names(id_col, time_col, target_col)
                    sorted_df = tfm.fit_transform(sorted_df)
                    ga.data = sorted_df[target_col].to_numpy()
        # columns that aren't features for the models
        to_drop = [id_col, time_col, target_col]
        if static_features is None:
            static_features = [c for c in df.columns if c not in [time_col, target_col]]
        elif id_col not in static_features:
            # the id is always kept together with the static features
            static_features = [id_col] + static_features
        else:  # static_features defined and contain id_col
            to_drop = [time_col, target_col]
        self.ga = ga
        # positions of the first and last row of each series
        series_starts = ga.indptr[:-1]
        series_ends = ga.indptr[1:] - 1
        if self._sort_idxs is not None:
            # translate positions in the sorted data to rows of the original `df`
            series_starts = self._sort_idxs[series_starts]
            series_ends = self._sort_idxs[series_ends]
        statics_on_starts = ufp.drop_index_if_pandas(
            ufp.take_rows(df, series_starts)[static_features]
        )
        statics_on_ends = ufp.drop_index_if_pandas(
            ufp.take_rows(df, series_ends)[static_features]
        )
        # a static feature must have the same value on the first and last row of each series
        for feat in static_features:
            if (statics_on_starts[feat] != statics_on_ends[feat]).any():
                raise ValueError(
                    f'{feat} is declared as a static feature but its values change '
                    'over time. Please set the `static_features` argument to '
                    'indicate which features are static.\nIf all of your features '
                    'are dynamic please set `static_features=[]`.'
                )
        self.static_features_ = statics_on_ends
        # columns of `df` that are kept, followed by the computed features that aren't already in `df`
        self.features_order_ = [
            c for c in df.columns if c not in to_drop
        ] + [f for f in self.features if f not in df.columns]
        return self

    def _compute_transforms(
        self,
        transforms: Mapping[str, Union[Tuple[Any, ...], _BaseLagTransform]],
        updates_only: bool,
    ) -> Dict[str, np.ndarray]:
        """Apply `transforms` over the stored series.

        Multithreading is used only when `self.num_threads > 1` and there's more
        than one transformation. `updates_only` is forwarded to the grouped array."""
        single_threaded = self.num_threads == 1 or len(transforms) == 1
        if single_threaded:
            return self.ga.apply_transforms(
                transforms=transforms, updates_only=updates_only
            )
        return self.ga.apply_multithreaded_transforms(
            transforms=transforms,
            num_threads=self.num_threads,
            updates_only=updates_only,
        )
    
    def _compute_date_feature(self, dates, feature):
        """Compute a single date feature and return its name and values.

        `feature` is either a function, which is called with `dates` and whose
        `__name__` becomes the feature name, or a string. For a `pd.DatetimeIndex` the
        string is read as an attribute of the index; otherwise it's called as a method
        of `dates.dt`. pandas results are converted to numpy arrays and cast to the
        dtype defined in `date_features_dtypes`, if there's one for the feature."""
        if callable(feature):
            feat_name = feature.__name__
            feat_vals = feature(dates)
        else:
            feat_name = feature
            if isinstance(dates, pd.DatetimeIndex):
                if feature in ('week', 'weekofyear'):
                    # `isocalendar()` returns a frame with the columns year, week and day,
                    # so both aliases have to read the `week` column
                    # (`getattr(frame, 'weekofyear')` raises an AttributeError)
                    feat_vals = dates.isocalendar().week
                else:
                    feat_vals = getattr(dates, feature)
            else:
                feat_vals = getattr(dates.dt, feature)()
        if isinstance(feat_vals, (pd.Index, pd.Series)):
            feat_vals = np.asarray(feat_vals)
            feat_dtype = date_features_dtypes.get(feature)
            if feat_dtype is not None:
                feat_vals = feat_vals.astype(feat_dtype)
        return feat_name, feat_vals

    def _transform(
        self,
        df: DFType,
        dropna: bool = True,
        max_horizon: Optional[int] = None,
        return_X_y: bool = False,
        as_numpy: bool = False,
    ) -> DFType:
        """Add the features to `df`.

        if `dropna=True` then all the null rows are dropped.
        If `max_horizon` is set the target is expanded to one column per step,
        named `<target_col>0`, `<target_col>1`, ...
        If `return_X_y=True` a tuple with the features (as a numpy array if
        `as_numpy=True`) and the target is returned instead of a dataframe.

        This relies on the state saved by `_fit` and removes the `_restore_idxs` and
        `_sort_idxs` attributes, so `_fit` must be called again before another call."""
        # we need to compute all transformations in case they save state
        features = self._compute_transforms(
            transforms=self.transforms,
            updates_only=False
        )
        # filter out the features that already exist in df to avoid overwriting them
        features = {k: v for k, v in features.items() if k not in df}
        if self._restore_idxs is not None:
            # the features are computed on the sorted data, take them back to the order of df
            for k, v in features.items():
                features[k] = v[self._restore_idxs]

        # target
        self.max_horizon = max_horizon
        if max_horizon is None:
            target = self.ga.data
        else:
            target = self.ga.expand_target(max_horizon)
        if self._restore_idxs is not None:
            target = target[self._restore_idxs]

        # determine rows to keep
        if dropna:
            feature_nulls = np.full(df.shape[0], False)
            for feature_vals in features.values():
                feature_nulls |= np.isnan(feature_vals)
            target_nulls = np.isnan(target)
            if target_nulls.ndim == 2:
                # target nulls for each horizon are dropped in MLForecast.fit_models
                # we just drop rows here for which all the target values are null
                target_nulls = target_nulls.all(axis=1)
            keep_rows = ~(feature_nulls | target_nulls)
            for k, v in features.items():
                features[k] = v[keep_rows]
            target = target[keep_rows]
            df = ufp.filter_with_mask(df, keep_rows)
            df = ufp.copy_if_pandas(df, deep=False)
            # a series whose last row was dropped is reported as completely dropped
            last_idxs = self.ga.indptr[1:] - 1
            if self._sort_idxs is not None:
                last_idxs = self._sort_idxs[last_idxs]
            last_vals_nan = ~keep_rows[last_idxs]
            if last_vals_nan.any():
                self._dropped_series: Optional[np.ndarray] = np.where(last_vals_nan)[0]
                dropped_ids = reprlib.repr(list(self.uids[self._dropped_series]))
                warnings.warn(
                    "The following series were dropped completely "
                    f"due to the transformations and features: {dropped_ids}.\n"
                    "These series won't show up if you use `MLForecast.forecast_fitted_values()`.\n"
                    "You can set `dropna=False` or use transformations that require less samples to mitigate this"
                )
            else:
                self._dropped_series = None
        else:
            # no rows are dropped. Reset the attribute for every dataframe type so that
            # it's always defined and doesn't keep the value from a previous call
            self._dropped_series = None
            if isinstance(df, pd.DataFrame):
                # columns are assigned below, so work on a shallow copy of the input
                df = df.copy(deep=False)

        # once we've computed the features and target we can slice the series
        if self.keep_last_n is not None:
            self.ga = self.ga.take_from_groups(slice(-self.keep_last_n, None))
        del self._restore_idxs, self._sort_idxs

        # lag transforms
        for feat in self.transforms.keys():
            if feat in features:
                df = ufp.assign_columns(df, feat, features[feat])

        # date features
        names = self._date_feature_names
        date_features = [f for f, name in zip(self.date_features, names) if name not in df]
        if date_features:
            unique_dates = df[self.time_col].unique()
            if isinstance(df, pd.DataFrame):
                # all kinds of trickery to make this fast
                # the features are computed once per unique date and then expanded to every row
                unique_dates = pd.Index(unique_dates)
                date2pos = {date: i for i, date in enumerate(unique_dates)}
                restore_idxs = df[self.time_col].map(date2pos)
                for feature in date_features:
                    feat_name, feat_vals = self._compute_date_feature(unique_dates, feature)
                    df[feat_name] = feat_vals[restore_idxs]
            elif isinstance(df, pl_DataFrame):
                exprs = []
                for feat in date_features:  # type: ignore
                    name, vals = self._compute_date_feature(pl.col(self.time_col), feat)
                    exprs.append(vals.alias(name))
                feats = unique_dates.to_frame().with_columns(*exprs)
                df = df.join(feats, on=self.time_col, how='left')

        # assemble return
        if return_X_y:
            X = df[self.features_order_]
            if as_numpy:
                X = ufp.to_numpy(X)
            return X, target
        if max_horizon is not None:
            # remove original target
            out_cols = [c for c in df.columns if c != self.target_col]
            df = df[out_cols]
            target_names = [f"{self.target_col}{i}" for i in range(max_horizon)]
            df = ufp.assign_columns(df, target_names, target)
        else:
            if isinstance(df, pd.DataFrame):
                df = _ensure_shallow_copy(df)
            df = ufp.assign_columns(df, self.target_col, target)
        return df


    def fit_transform(
        self,
        data: DFType,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        max_horizon: Optional[int] = None,
        return_X_y: bool = False,
        as_numpy: bool = False,
    ) -> Union[DFType, Tuple[DFType, np.ndarray]]:
        """Compute the features for `data` and keep what's needed for the predictions step.

        Use `static_features` to indicate which features are static when not all of them are.
        Set `dropna=False` to keep the rows that have null values after the transformations.
        When `keep_last_n` is not None only that number of observations of each series is kept for the updates.
        """
        # these settings are stored on the instance so they're available after fitting
        self.as_numpy = as_numpy
        self.dropna = dropna
        fit_args = dict(
            df=data,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            static_features=static_features,
            keep_last_n=keep_last_n,
        )
        self._fit(**fit_args)
        transform_args = dict(
            df=data,
            dropna=dropna,
            max_horizon=max_horizon,
            return_X_y=return_X_y,
            as_numpy=as_numpy,
        )
        return self._transform(**transform_args)

    def _update_y(self, new: np.ndarray) -> None:
        """Store `new` as the predictions of the current step and append it to the series.

        The appended values are the ones used by the next update of the transformations."""
        try:
            stored_preds = self.y_pred
        except AttributeError:
            # first call without a previous setup
            stored_preds = self.y_pred = []
        stored_preds.append(new)
        self.ga = self.ga.append(np.asarray(new))
        
    def _update_features(self) -> DataFrame:
        """Advance the current dates one step and compute the features for them.

        Returns the static features concatenated horizontally with the updated
        lag/transformation features and date features."""
        next_dates = ufp.offset_times(self.curr_dates, self.freq, 1)
        self.curr_dates: Union[pd.Index, pl_Series] = next_dates
        self.test_dates.append(next_dates)

        feats = self._compute_transforms(self.transforms, updates_only=True)
        for date_feat in self.date_features:
            name, vals = self._compute_date_feature(next_dates, date_feat)
            feats[name] = vals

        # build the same kind of dataframe that was used for training
        make_df = pl_DataFrame if isinstance(self.last_dates, pl_Series) else pd.DataFrame
        feats_df = make_df(feats)[self.features]
        return ufp.horizontal_concat([self.static_features_, feats_df])

    def _get_raw_predictions(self) -> np.ndarray:
        """Stack the stored per-step predictions and flatten them in column-major order."""
        stacked = np.array(self.y_pred)
        return stacked.ravel(order='F')

    def _get_future_ids(self, h: int):
        """Return the ids repeated `h` times each, as a series of the same kind as `self.uids`."""
        if not isinstance(self.uids, pl_Series):
            repeated = np.repeat(self.uids, h)
            return pd.Series(repeated, name=self.id_col, dtype=self.uids.dtype)
        # polars: concatenate h copies and sort to get the repetitions of each id together
        return pl.concat([self.uids] * h).sort()

    def _get_predictions(self) -> DataFrame:
        """Build a dataframe with the ids, datestamps and stored predictions.

        The predictions are placed in the `<target_col>_pred` column."""
        n_steps = len(self.y_pred)
        make_df = pl_DataFrame if isinstance(self.uids, pl_Series) else pd.DataFrame
        future_ids = self._get_future_ids(n_steps)
        future_dates = np.array(self.test_dates).ravel('F')
        columns = {
            self.id_col: future_ids,
            self.time_col: future_dates,
            f'{self.target_col}_pred': self._get_raw_predictions(),
        }
        return make_df(columns)

    def _get_features_for_next_step(self, X_df=None):
        """Build the input of the models for the next step.

        The updated features are combined with the rows of `X_df` (if provided) that
        correspond to the current step, a warning is raised if any column has nulls,
        the step counter is incremented and the columns are returned in the training
        order (as a numpy array if `self.as_numpy` is set)."""
        new_x = self._update_features()
        if X_df is not None:
            total_rows = X_df.shape[0]
            steps_per_serie = total_rows // len(self.uids)
            # one row per series: the one at the current step within each block of rows
            step_rows = np.arange(self._h, total_rows, steps_per_serie)
            exog = ufp.drop_index_if_pandas(ufp.take_rows(X_df, step_rows))
            new_x = ufp.horizontal_concat([new_x, exog])
        if isinstance(new_x, pd.DataFrame):
            null_flags = new_x.isnull().any()
            cols_with_nulls = null_flags[null_flags].index.tolist()
        else:
            null_flags = new_x.select(pl.all().is_null().any()).to_dicts()[0]
            cols_with_nulls = [col for col, has_nulls in null_flags.items() if has_nulls]
        if cols_with_nulls:
            warnings.warn(
                f'Found null values in {", ".join(cols_with_nulls)}.'
            )
        self._h += 1
        ordered = new_x[self.features_order_]
        return ufp.to_numpy(ordered) if self.as_numpy else ordered

    @contextmanager
    def _backup(self) -> Iterator[None]:
        """Context manager that restores the series and the transformations on exit."""
        # the predictions get appended to the series during predict
        saved_ga = copy.copy(self.ga)
        # transformations that keep state (like ExpandingMean) are modified by the updates
        saved_transforms = copy.deepcopy(self.transforms)
        try:
            yield
        finally:
            self.ga, self.transforms = saved_ga, saved_transforms

    def _predict_setup(self) -> None:
        """Reset the prediction state: current dates, generated dates, predictions and step counter."""
        # TODO: move to utils
        is_polars = isinstance(self.last_dates, pl_Series)
        duplicate = self.last_dates.clone if is_polars else self.last_dates.copy
        self.curr_dates = duplicate()
        self.test_dates: List[Union[pd.Index, pl_Series]] = []
        self.y_pred = []
        self._h = 0

    def _predict_recursive(
        self,
        models: Dict[str, BaseEstimator],
        horizon: int,
        before_predict_callback: Optional[Callable] = None,
        after_predict_callback: Optional[Callable] = None,
        X_df: Optional[DFType] = None,
    ) -> DFType:
        """Forecast the next `horizon` steps with each model, one step at a time.

        The predictions of each step are appended to the series in order to compute the
        features of the following one. The callbacks, if provided, are applied to the
        features before predicting and to the predictions afterwards. The result has the
        ids, datestamps and one column per model."""
        for model_idx, (name, model) in enumerate(models.items()):
            # every model starts from the same state
            with self._backup():
                self._predict_setup()
                for _step in range(horizon):
                    step_X = self._get_features_for_next_step(X_df)
                    if before_predict_callback is not None:
                        step_X = before_predict_callback(step_X)
                    step_preds = model.predict(step_X)
                    if after_predict_callback is not None:
                        step_preds = after_predict_callback(step_preds)
                    self._update_y(step_preds)
                if model_idx > 0:
                    # ids and dates are already there, just add this model's column
                    preds = ufp.assign_columns(preds, name, self._get_raw_predictions())
                    continue
                # the first model defines the frame with the ids and dates
                preds = ufp.rename(
                    self._get_predictions(), {f'{self.target_col}_pred': name}
                )
        return preds

    def _predict_multi(
        self,
        models: Dict[str, BaseEstimator],
        horizon: int,
        before_predict_callback: Optional[Callable] = None,
        X_df: Optional[DFType] = None,
    ) -> DFType:
        """Forecast `horizon` steps using one model per step (direct approach).

        `models` maps each name to a collection of models indexed by step. All of
        them receive the features of the first step after the end of the series.
        The result has the ids, datestamps and one column per name.

        Raises a ValueError if `horizon` is greater than the `max_horizon` used in training."""
        assert self.max_horizon is not None
        if horizon > self.max_horizon:
            raise ValueError(f'horizon must be at most max_horizon ({self.max_horizon})')
        self._predict_setup()
        uids = self._get_future_ids(horizon)
        starts = ufp.offset_times(self.curr_dates, self.freq, 1)
        dates = ufp.time_ranges(starts, self.freq, periods=horizon)
        if isinstance(self.curr_dates, pl_Series):
            df_constructor = pl_DataFrame
        else:
            df_constructor = pd.DataFrame
        result = df_constructor({self.id_col: uids, self.time_col: dates})
        for name, model in models.items():
            with self._backup():
                # `_backup` doesn't restore the current dates nor the step counter, which
                # are advanced when computing the features, so they're reset here.
                # Otherwise every model after the first one would get the date features
                # and the rows of `X_df` of a later step
                self._predict_setup()
                new_x = self._get_features_for_next_step(X_df)
                if before_predict_callback is not None:
                    new_x = before_predict_callback(new_x)
                predictions = np.empty((new_x.shape[0], horizon))
                for i in range(horizon):
                    predictions[:, i] = model[i].predict(new_x)
                # row-major flattening yields all the steps of each series consecutively
                raw_preds = predictions.ravel()
                result = ufp.assign_columns(result, name, raw_preds)
        return result

    def _has_ga_target_tfms(self):
        """Whether any of the target transformations is a `_BaseGroupedArrayTargetTransform`.

        Returns False when no target transformations were configured."""
        # `target_transforms` is None when not provided to the constructor
        if self.target_transforms is None:
            return False
        return any(isinstance(tfm, _BaseGroupedArrayTargetTransform) for tfm in self.target_transforms)

    @contextmanager
    def _maybe_subset(self, idxs: Optional[np.ndarray]) -> Iterator[None]:
        """Context manager that temporarily restricts the stored state to a subset of the series.

        If `idxs` is not None the series, ids, static features, last dates, target
        transformations and lag transformations are replaced by their subsets at the
        positions in `idxs`. The original state is restored on exit, even if an error
        is raised inside the block."""
        # save original
        ga = self.ga
        uids = self.uids
        statics = self.static_features_
        last_dates = self.last_dates
        # the list is modified in place below, so keep a copy with the original items
        targ_tfms = copy.copy(self.target_transforms)
        lag_tfms = copy.deepcopy(self.transforms)

        if idxs is not None:
            # assign subsets
            self.ga = self.ga.take(idxs)
            self.uids = uids[idxs]
            self.static_features_ = ufp.take_rows(statics, idxs)
            self.static_features_ = ufp.drop_index_if_pandas(self.static_features_)
            self.last_dates = last_dates[idxs]
            if self.target_transforms is not None:
                for i, tfm in enumerate(self.target_transforms):
                    if isinstance(tfm, _BaseGroupedArrayTargetTransform):
                        self.target_transforms[i] = tfm.take(idxs)
            for name, lag_tfm in self.transforms.items():
                if isinstance(lag_tfm, _BaseLagTransform):
                    lag_tfm = lag_tfm.take(idxs)
                self.transforms[name] = lag_tfm
        try:
            yield
        finally:
            self.ga = ga
            self.uids = uids
            self.static_features_ = statics
            self.last_dates = last_dates
            self.target_transforms = targ_tfms
            # restore the attribute that was actually subsetted; assigning the backup to a
            # different name would leave the subsetted lag transformations in place
            self.transforms = lag_tfms

    def predict(
        self,
        models: Dict[str, Union[BaseEstimator, List[BaseEstimator]]],
        horizon: int,
        before_predict_callback: Optional[Callable] = None,
        after_predict_callback: Optional[Callable] = None,
        X_df: Optional[DFType] = None,
        ids: Optional[List[str]] = None,
    ) -> DFType:
        """Forecast the next `horizon` steps with each of the provided models.

        Parameters
        ----------
        models : dict
            Models to predict with, keyed by name.
        horizon : int
            Number of steps to forecast after the last stored date of each series.
        before_predict_callback : callable, optional
            Forwarded to the underlying prediction routine.
        after_predict_callback : callable, optional
            Forwarded to the recursive prediction routine only; it is not
            passed along when `max_horizon` is set.
        X_df : dataframe, optional
            Dynamic (exogenous) features for the forecasting horizon. Must hold
            the id and time columns plus at least one feature, with one row per
            id and time for the complete horizon.
        ids : list of str, optional
            Subset of the ids seen during training to forecast.
            No subsetting is requested when omitted.

        Returns
        -------
        dataframe
            Predictions of the underlying routine, after undoing the target
            transformations (if any) in reverse order.

        Raises
        ------
        ValueError
            If `ids` contains ids that weren't seen during training, or if `X_df`
            lacks the id/time columns, has no feature columns, contains features
            that were considered static during fit or doesn't cover the complete
            forecasting horizon for every id.
        """
        if ids is not None:
            unseen = set(ids) - set(self.uids)
            if unseen:
                raise ValueError(f"The following ids weren't seen during training and thus can't be forecasted: {unseen}")
            # positions of the requested ids within the stored ids
            idxs: Optional[np.ndarray] = np.where(ufp.is_in(self.uids, ids))[0]
        else:
            idxs = None
        with self._maybe_subset(idxs):
            if X_df is not None:
                if self.id_col not in X_df or self.time_col not in X_df:
                    raise ValueError(f"X_df must have '{self.id_col}' and '{self.time_col}' columns.")
                if X_df.shape[1] < 3:
                    raise ValueError("Found no exogenous features in `X_df`.")
                statics = [c for c in self.static_features_.columns if c != self.id_col]
                dynamics = [c for c in X_df.columns if c not in [self.id_col, self.time_col]]
                common = [c for c in dynamics if c in statics]
                if common:
                    raise ValueError(
                        f"The following features were provided through `X_df` but were considered as static during fit: {common}.\n"
                        "Please re-run the fit step using the `static_features` argument to indicate which features are static. "
                        "If all your features are dynamic please pass an empty list (static_features=[])."
                    )
                # keep only the rows that fall inside each id's forecasting window
                starts = ufp.offset_times(self.last_dates, self.freq, 1)
                ends = ufp.offset_times(self.last_dates, self.freq, horizon)
                dates_validation = type(X_df)(
                    {
                        self.id_col: self.uids,
                        '_start': starts,
                        '_end': ends,
                    }
                )
                X_df = ufp.join(X_df, dates_validation, on=self.id_col)
                mask = ufp.between(X_df[self.time_col], X_df['_start'], X_df['_end'])
                X_df = ufp.filter_with_mask(X_df, mask)
                # after filtering there must be exactly `horizon` rows per id
                if X_df.shape[0] != len(self.uids) * horizon:
                    msg = (
                        "Found missing inputs in X_df. "
                        "It should have one row per id and time for the complete forecasting horizon.\n"
                        "You can get the expected structure by running `MLForecast.make_future_dataframe(h)` "
                        "or get the missing combinations in your current `X_df` by running `MLForecast.get_missing_future(h, X_df)`."
                    )
                    raise ValueError(msg)
                drop_cols = [self.id_col, self.time_col, '_start', '_end']
                # sort before dropping the keys so that only the ordered feature columns remain
                X_df = ufp.sort(X_df, [self.id_col, self.time_col])
                X_df = ufp.drop_columns(X_df, drop_cols)
            if getattr(self, 'max_horizon', None) is None:
                preds = self._predict_recursive(
                    models=models,
                    horizon=horizon,
                    before_predict_callback=before_predict_callback,
                    after_predict_callback=after_predict_callback,
                    X_df=X_df,
                )
            else:
                preds = self._predict_multi(
                    models=models,
                    horizon=horizon,
                    before_predict_callback=before_predict_callback,
                    X_df=X_df,
                )
            if self.target_transforms is not None:
                if self._has_ga_target_tfms():
                    model_cols = [c for c in preds.columns if c not in (self.id_col, self.time_col)]
                    # every id contributes exactly `horizon` consecutive predictions
                    indptr = np.arange(0, horizon * (len(self.uids) + 1), horizon)
                # undo the transformations in the opposite order
                for tfm in self.target_transforms[::-1]:
                    if isinstance(tfm, _BaseGroupedArrayTargetTransform):
                        for col in model_cols:
                            ga = GroupedArray(preds[col].to_numpy().astype(self.ga.data.dtype), indptr)
                            ga = tfm.inverse_transform(ga)
                            preds = ufp.assign_columns(preds, col, ga.data)
                    else:
                        preds = tfm.inverse_transform(preds)
        return preds

    def save(self, path: Union[str, Path], protocol: Optional[str] = None) -> None:
        """Pickle this object with cloudpickle and write it to `path`.

        Parameters
        ----------
        path : str or pathlib.Path
            Destination of the serialized object, opened through `fsspec.open`.
        protocol : str, optional
            Forwarded to `fsspec.open`, mirroring the `protocol` argument of `load`.
            The default (None) keeps the previous behavior.
        """
        with fsspec.open(path, 'wb', protocol=protocol) as f:
            cloudpickle.dump(self, f)

    @staticmethod
    def load(path: Union[str, Path], protocol: Optional[str] = None) -> 'TimeSeries':
        """Read back an object that was pickled to `path`.

        Parameters
        ----------
        path : str or pathlib.Path
            Location of the pickled object, opened through `fsspec.open`.
        protocol : str, optional
            Forwarded to `fsspec.open`.

        Returns
        -------
        TimeSeries
            The unpickled object.

        Notes
        -----
        Unpickling can execute arbitrary code, so only load files from trusted sources.
        """
        with fsspec.open(path, 'rb', protocol=protocol) as pickled:
            return cloudpickle.load(pickled)

    def update(self, df: DataFrame) -> None:
        """Update the values of the stored series.

        Parameters
        ----------
        df : dataframe
            New observations. Must have the id, time and target columns used during fit.
            Ids that aren't stored yet are added as new series, which also requires
            the static feature columns to be present.

        Raises
        ------
        ValueError
            If `df` contains new series and target transformations are being used.
            In that case no attribute of this object is modified.
        """
        validate_format(df, self.id_col, self.time_col, self.target_col)
        uids = self.uids
        if isinstance(uids, pd.Index):
            uids = pd.Series(uids)
        uids, new_ids = ufp.match_if_categorical(uids, df[self.id_col])
        df = ufp.copy_if_pandas(df, deep=False)
        df = ufp.assign_columns(df, self.id_col, new_ids)
        df = ufp.sort(df, by=[self.id_col, self.time_col])
        values = df[self.target_col].to_numpy()
        values = values.astype(self.ga.data.dtype, copy=False)
        id_counts = ufp.counts_by_id(df, self.id_col)
        try:
            sizes = ufp.join(uids, id_counts, on=self.id_col, how='outer_coalesce')
        except (KeyError, ValueError):
            # pandas raises key error, polars before coalesce raises value error
            sizes = ufp.join(uids, id_counts, on=self.id_col, how='outer')
        # stored ids without new observations get a count of zero
        sizes = ufp.fill_null(sizes, {'counts': 0})
        sizes = ufp.sort(sizes, by=self.id_col)
        new_groups = ~ufp.is_in(sizes[self.id_col], uids)
        has_new_groups = new_groups.any()
        if has_new_groups and self.target_transforms is not None:
            # validate before touching any attribute so that a failed update
            # doesn't leave the ids/dates out of sync with the stored values
            raise ValueError('Can not update target_transforms with new series.')
        last_dates = ufp.group_by_agg(df, self.id_col, {self.time_col: 'max'})
        last_dates = ufp.join(sizes, last_dates, on=self.id_col, how='left')
        curr_last_dates = type(df)({self.id_col: uids, '_curr': self.last_dates})
        last_dates = ufp.join(last_dates, curr_last_dates, on=self.id_col, how='left')
        # ids without new observations keep their current last date
        last_dates = ufp.fill_null(last_dates, {self.time_col: last_dates['_curr']})
        last_dates = ufp.sort(last_dates, by=self.id_col)
        self.last_dates = ufp.cast(last_dates[self.time_col], self.last_dates.dtype)
        self.uids = ufp.sort(sizes[self.id_col])
        if isinstance(df, pd.DataFrame):
            self.uids = pd.Index(self.uids)
            self.last_dates = pd.Index(self.last_dates)
        if has_new_groups:
            new_ids = ufp.filter_with_mask(sizes[self.id_col], new_groups)
            new_ids_df = ufp.filter_with_mask(df, ufp.is_in(df[self.id_col], new_ids))
            new_ids_counts = ufp.counts_by_id(new_ids_df, self.id_col)
            # the cumulative counts are positions of the last row of each new id
            # within `new_ids_df`, so the rows must be taken from it (not from `df`)
            last_row_of_new_ids = new_ids_counts["counts"].to_numpy().cumsum() - 1
            new_statics = ufp.take_rows(new_ids_df, last_row_of_new_ids)
            new_statics = new_statics[self.static_features_.columns]
            self.static_features_ = ufp.vertical_concat([self.static_features_, new_statics])
            self.static_features_ = ufp.sort(self.static_features_, self.id_col)
        if self.target_transforms is not None:
            if self._has_ga_target_tfms():
                indptr = np.append(0, id_counts['counts']).cumsum()
            for tfm in self.target_transforms:
                if isinstance(tfm, _BaseGroupedArrayTargetTransform):
                    ga = GroupedArray(values, indptr)
                    ga = tfm.update(ga)
                    df = ufp.assign_columns(df, self.target_col, ga.data)
                else:
                    df = tfm.update(df)
                # the next transformation works on the output of this one
                values = df[self.target_col].to_numpy()
        self.ga = self.ga.append_several(
            new_sizes=sizes['counts'].to_numpy().astype(np.int32),
            new_values=values,
            new_groups=new_groups.to_numpy(),
        )

In [None]:
#| hide
test_fail(lambda: TimeSeries(freq='D', lags=list(range(2))), contains='lags must be positive integers')
test_fail(lambda: TimeSeries(freq='D', lag_transforms={0: 1}), contains='keys of lag_transforms must be positive integers')

In [None]:
#|hide
# differences
n = 7 * 14
x = pd.DataFrame(
    {
        'id': np.repeat(0, n),
        'ds': np.arange(n),
        'y': np.arange(7)[[x % 7 for x in np.arange(n)]]
    },
)
x['y'] = x['ds'] * 0.1 + x['y']
ts = TimeSeries(freq=1, target_transforms=[Differences([1, 7])])
ts._fit(x.iloc[:-14], id_col='id', time_col='ds', target_col='y')
ts.as_numpy = False
np.testing.assert_allclose(
    x['y'].diff(1).diff(7).values[:-14],
    ts.ga.data,
)
ts.y_pred = np.zeros(14)
class A:
    """Stub model that always predicts zero for every row."""

    def fit(self, X):
        # there is nothing to learn
        return self

    def predict(self, X):
        n_rows, *_ = X.shape
        return np.zeros(n_rows)
xx = ts.predict({'A': A()}, 14)
np.testing.assert_allclose(xx['A'], x['y'].tail(14).values)

In [None]:
#| hide
# tfms namer
def namer(f, lag, *args):
    """Build the feature name from the transform function's name only; `lag` and `args` are ignored."""
    return 'hello_from_' + f.__name__

ts = TimeSeries(
    freq=1,
    lag_transforms={1: [(rolling_mean, 7), expanding_mean]},
    lag_transforms_namer=namer,
)
transformed = ts.fit_transform(x, id_col='id', time_col='ds', target_col='y')
test_eq(
    transformed.columns.tolist(),
    ['id', 'ds', 'y', 'hello_from_rolling_mean', 'hello_from_expanding_mean'],
)

In [None]:
#| hide
test_fail(lambda: TimeSeries(freq=1, date_features=[lambda: 1]), contains="Can't use a lambda")

The `TimeSeries` class takes care of defining the transformations to be performed (`lags`, `lag_transforms` and `date_features`). The transformations can be computed using multithreading if `num_threads > 1`.

In [None]:
def month_start_or_end(dates):
    """Flag the dates that fall on the first or the last day of their month."""
    at_start = dates.is_month_start
    at_end = dates.is_month_end
    return at_start | at_end

flow_config = dict(
    freq='W-THU',
    lags=[7],
    lag_transforms={
        1: [expanding_mean, (rolling_mean, 7)]
    },
    date_features=['dayofweek', 'week', month_start_or_end]
)

ts = TimeSeries(**flow_config)
ts

TimeSeries(freq=W-THU, transforms=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag1_window_size7'], date_features=['dayofweek', 'week', 'month_start_or_end'], num_threads=1)

In [None]:
#| hide
test_eq(
    TimeSeries(freq=ts.freq).freq,
    TimeSeries(freq='W-THU').freq
)

The frequency is converted to an offset.

In [None]:
test_eq(ts.freq, pd.tseries.frequencies.to_offset(flow_config['freq']))

The date features are stored as they were passed to the constructor.

In [None]:
test_eq(ts.date_features, flow_config['date_features'])

The transformations are stored as a dictionary where the key is the name of the transformation (the name of the column in the dataframe with the computed features), which is built using `build_transform_name`, and the value is a tuple where the first element is the lag it is applied to, followed by the function and then the function arguments.

In [None]:
test_eq(
    ts.transforms, 
    {
        'lag7': Lag(7),
        'expanding_mean_lag1': (1, expanding_mean), 
        'rolling_mean_lag1_window_size7': (1, rolling_mean, 7)
        
    }
)

Note that for `lags` we define the transformation as the identity function applied to its corresponding lag. This is because `_transform_series` takes the lag as an argument and shifts the array before computing the transformation.

In [None]:
#|hide
# int y is converted to float32
serie2 = serie.copy()
serie2['y'] = serie2['y'].astype(int)
ts = TimeSeries(num_threads=1, freq='D')
ts._fit(serie2, id_col='unique_id', time_col='ds', target_col='y')
test_eq(ts.ga.data.dtype, np.float32)

In [None]:
#|hide
# _compute_transforms
y = serie.y.values
lag_1 = shift_array(y, 1)

# the features must be the same regardless of the number of threads used,
# so the loop variable has to actually reach the constructor
for num_threads in (1, 2):
    ts = TimeSeries(**flow_config, num_threads=num_threads)
    ts._fit(serie, id_col='unique_id', time_col='ds', target_col='y')
    transforms = ts._compute_transforms(ts.transforms, updates_only=False)

    np.testing.assert_equal(transforms['lag7'], shift_array(y, 7))
    np.testing.assert_equal(transforms['expanding_mean_lag1'], expanding_mean(lag_1))
    np.testing.assert_equal(transforms['rolling_mean_lag1_window_size7'], rolling_mean(lag_1, 7))

In [None]:
#|hide
# update_y
ts = TimeSeries(freq='D', lags=[1])
ts._fit(serie, id_col='unique_id', time_col='ds', target_col='y')

max_size = np.diff(ts.ga.indptr)
ts._update_y([1])
ts._update_y([2])

test_eq(np.diff(ts.ga.indptr), max_size + 2)
test_eq(ts.ga.data[-2:], [1, 2])

In [None]:
#|hide
# _update_features
ts = TimeSeries(**flow_config)
ts._fit(serie, id_col='unique_id', time_col='ds', target_col='y')
ts._predict_setup()
updates = ts._update_features()

last_date = serie['ds'].max()
first_prediction_date = last_date + pd.offsets.Day()

# these have an offset because we can now "see" our last y value
expected = pd.DataFrame({
    'unique_id': ts.uids,
    'lag7': shift_array(y, 6)[-1],
    'expanding_mean_lag1': expanding_mean(y)[-1],
    'rolling_mean_lag1_window_size7': rolling_mean(y, 7)[-1],
    'dayofweek': np.uint8([getattr(first_prediction_date, 'dayofweek')]),
    'week': np.uint8([first_prediction_date.isocalendar()[1]]),
    'month_start_or_end': month_start_or_end(first_prediction_date)
})
statics = serie.tail(1).drop(columns=['ds', 'y'])
pd.testing.assert_frame_equal(updates, statics.merge(expected))


test_eq(ts.curr_dates[0], first_prediction_date)

In [None]:
#|hide
# _get_predictions
ts = TimeSeries(freq='D', lags=[1])
ts._fit(serie, id_col='unique_id', time_col='ds', target_col='y')
ts._predict_setup()
ts._update_features()
ts._update_y([1.])
preds = ts._get_predictions()

last_ds = serie['ds'].max()
expected = pd.DataFrame({'unique_id': serie['unique_id'][[0]], 'ds': [last_ds + pd.offsets.Day()], 'y_pred': [1.]})
pd.testing.assert_frame_equal(preds, expected)

In [None]:
show_doc(TimeSeries.fit_transform, title_level=2)

---

[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/core.py#L486){target="_blank" style="float:right; font-size:smaller"}

## TimeSeries.fit_transform

>      TimeSeries.fit_transform (data:~DFType, id_col:str, time_col:str,
>                                target_col:str,
>                                static_features:Optional[List[str]]=None,
>                                dropna:bool=True,
>                                keep_last_n:Optional[int]=None,
>                                max_horizon:Optional[int]=None,
>                                return_X_y:bool=False, as_numpy:bool=False)

*Add the features to `data` and save the required information for the predictions step.

If not all features are static, specify which ones are in `static_features`.
If you don't want to drop rows with null values after the transformations set `dropna=False`
If `keep_last_n` is not None then that number of observations is kept across all series for updates.*

In [None]:
flow_config = dict(
    freq='D',
    lags=[7, 14],
    lag_transforms={
        2: [
            (rolling_mean, 7),
            (rolling_mean, 14),
        ]
    },
    date_features=['dayofweek', 'month', 'year'],
    num_threads=2
)

ts = TimeSeries(**flow_config)
_ = ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y')

The series values are stored as a GroupedArray in an attribute `ga`. If the data type of the series values is an int, it is converted to `np.float32` because lags generate `np.nan`s, so we need a float data type for them.

In [None]:
np.testing.assert_equal(ts.ga.data, series.y.values)

The series ids are stored in a `uids` attribute.

In [None]:
test_eq(ts.uids, series['unique_id'].unique())

For each time series, the last observed date is stored so that predictions start from the last date + the frequency.

In [None]:
test_eq(ts.last_dates, series.groupby('unique_id', observed=True)['ds'].max().values)

The last row of every series, without the `y` and `ds` columns, is taken as its static features.

In [None]:
pd.testing.assert_frame_equal(
    ts.static_features_,
    series.groupby('unique_id', observed=True).tail(1).drop(columns=['ds', 'y']).reset_index(drop=True),
)

If you pass `static_features` to `TimeSeries.fit_transform` then only these are kept.

In [None]:
ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y', static_features=['static_0'])

pd.testing.assert_frame_equal(
    ts.static_features_,
    series.groupby('unique_id', observed=True).tail(1)[['unique_id', 'static_0']].reset_index(drop=True),
)

You can also specify `keep_last_n` in `TimeSeries.fit_transform`, which means that after computing the features for training we want to keep only the last n samples of each time series for computing the updates. This saves both memory and time, since the updates are performed by running the transformation functions on all time series again and keeping only the last value (the update).

If you have very long time series and your updates only require a small sample it's recommended that you set keep_last_n to the minimum number of samples required to compute the updates, which in this case is 15 since we have a rolling mean of size 14 over the lag 2 and in the first update the lag 2 becomes the lag 1. This is because in the first update the lag 1 is the last value of the series (or the lag 0), the lag 2 is the lag 1 and so on.

In [None]:
keep_last_n = 15

ts = TimeSeries(**flow_config)
df = ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y', keep_last_n=keep_last_n)
ts._predict_setup()

expected_lags = ['lag7', 'lag14']
expected_transforms = ['rolling_mean_lag2_window_size7', 
                       'rolling_mean_lag2_window_size14']
expected_date_features = ['dayofweek', 'month', 'year']

test_eq(ts.features, expected_lags + expected_transforms + expected_date_features)
test_eq(ts.static_features_.columns.tolist() + ts.features, df.columns.drop(['ds', 'y']).tolist())
# we dropped 2 rows because of the lag 2 and 13 more to have the window of size 14
test_eq(df.shape[0], series.shape[0] - (2 + 13) * ts.ga.n_groups)
test_eq(ts.ga.data.size, ts.ga.n_groups * keep_last_n)

`TimeSeries.fit_transform` requires that the *y* column doesn't have any null values. This is because the transformations could propagate them forward, so if you have null values in the *y* column you'll get an error.

In [None]:
series_with_nulls = series.copy()
series_with_nulls.loc[1, 'y'] = np.nan
test_fail(
    lambda: ts.fit_transform(series_with_nulls, id_col='unique_id', time_col='ds', target_col='y'),
    contains='y column contains null values'
)

In [None]:
#|hide
# unsorted df
ts = TimeSeries(**flow_config)
df = ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y')
unordered_series = series.sample(frac=1.0)
assert not unordered_series.set_index('ds', append=True).index.is_monotonic_increasing
df2 = ts.fit_transform(unordered_series, id_col='unique_id', time_col='ds', target_col='y')
pd.testing.assert_frame_equal(
    df.reset_index(drop=True),
    df2.sort_values(['unique_id', 'ds']).reset_index(drop=True)
)

In [None]:
#| hide
# existing features aren't recomputed
df_with_features = pd.DataFrame({
    'unique_id': [1, 1, 1],
    'ds': pd.date_range('2000-01-01', freq='D', periods=3),
    'y': [10., 11., 12.],
    'lag1': [1, 1, 1],
    'month': [12, 12, 12],
    
})
ts = TimeSeries(freq='D', lags=[1, 2], date_features=['year', 'month'])
transformed = ts.fit_transform(df_with_features, id_col='unique_id', time_col='ds', target_col='y', dropna=False)
pd.testing.assert_series_equal(transformed['lag1'], df_with_features['lag1'])
pd.testing.assert_series_equal(transformed['month'], df_with_features['month'])
np.testing.assert_array_equal(transformed['year'], 3 * [2000])
np.testing.assert_array_equal(transformed['lag2'].values, [np.nan, np.nan, 10.])

In [None]:
#|hide
# non-standard df
ts = TimeSeries(**flow_config)
df = ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y')
non_std_series = series.reset_index().rename(columns={'unique_id': 'some_id', 'ds': 'timestamp', 'y': 'value'})
non_std_res = ts.fit_transform(
    non_std_series, id_col='some_id', time_col='timestamp', target_col='value', static_features=[]
)
non_std_res = non_std_res.reset_index(drop=True)
pd.testing.assert_frame_equal(
    df.reset_index(),
    non_std_res.rename(columns={'timestamp': 'ds', 'value': 'y', 'some_id': 'unique_id'})
)

In [None]:
#|hide
# integer timestamps
def identity(x):
    """Return the input untouched."""
    return x

flow_config_int_ds = copy.deepcopy(flow_config)
flow_config_int_ds['date_features'] = [identity]
flow_config_int_ds['freq'] = 1
ts = TimeSeries(**flow_config_int_ds)
int_ds_series = series.copy()
int_ds_series['ds'] = int_ds_series['ds'].astype('int64')
int_ds_res = ts.fit_transform(int_ds_series, id_col='unique_id', time_col='ds', target_col='y')
int_ds_res['ds'] = pd.to_datetime(int_ds_res['ds'])
int_ds_res['identity'] = pd.to_datetime(int_ds_res['ds'])
df2 = df.drop(columns=flow_config['date_features'])
df2['identity'] = df2['ds']
pd.testing.assert_frame_equal(df2, int_ds_res)

In [None]:
show_doc(TimeSeries.predict, title_level=2)

Once we have a trained model we can use `TimeSeries.predict` passing the model and the horizon to get the predictions back.

In [None]:
class DummyModel:
    """Toy model whose prediction is the value of the `lag7` feature."""

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        lag7 = X['lag7']
        return lag7.values

horizon = 7
model = DummyModel()
ts = TimeSeries(**flow_config)
ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y')
predictions = ts.predict({'DummyModel': model}, horizon)

grouped_series = series.groupby('unique_id', observed=True)
expected_preds = grouped_series['y'].tail(7)  # the model predicts the lag-7
last_dates = grouped_series['ds'].max()
expected_dsmin = last_dates + pd.offsets.Day()
expected_dsmax = last_dates + horizon * pd.offsets.Day()
grouped_preds = predictions.groupby('unique_id', observed=True)

np.testing.assert_allclose(predictions['DummyModel'], expected_preds)
pd.testing.assert_series_equal(grouped_preds['ds'].min(), expected_dsmin)
pd.testing.assert_series_equal(grouped_preds['ds'].max(), expected_dsmax)

In [None]:
#|hide
model = DummyModel()
ts = TimeSeries(**flow_config)
ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y')
predictions = ts.predict({'DummyModel': model}, horizon=horizon)
ts = TimeSeries(**flow_config_int_ds)
ts.fit_transform(int_ds_series, id_col='unique_id', time_col='ds', target_col='y')
int_ds_predictions = ts.predict({'DummyModel': model}, horizon=horizon)
pd.testing.assert_frame_equal(predictions.drop(columns='ds'), int_ds_predictions.drop(columns='ds'))

If we have dynamic features we can pass them to `X_df`.

In [None]:
class PredictPrice:
    """Toy model whose prediction is the `price` feature."""

    def predict(self, X):
        price = X['price']
        return price

series = generate_daily_series(20, n_static_features=2, equal_ends=True)
dynamic_series = series.rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(dynamic_series)
series_with_prices = dynamic_series.merge(prices_catalog, how='left')

model = PredictPrice()
ts = TimeSeries(**flow_config)
ts.fit_transform(
    series_with_prices,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=['static_0', 'product_id'],
)
predictions = ts.predict({'PredictPrice': model}, horizon=1, X_df=prices_catalog)
pd.testing.assert_frame_equal(
    predictions.rename(columns={'PredictPrice': 'price'}),
    prices_catalog.merge(predictions[['unique_id', 'ds']])[['unique_id', 'ds', 'price']]
)

In [None]:
#| hide
# predicting for a subset
sample_ids = ['id_00', 'id_16']
sample_preds = ts.predict({'price': model}, 1, X_df=prices_catalog, ids=sample_ids)
pd.testing.assert_frame_equal(
    sample_preds,
    prices_catalog.merge(predictions[predictions['unique_id'].isin(sample_ids)][['unique_id', 'ds']])[['unique_id', 'ds', 'price']]
)
test_fail(lambda: ts.predict({'y': model}, 1, ids=['bonjour']), contains="{'bonjour'}")

In [None]:
show_doc(TimeSeries.update, title_level=2)

In [None]:
#| hide
class SeasonalNaiveModel:
    """Toy model whose prediction is the `lag7` feature."""

    def predict(self, X):
        lag7 = X['lag7']
        return lag7

class NaiveModel:
    """Toy model whose prediction is the `lag1` feature."""

    def predict(self, X: pd.DataFrame):
        lag1 = X['lag1']
        return lag1

two_series = series[series['unique_id'].isin(['id_00', 'id_19'])].copy()
two_series['unique_id'] = pd.Categorical(two_series['unique_id'], ['id_00', 'id_19'])
ts = TimeSeries(freq='D', lags=[1], date_features=['dayofweek'])
ts.fit_transform(
    two_series,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
)
last_vals_two_series = two_series.groupby('unique_id', observed=True).tail(1)
last_val_id0 = last_vals_two_series[lambda x: x['unique_id'].eq('id_00')].copy()
new_values = last_val_id0.copy()
new_values['ds'] += pd.offsets.Day()
new_serie = pd.DataFrame({
    'unique_id': ['new_idx', 'new_idx'],
    'ds': pd.to_datetime(['2020-01-01', '2020-01-02']),
    'y': [5.0, 6.0],
    'static_0': [0, 0],
    'static_1': [1, 1],
})
new_values = pd.concat([new_values, new_serie])
ts.update(new_values)
preds = ts.predict({'Naive': NaiveModel()}, 1)
expected_id0 = last_val_id0.copy()
expected_id0['ds'] += pd.offsets.Day()
expected_id1 = last_vals_two_series[lambda x: x['unique_id'].eq('id_19')].copy()
last_val_new_serie = new_serie.tail(1)[['unique_id', 'ds', 'y']]
expected = pd.concat([expected_id0, expected_id1, last_val_new_serie])
expected = expected[['unique_id', 'ds', 'y']]
expected = expected.rename(columns={'y': 'Naive'}).reset_index(drop=True)
expected['unique_id'] = pd.Categorical(expected['unique_id'], categories=['id_00', 'id_19', 'new_idx'])
expected['ds'] += pd.offsets.Day()
pd.testing.assert_frame_equal(preds, expected)
pd.testing.assert_frame_equal(
    ts.static_features_,
    (
        pd.concat([last_vals_two_series, new_serie.tail(1)])
        [['unique_id', 'static_0', 'static_1']]
        .astype(ts.static_features_.dtypes)
        .reset_index(drop=True)
    )
)
# with target transforms
ts = TimeSeries(
    freq='D',
    lags=[7],
    target_transforms=[Differences([1, 2]), LocalStandardScaler()],
)
ts.fit_transform(two_series, id_col='unique_id', time_col='ds', target_col='y')
new_values = two_series.groupby('unique_id', observed=True).tail(7).copy()
new_values['ds'] += 7 * pd.offsets.Day()
orig_last7 = ts.ga.take_from_groups(slice(-7, None)).data
ts.update(new_values)
preds = ts.predict({'SeasonalNaive': SeasonalNaiveModel()}, 7)
np.testing.assert_allclose(
    new_values['y'].values,
    preds['SeasonalNaive'].values,
)
last7 = ts.ga.take_from_groups(slice(-7, None)).data
assert 0 < np.abs(last7 / orig_last7 - 1).mean() < 0.5

In [None]:
#| hide
#| polars
two_series = generate_daily_series(2, n_static_features=2, engine='polars')
ts = TimeSeries(freq='1d', lags=[1], date_features=['weekday'])
ts.fit_transform(
    two_series,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
)
last_vals_two_series = two_series.join(
    two_series.group_by('unique_id').agg(pl.col('ds').max()), on=['unique_id', 'ds']
)
last_val_id0 = last_vals_two_series.filter(pl.col('unique_id') == 'id_0')
new_values = last_val_id0.with_columns(
    pl.col('unique_id').cast(pl.Categorical),
    pl.col('ds').dt.offset_by('1d'),
    pl.col('static_0').cast(pl.Int64),
    pl.col('static_1').cast(pl.Int64),
)
new_serie = pl.DataFrame({
    'unique_id': ['new_idx', 'new_idx'],
    'ds': [datetime.datetime(2020, 1, 1), datetime.datetime(2020, 1, 2)],
    'y': [5.0, 6.0],
    'static_0': [0, 0],
    'static_1': [1, 1],
}).with_columns(
    pl.col('ds').dt.cast_time_unit('ns'),
    pl.col('unique_id').cast(pl.Categorical),
)
new_values = pl.concat([new_values, new_serie])
ts.update(new_values)
preds = ts.predict({'Naive': NaiveModel()}, 1)
expected_id0 = last_val_id0.with_columns(pl.col('ds').dt.offset_by('1d'))
expected_id1 = last_vals_two_series.filter(pl.col('unique_id') == 'id_1')
last_val_new_serie = new_serie.tail(1)
expected = pl.concat([expected_id0, expected_id1])
expected = ufp.vertical_concat([expected, last_val_new_serie])
pd.testing.assert_series_equal(
    expected['unique_id'].cat.get_categories().to_pandas(),
    pd.Series(['id_0', 'id_1', 'new_idx'], name='unique_id')
)
expected = expected[['unique_id', 'ds', 'y']]
expected = ufp.rename(expected, {'y': 'Naive'})
expected = expected.with_columns(pl.col('ds').dt.offset_by('1d'))
pd.testing.assert_frame_equal(preds.to_pandas(), expected.to_pandas())
pd.testing.assert_frame_equal(
    ts.static_features_.to_pandas(),
    (
        ufp.vertical_concat([last_vals_two_series, new_serie.tail(1)])
        [['unique_id', 'static_0', 'static_1']]
        .to_pandas()
        .astype(ts.static_features_.to_pandas().dtypes)
        .reset_index(drop=True)
    )
)
# with target transforms
ts = TimeSeries(
    freq='1d',
    lags=[7],
    target_transforms=[Differences([1, 2]), LocalStandardScaler()],
)
ts.fit_transform(two_series, id_col='unique_id', time_col='ds', target_col='y')
new_values = two_series.group_by('unique_id').tail(7)
new_values = new_values.with_columns(pl.col('ds').dt.offset_by('7d'))
orig_last7 = ts.ga.take_from_groups(slice(-7, None)).data
ts.update(new_values)
preds = ts.predict({'SeasonalNaive': SeasonalNaiveModel()}, 7)
np.testing.assert_allclose(
    new_values['y'].to_numpy(),
    preds['SeasonalNaive'].to_numpy(),
)
last7 = ts.ga.take_from_groups(slice(-7, None)).data
assert 0 < np.abs(last7 / orig_last7 - 1).mean() < 0.5

In [None]:
#| hide
# target_transform with keep_last_n
ts = TimeSeries(freq='D', lags=[1], target_transforms=[LocalStandardScaler()])
ts.fit_transform(series, id_col='unique_id', time_col='ds', target_col='y', keep_last_n=10)
preds = ts.predict({'y': NaiveModel()}, 1)
expected = series.groupby('unique_id', observed=True).tail(1)[['unique_id', 'ds', 'y']].reset_index(drop=True)
expected['ds'] += pd.offsets.Day()
pd.testing.assert_frame_equal(preds, expected)

In [None]:
#| hide
# raise error when omitting the static_features argument and passing them as dynamic in predict
valid = series.groupby('unique_id', observed=True).tail(10)
train = series.drop(valid.index)
ts = TimeSeries(freq='D', lags=[1], target_transforms=[LocalStandardScaler()])
ts.fit_transform(train, id_col='unique_id', time_col='ds', target_col='y', keep_last_n=10)
test_fail(lambda: ts.predict({'y': NaiveModel()}, 1, X_df=valid.drop(columns=['y'])), contains="['static_0', 'static_1']")

In [None]:
#| hide
#| polars
series_pl = generate_daily_series(5, static_as_categorical=False, n_static_features=5, engine='polars')
series_pd = generate_daily_series(5, static_as_categorical=False, n_static_features=5, engine='pandas')
series_pl = series_pl.with_columns(pl.col('unique_id').cast(str))
series_pd['unique_id'] = series_pd['unique_id'].astype(str)

cfg = dict(
    lags=[1, 2, 3, 4],
    lag_transforms={
        1: [expanding_mean, (rolling_mean, 7), (rolling_mean, 14)],
        2: [expanding_mean, (rolling_mean, 7), (rolling_mean, 14)],
        3: [expanding_mean, (rolling_mean, 7), (rolling_mean, 14)],
        4: [expanding_mean, (rolling_mean, 7), (rolling_mean, 14)],
    },
    date_features=['day', 'month', 'quarter', 'year'],
    target_transforms=[Differences([1])],
)
feats_pl = SaveFeatures()
ts_pl = TimeSeries(freq='1d', **cfg)
prep_pl = ts_pl.fit_transform(series_pl, 'unique_id', 'ds', 'y')
fcst_pl = ts_pl.predict({'y': NaiveModel()}, 2, before_predict_callback=feats_pl)

feats_pd = SaveFeatures()
ts_pd = TimeSeries(freq='1D', **cfg)
prep_pd = ts_pd.fit_transform(series_pd, 'unique_id', 'ds', 'y')
fcst_pd = ts_pd.predict({'y': NaiveModel()}, 2, before_predict_callback=feats_pd)

prep_pd = prep_pd.reset_index(drop=True)
prep_pl = prep_pl.to_pandas()
fcst_pl = fcst_pl.to_pandas()
# date features have different dtypes
pd.testing.assert_frame_equal(prep_pl, prep_pd, check_dtype=False)
pd.testing.assert_frame_equal(
    feats_pl.get_features(with_step=True).to_pandas(),
    feats_pd.get_features(with_step=True),
    check_dtype=False,
)
pd.testing.assert_frame_equal(fcst_pl, fcst_pd)

In [None]:
#| hide
# dropped series
for ordered in [True, False]:
    series = generate_daily_series(10, min_length=5, max_length=20)
    if not ordered:
        series = series.sample(frac=1.0, random_state=40)
    ts = TimeSeries(freq='D', lags=[10])
    with warnings.catch_warnings(record=True):
        prep = ts.fit_transform(series, 'unique_id', 'ds', 'y')
    dropped = ts.uids[ts._dropped_series].tolist()
    assert not prep['unique_id'].isin(dropped).any()
    assert set(prep['unique_id'].unique().tolist() + dropped) == set(series['unique_id'].unique())

In [None]:
#| hide
# short series exception
series = generate_daily_series(2, min_length=5, max_length=15)
ts = TimeSeries(freq='D', lags=[1], target_transforms=[Differences([20])])
test_fail(
    lambda: ts.fit_transform(series, 'unique_id', 'ds', 'y'),
    contains="are too short for the 'Differences' transformation"
)

In [None]:
#| hide
# test predict
class Lag1PlusOneModel:
    """Toy model that predicts the `lag1` feature plus one."""

    def predict(self, X):
        lag1 = X['lag1']
        return lag1 + 1

ts = TimeSeries(freq='D', lags=[1])
for max_horizon in [None, 2]:
    if max_horizon is None:
        mod1 = Lag1PlusOneModel()
        mod2 = Lag1PlusOneModel()
    else:
        mod1 = [Lag1PlusOneModel() for _ in range(max_horizon)]
        mod2 = [Lag1PlusOneModel() for _ in range(max_horizon)]
    ts.fit_transform(train, 'unique_id', 'ds', 'y', max_horizon=max_horizon)
    # each model gets the correct historic values
    preds = ts.predict(models={'mod1': mod1, 'mod2': mod2}, horizon=2)
    np.testing.assert_allclose(preds['mod1'], preds['mod2'])
    # idempotency
    preds2 = ts.predict(models={'mod1': mod1, 'mod2': mod2}, horizon=2)
    np.testing.assert_allclose(preds2['mod1'], preds2['mod2'])
    pd.testing.assert_frame_equal(preds, preds2)

In [None]:
#| hide
# save & load
series = generate_daily_series(2, n_static_features=2)
ts = TimeSeries(
    freq='D',
    lags=[1, 2],
    date_features=['dayofweek'],
    lag_transforms={
        1: [RollingMean(1)]
    },
    target_transforms=[Differences([20])],
)
ts.fit_transform(series, 'unique_id', 'ds', 'y')
with tempfile.TemporaryDirectory() as tmpdir:
    fname = Path(tmpdir) / 'hi'
    ts.save(fname)
    ts2 = TimeSeries.load(fname)
preds = ts.predict({'model': NaiveModel()}, 10)
preds2 = ts2.predict({'model': NaiveModel()}, 10)
pd.testing.assert_frame_equal(preds, preds2)