In [None]:
#default_exp core

# Core

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#export
import concurrent.futures
import inspect
import os
import warnings
from collections import OrderedDict
from itertools import chain
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from numba import njit
from window_ops.shift import shift_array

from mlforecast.utils import data_indptr_from_sorted_df

In [None]:
from nbdev import show_doc, test_eq, test_fail, test_warns
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

from mlforecast.utils import generate_daily_series

## Data format

The required input format is a dataframe with an index named `unique_id` with a unique identifier for each time series, a column `ds` with the datestamp and a column `y` with the values of the series. Every other column is considered a static feature unless stated otherwise through the `static_features` argument of `TimeSeries.fit_transform`.

In [None]:
series = generate_daily_series(20, n_static_features=2)
series

For simplicity we'll just take one time series here.

In [None]:
uids = series.index.unique(level='unique_id')
serie = series.loc[[uids[0]]]
serie

In [None]:
#exporti
# Smallest unsigned integer dtype used to store each supported datetime attribute.
# Only the year and the day of the year can exceed 255, every other attribute fits in 8 bits.
date_features_dtypes = {
    feature: np.uint16 if feature in ('year', 'dayofyear', 'day_of_year') else np.uint8
    for feature in (
        'year',
        'month',
        'day',
        'hour',
        'minute',
        'second',
        'dayofyear',
        'day_of_year',
        'weekofyear',
        'week',
        'dayofweek',
        'day_of_week',
        'weekday',
        'quarter',
        'daysinmonth',
        'is_month_start',
        'is_month_end',
        'is_quarter_start',
        'is_quarter_end',
        'is_year_start',
        'is_year_end',
    )
}


@njit
def _append_new(data, indptr, new):
    """Append each value of new to each group in data formed by indptr.

    Group `i` of the input is `data[indptr[i] : indptr[i + 1]]` and receives `new[i]`
    as its last element. Returns a tuple `(new_data, new_indptr)` with freshly
    allocated arrays; the inputs are not modified."""
    n_series = len(indptr) - 1
    new_data = np.empty(data.size + new.size, dtype=data.dtype)
    new_indptr = indptr.copy()
    # by the end of group i, i + 1 extra elements have been inserted,
    # so every group's end boundary moves forward by that amount
    new_indptr[1:] += np.arange(1, n_series + 1)
    for i in range(n_series):
        prev_slice = slice(indptr[i], indptr[i + 1])
        # all positions of the enlarged group except the last one hold the old values
        new_slice = slice(new_indptr[i], new_indptr[i + 1] - 1)
        new_data[new_slice] = data[prev_slice]
        # the last position of the enlarged group holds the appended value
        new_data[new_indptr[i + 1] - 1] = new[i]
    return new_data, new_indptr


In [None]:
#exporti
class GroupedArray:
    """Array made up of different groups. Can be thought of (and iterated) as a list of arrays.
    
    All the data is stored in a single 1d array `data`.
    The indices for the group boundaries are stored in another 1d array `indptr`,
    so that group `i` is `data[indptr[i] : indptr[i + 1]]`.
    
    Iteration relies on the sequence protocol (`__len__` + `__getitem__`), which stops
    when `__getitem__` raises `IndexError`."""
    
    def __init__(self, data: np.ndarray, indptr: np.ndarray):
        self.data = data
        self.indptr = indptr
        self.ngroups = len(indptr) - 1
        
    def __len__(self) -> int:
        """Number of groups."""
        return self.ngroups
        
    def __getitem__(self, idx: int) -> np.ndarray:
        """Return the values of the group at position `idx`.
        
        Negative indices count from the last group, as in a list.
        Raises `IndexError` if `idx` is out of range."""
        pos = idx
        if pos < 0:
            # without this, a negative index would be applied to `indptr` directly
            # and the two boundaries would no longer belong to the same group
            pos += self.ngroups
        if not 0 <= pos < self.ngroups:
            raise IndexError(
                f'group index {idx} is out of range for {self.ngroups} groups'
            )
        return self.data[self.indptr[pos] : self.indptr[pos + 1]]
        
    def take_from_groups(self, idx: Union[int, slice]) -> 'GroupedArray':
        """Takes `idx` from each group in the array.
        
        `idx` is applied to the positions of every group independently, so groups
        with fewer elements than a slice asks for contribute the elements they have.
        Returns a new `GroupedArray`."""
        # index a `range` over each group's positions to get list-like semantics
        # (negative indices, slices that are clipped to the group size)
        ranges = [
            range(self.indptr[i], self.indptr[i + 1])[idx] for i in range(self.ngroups)
        ]
        items = [self.data[rng] for rng in ranges]
        sizes = np.array([item.size for item in items])
        data = np.hstack(items)
        indptr = np.append(0, sizes.cumsum())
        return GroupedArray(data, indptr)
        
    def append(self, new: np.ndarray) -> 'GroupedArray':
        """Appends each element of `new` to each existing group. Returns a copy.
        
        Raises `ValueError` if `new` doesn't have exactly one element per group."""
        if new.size != self.ngroups:
            raise ValueError(f'new must be of size {self.ngroups}')
        new_data, new_indptr = _append_new(self.data, self.indptr, new)
        return GroupedArray(new_data, new_indptr)

    def __repr__(self) -> str:
        return f'GroupedArray(ndata={self.data.size}, ngroups={self.ngroups})'


In [None]:
#hide
# The `GroupedArray` is used internally for storing the series values and performing transformations.
data = np.arange(10, dtype=np.float32)
indptr = np.array([0, 2, 10])  # group 1: [0, 1], group 2: [2..9]
ga = GroupedArray(data, indptr)

In [None]:
#hide
# Iterate through the groups
ga_iter = iter(ga)
np.testing.assert_equal(next(ga_iter), np.array([0, 1]))
np.testing.assert_equal(next(ga_iter), np.arange(2, 10))

In [None]:
# hide
# Take the last two observations from every group
last_2 = ga.take_from_groups(slice(-2, None))
np.testing.assert_equal(last_2.data, np.array([0, 1, 8, 9]))
np.testing.assert_equal(last_2.indptr, np.array([0, 2, 4]))

In [None]:
#hide
# Take the last four observations from every group. Note that since group 1 only has two elements, only these are returned.
last_4 = ga.take_from_groups(slice(-4, None))
np.testing.assert_equal(last_4.data, np.array([0, 1, 6, 7, 8, 9]))
np.testing.assert_equal(last_4.indptr, np.array([0, 2, 6]))

In [None]:
#exporti
@njit
def _identity(x: np.ndarray) -> np.ndarray:
    """Return the input unchanged.

    Used as the transformation function for plain lags, where the shifted
    series is itself the feature."""
    return x


def _as_tuple(x):
    """Return a tuple from the input."""
    if isinstance(x, tuple):
        return x
    return (x,)


@njit(nogil=True)
def _transform_series(data, indptr, updates_only, lag, func, *args) -> np.ndarray:
    """Shifts every group in `data` by `lag` and computes `func(shifted, *args)`.
    
    Group `i` is `data[indptr[i] : indptr[i + 1]]`.
    If `updates_only=True` only last value of the transformation for each group is returned
    (one element per group), otherwise the full transformation is returned
    (same size and group layout as `data`)."""
    n_series = len(indptr) - 1
    if updates_only:
        # one output slot per group, with the same dtype as the data
        out = np.empty_like(data[:n_series])
        for i in range(n_series):
            lagged = shift_array(data[indptr[i] : indptr[i + 1]], lag)
            # the transformation is computed over the whole group, only its last value is kept
            out[i] = func(lagged, *args)[-1]        
    else:
        out = np.empty_like(data)
        for i in range(n_series):
            lagged = shift_array(data[indptr[i] : indptr[i + 1]], lag)
            # results are written in place following the input's group boundaries
            out[indptr[i] : indptr[i + 1]] = func(lagged, *args)
    return out


In [None]:
#exporti
def _build_transform_name(lag, tfm, *args) -> str:
    """Creates a name for a transformation based on `lag`, the name of the function and its arguments."""
    tfm_name = f'{tfm.__name__}_lag-{lag}'
    func_params = inspect.signature(tfm).parameters
    func_args = list(func_params.items())[1:]  # remove input array argument
    changed_params = [
        f'{name}-{value}'
        for value, (name, arg) in zip(args, func_args)
        if arg.default != value
    ]
    if changed_params:
        tfm_name += '_' + '_'.join(changed_params)
    return tfm_name


In [None]:
#hide
test_eq(_build_transform_name(1, expanding_mean), 'expanding_mean_lag-1')
test_eq(_build_transform_name(2, rolling_mean, 7), 'rolling_mean_lag-2_window_size-7')

In [None]:
#export
def simple_predict(model, new_x: pd.DataFrame, *args) -> np.ndarray:
    """Call `model.predict` on `new_x` without its `ds` column.

    Extra positional arguments are accepted and ignored.
    If `model` is an `xgboost.core.Booster` the features are wrapped in an
    `xgb.DMatrix` before predicting."""
    features = new_x.drop(columns='ds')
    model_cls = type(model)
    # compare by module and class name, this avoids installing xgboost
    # only to check if the model is instance of xgb.Booster
    if (model_cls.__module__, model_cls.__name__) != ('xgboost.core', 'Booster'):
        return model.predict(features)
    import xgboost as xgb

    return model.predict(xgb.DMatrix(features))


In [None]:
#export
class TimeSeries:
    """Utility class for storing and transforming time series data.
    
    The constructor defines the features to compute: plain `lags`, `lag_transforms`
    (functions applied over a lagged version of every series) and `date_features`
    (attributes extracted from the datestamps). `fit_transform` computes them for
    training and stores what `predict` needs to update them one step at a time."""
    
    def __init__(
        self,
        freq: str = 'D',
        lags: List[int] = [],
        lag_transforms: Dict[int, List[Tuple]] = {},
        date_features: List[str] = [],
        num_threads: Optional[int] = None,
    ):
        """Define the features to compute.
        
        `freq` is converted to a pandas offset.
        `lags` are the lags of the target to use as features.
        `lag_transforms` maps a lag to the transformations applied over it, each one
        being either a function or a tuple `(function, *args)`.
        `date_features` are names of datetime attributes to extract from the dates.
        `num_threads` defaults to `os.cpu_count()` when it is `None`."""
        self.freq = pd.tseries.frequencies.to_offset(freq)
        self.num_threads = num_threads or os.cpu_count()
        # the defaults are kept as literals for interface compatibility. `lags` and
        # `lag_transforms` are only read, but `date_features` is stored, so it is
        # copied to avoid aliasing the shared default (or the caller's list)
        self.date_features = list(date_features)
        
        self.transforms: Dict[str, Tuple[Any, ...]] = OrderedDict()
        for lag in lags:
            # a plain lag is the identity function applied over the shifted series
            self.transforms[f'lag-{lag}'] = (lag, _identity)
        for lag, lag_tfms in lag_transforms.items():
            for tfm_args in lag_tfms:
                tfm, *args = _as_tuple(tfm_args)
                tfm_name = _build_transform_name(lag, tfm, *args)
                self.transforms[tfm_name] = (lag, tfm, *args)
                
        self.ga: GroupedArray
                
    @property
    def features(self) -> List[str]:
        """Names of all computed features."""
        return list(self.transforms.keys()) + self.date_features
                
    def __repr__(self):
        return f'TimeSeries(freq={self.freq}, transforms={list(self.transforms.keys())}, date_features={self.date_features}, num_threads={self.num_threads})'
    
    def _apply_transforms(self, updates_only: bool = False) -> Dict[str, np.ndarray]:
        """Apply the transformations using the main process.
        
        If `updates_only` then only the updates are returned.
        """
        results = {}
        # when updating, the last stored value is already "visible" to the next
        # timestep, so every lag is one less than during training
        offset = 1 if updates_only else 0
        for tfm_name, (lag, tfm, *args) in self.transforms.items():
            results[tfm_name] = _transform_series(
                self.ga.data, self.ga.indptr, updates_only, lag - offset, tfm, *args
            )
        return results

    def _apply_multithreaded_transforms(
        self, updates_only: bool = False
    ) -> Dict[str, np.ndarray]:
        """Apply the transformations using multithreading.
        
        If `updates_only` then only the updates are returned.
        """
        future_to_result = {}
        results = {}
        # same lag adjustment as in `_apply_transforms`
        offset = 1 if updates_only else 0
        with concurrent.futures.ThreadPoolExecutor(self.num_threads) as executor:
            for tfm_name, (lag, tfm, *args) in self.transforms.items():
                future = executor.submit(
                    _transform_series,
                    self.ga.data,
                    self.ga.indptr,
                    updates_only,
                    lag - offset,
                    tfm,
                    *args,
                )
                future_to_result[future] = tfm_name
            for future in concurrent.futures.as_completed(future_to_result):
                tfm_name = future_to_result[future]
                results[tfm_name] = future.result()
        return results
    
    def _compute_transforms(self, updates_only: bool = False) -> Dict[str, np.ndarray]:
        """Compute the transformations defined in the constructor.

        If `self.num_threads > 1` and there is more than one transformation these
        are computed using multithreading.
        If `updates_only` then only the updates are returned."""
        # a thread pool brings nothing when there's at most one task to run
        if self.num_threads == 1 or len(self.transforms) <= 1:
            return self._apply_transforms(updates_only=updates_only)
        return self._apply_multithreaded_transforms(updates_only=updates_only)
    
    def _update_y(self, new: np.ndarray) -> None:
        """Appends the elements of `new` to every time serie.

        These values are used to update the transformations and are stored as predictions.
        Raises `ValueError` (from `GroupedArray.append`) if `new` doesn't have one value per serie."""
        new_arr = np.asarray(new)
        # append first: if the size check fails nothing is recorded as a prediction,
        # so `y_pred` and `ga` can't get out of sync
        self.ga = self.ga.append(new_arr)
        if not hasattr(self, 'y_pred'):
            self.y_pred = []
        self.y_pred.append(new)
        
    def _update_features(self) -> pd.DataFrame:
        """Compute the current values of all the features using the latest values of the time series.
        
        Advances the current dates by one period and returns a dataframe with the
        static features, the updated features and the new dates in `ds`.
        Warns if any of the updated features has null values."""
        if not hasattr(self, 'curr_dates'):
            # allows calling this right after `_fit`, without going through `predict`
            self.curr_dates = self.last_dates.copy()
            self.test_dates = []
        self.curr_dates += self.freq
        self.test_dates.append(self.curr_dates)

        features = self._compute_transforms(updates_only=True)

        for feature in self.date_features:
            feat_vals = getattr(self.curr_dates, feature).values
            features[feature] = feat_vals.astype(date_features_dtypes[feature])

        features_df = pd.DataFrame(features, columns=self.features, index=self.uids)
        nulls_in_cols = features_df.isnull().any()
        if any(nulls_in_cols):
            warnings.warn(
                f'Found null values in {", ".join(nulls_in_cols[nulls_in_cols].index)}.'
            )
        results_df = self.static_features.join(features_df)
        results_df['ds'] = self.curr_dates
        return results_df
    
    def _get_predictions(self) -> pd.DataFrame:
        """Get all the predicted values with their corresponding ids and datestamps."""
        n_preds = len(self.y_pred)
        # every id is repeated once per predicted timestep
        idx = pd.Index(
            chain.from_iterable([uid] * n_preds for uid in self.uids),
            name='unique_id',
            dtype=self.uids.dtype,
        )
        df = pd.DataFrame(
            {
                # dates and predictions are stored one array per timestep; flattening
                # in column-major order puts all the timesteps of a serie together
                'ds': np.array(self.test_dates).ravel('F'), 
                'y_pred': np.array(self.y_pred).ravel('F'),
            },
            index=idx,
        )
        return df
    
    def _fit(
        self,
        df: pd.DataFrame,
        static_features: Optional[List[str]] = None,
    ) -> 'TimeSeries':
        """Save the series values, ids, last dates and static features.
        
        `df` is indexed by (`unique_id`, `ds`).
        If `static_features` is given only those columns are kept as static features."""
        data, indptr = data_indptr_from_sorted_df(df)
        if data.dtype not in (np.float32, np.float64):
            # since all transformations generate nulls, we need a float dtype
            data = data.astype(np.float32)
        self.ga = GroupedArray(data, indptr)
        self.uids = df.index.unique(level='unique_id')
        # `indptr[1:] - 1` are the positions of the last observation of every serie
        last_obs = df.iloc[indptr[1:] - 1]
        self.last_dates = last_obs.index.get_level_values('ds')
        self.static_features = last_obs.reset_index('ds').drop(columns=['ds', 'y'])
        if static_features is not None:
            self.static_features = self.static_features[static_features]
        return self

    def _transform(
        self,
        df: pd.DataFrame,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> pd.DataFrame:
        """Add the features to `df`.
        
        if `dropna=True` then all the null rows are dropped.
        if `keep_last_n` is not None then only that number of observations of every
        serie is kept for computing the updates."""
        df = df.reset_index('ds')
        features = self._compute_transforms()
        for feat in self.transforms.keys():
            df[feat] = features[feat]

        if dropna:
            # `df` is the frame created by `reset_index` above, the input is not modified
            df.dropna(inplace=True)

        for feature in self.date_features:
            feat_vals = getattr(df.ds.dt, feature).values
            df[feature] = feat_vals.astype(date_features_dtypes[feature])

        if keep_last_n is not None:
            self.ga = self.ga.take_from_groups(slice(-keep_last_n, None))
            
        # `predict` restores the series from this, discarding values appended by previous calls
        self._ga = GroupedArray(self.ga.data, self.ga.indptr)
        self.features_order_ = df.columns.drop(['ds', 'y'])
        return df

    def fit_transform(
        self,
        data: pd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None, 
    ) -> pd.DataFrame:
        """Add the features to `data` and save the required information for the predictions step.
        
        If not all features are static, specify which ones are in `static_features`.
        If you don't want to drop rows with null values after the transformations set `dropna=False`.
        If you want to keep only the last `n` values of each time serie set `keep_last_n=n`.
        
        Raises `ValueError` if the `y` column contains null values.
        """
        if data['y'].isnull().any():
            raise ValueError('y column contains null values.')
        data = data.set_index('ds', append=True)
        if not data.index.is_monotonic_increasing:
            data = data.sort_index()
        self._fit(data, static_features)
        return self._transform(data, dropna, keep_last_n)
    
    def _predict_setup(self) -> None:
        """Reset the state modified while predicting so that `predict` can be called many times."""
        self.curr_dates = self.last_dates.copy()
        self.test_dates = []
        self.y_pred = []
        self.ga = GroupedArray(self._ga.data, self._ga.indptr)
    
    def predict(
        self,
        model,
        horizon: int,
        predict_fn: Callable = simple_predict,
        **predict_fn_kwargs,
    ) -> pd.DataFrame:
        """Use `model` to predict the next `horizon` timesteps.
        
        `predict_fn(model, new_x, features_order, **predict_fn_kwargs)` is called on each timestep.
        Its output is appended to the series and used to compute the features of the next timestep.
        Returns a dataframe indexed by `unique_id` with the columns `ds` and `y_pred`."""
        self._predict_setup()
        for _ in range(horizon):
            new_x = self._update_features()
            predictions = predict_fn(
                model, new_x, self.features_order_, **predict_fn_kwargs
            )
            self._update_y(predictions)
        return self._get_predictions()


The `TimeSeries` class takes care of defining the transformations to be performed (`lags`, `lag_transforms` and `date_features`). The transformations can be computed using multithreading if `num_threads > 1`. If `num_threads=None` then it is set to the number of cpus using `os.cpu_count()`.

In [None]:
flow_config = dict(
    freq='W-THU',
    lags=[7],
    lag_transforms={
        1: [expanding_mean, (rolling_mean, 7)]
    },
    date_features=['dayofweek']
)

ts = TimeSeries(**flow_config)
ts

The frequency is converted to an offset.

In [None]:
test_eq(ts.freq, pd.tseries.frequencies.to_offset(flow_config['freq']))

The date features are stored as they were passed to the constructor.

In [None]:
test_eq(ts.date_features, flow_config['date_features'])

The transformations are stored as a dictionary where the key is the name of the transformation (name of the column in the dataframe with the computed features), which is built using `_build_transform_name`, and the value is a tuple where the first element is the lag it is applied to, followed by the function and then the function arguments.

In [None]:
test_eq(
    ts.transforms, 
    {
        'lag-7': (7, _identity),
        'expanding_mean_lag-1': (1, expanding_mean), 
        'rolling_mean_lag-1_window_size-7': (1, rolling_mean, 7)
        
    }
)

Note that for `lags` we define the transformation as the identity function applied to its corresponding lag. This is because `_transform_series` takes the lag as an argument and shifts the array before computing the transformation.

In [None]:
#hide
serie = serie.set_index('ds', append=True)

In [None]:
#hide
# _compute_transforms
y = serie.y.values
lag_1 = shift_array(y, 1)

# exercise both the single-threaded and the multithreaded code paths;
# the override is merged into the config so the loop value is the one actually used
for num_threads in (1, 2):
    ts = TimeSeries(**{**flow_config, 'num_threads': num_threads})
    ts._fit(serie)
    transforms = ts._compute_transforms()

    np.testing.assert_equal(transforms['lag-7'], shift_array(y, 7))
    np.testing.assert_equal(transforms['expanding_mean_lag-1'], expanding_mean(lag_1))
    np.testing.assert_equal(transforms['rolling_mean_lag-1_window_size-7'], rolling_mean(lag_1, 7))

In [None]:
#hide
# update_y
ts = TimeSeries()
ts._fit(serie)
# size of every group before appending anything
max_size = np.diff(ts.ga.indptr)
# `serie` holds a single series, so every update carries exactly one value
ts._update_y([1])
ts._update_y([2])

# each call grows every group by one element...
test_eq(np.diff(ts.ga.indptr), max_size + 2)
# ...and the appended values sit at the end of the data in call order
test_eq(ts.ga.data[-2:], [1, 2])

In [None]:
#hide
# _update_features
ts = TimeSeries(**flow_config)
ts._fit(serie)
# the dates are checked separately at the end of the cell
updates = ts._update_features().drop(columns='ds')

last_date = serie.index.get_level_values('ds').max()
first_prediction_date = last_date + ts.freq
expected_idx = pd.Index(ts.uids, name='unique_id')

# these have an offset because we can now "see" our last y value,
# i.e. every lag is one less than the one used during training
expected = pd.DataFrame({
    'lag-7': shift_array(y, 6)[-1],
    'expanding_mean_lag-1': expanding_mean(y)[-1],
    'rolling_mean_lag-1_window_size-7': rolling_mean(y, 7)[-1],
    'dayofweek': np.uint8([getattr(first_prediction_date, 'dayofweek')])},
    index=expected_idx
)
# the static features are taken from the last row of the series
statics = serie.tail(1).reset_index('ds').drop(columns=['ds', 'y'])
assert updates.equals(statics.join(expected))

# the current date was advanced by one period
test_eq(ts.curr_dates[0], first_prediction_date)

In [None]:
#hide
# _get_predictions
ts = TimeSeries()
ts._fit(serie)
# simulate a single prediction step: advance the dates and store one predicted value
ts._update_features()
ts._update_y([1.])
preds = ts._get_predictions()

last_ds = serie.index.get_level_values('ds').max()
# a single row for the only series, dated one period after its last observation
expected_idx = serie.index.get_level_values('unique_id')[[0]]
expected = pd.DataFrame({'ds': [last_ds + ts.freq], 'y_pred': [1.]},
                        index=expected_idx)
assert preds.equals(expected)

In [None]:
#hide
serie = serie.reset_index('ds')

In [None]:
show_doc(TimeSeries.fit_transform)

In [None]:
flow_config = dict(
    freq='D',
    lags=[7, 14],
    lag_transforms={
        1: [expanding_mean],
        2: [
            (rolling_mean, 7),
            (rolling_mean, 14),
        ]
    },
    date_features=['dayofweek', 'month', 'year'],
    num_threads=2
)

ts = TimeSeries(**flow_config)
_ = ts.fit_transform(series)

The series values are stored as a GroupedArray in an attribute `ga`. If the data type of the series values is not `np.float32` or `np.float64` (e.g. an int) then it is converted to `np.float32`. This is because lags generate `np.nan`s, so we need a float data type for them.

In [None]:
np.testing.assert_equal(ts.ga.data, series.y.values)

The series ids are stored in an `uids` attribute.

In [None]:
test_eq(ts.uids, series.index.unique())

For each time series, the last observed date is stored so that predictions start from the last date + the frequency.

In [None]:
test_eq(ts.last_dates, series.groupby('unique_id')['ds'].max().values)

The last row of every series, without the `y` and `ds` columns, is taken as the static features.

In [None]:
assert ts.static_features.equals(series.groupby('unique_id').tail(1).drop(columns=['ds', 'y']))

If you pass `static_features` to `TimeSeries.fit_transform` then only these are kept.

In [None]:
ts.fit_transform(series, static_features=['static_0'])

assert ts.static_features.equals(series.groupby('unique_id').tail(1)[['static_0']])

You can also specify `keep_last_n` in `TimeSeries.fit_transform`, which means that after computing the features for training we want to keep only the last `n` samples of each time series for computing the updates. This saves both memory and time, since the updates are performed by running the transformation functions on all time series again and keeping only the last value (the update).

If you have very long time series and your updates only require a small sample it's recommended that you set `keep_last_n` to the minimum number of samples required to compute the updates, which in this case is 15 since we have a rolling mean of size 14 over the lag 2 and in the first update the lag 2 becomes the lag 1. This is because in the first update the lag 1 is the last value of the series (or the lag 0), the lag 2 is the lag 1 and so on.

In [None]:
keep_last_n = 15

ts = TimeSeries(**flow_config)
df = ts.fit_transform(series, keep_last_n=keep_last_n)

expected_lags = ['lag-7', 'lag-14']
expected_transforms = ['expanding_mean_lag-1', 
                       'rolling_mean_lag-2_window_size-7', 
                       'rolling_mean_lag-2_window_size-14']
expected_date_features = ['dayofweek', 'month', 'year']

test_eq(ts.features, expected_lags + expected_transforms + expected_date_features)
test_eq(ts.static_features.columns.tolist() + ts.features, df.columns.drop(['ds', 'y']).tolist())
# we dropped 2 rows because of the lag 2 and 13 more to have the window of size 14
test_eq(df.shape[0], series.shape[0] - (2 + 13) * ts.ga.ngroups)
test_eq(ts.ga.data.size, ts.ga.ngroups * keep_last_n)

If you get the minimum number of samples that your transformation needs wrong i.e. you generate `np.nan`s in the update, you'll get a warning.

In [None]:
ts = TimeSeries(lag_transforms={2: [(rolling_mean, 14)]})
_ = ts.fit_transform(series, keep_last_n=14)
test_warns(lambda: ts._update_features(), show=True)

As a rule of thumb the minimum number of samples you need is max(lag + window_size) - 1 for rolling operations. **If you have expanding features you need all samples to get the correct value**, this won't raise a warning because there won't be null values, however the feature value will be wrong.

`TimeSeries.fit_transform` requires that the *y* column doesn't have any null values. This is because the transformations could propagate them forward, so if you have null values in the *y* column you'll get an error.

In [None]:
series_with_nulls = series.copy()
# locate the `y` column by label so the test doesn't depend on the column order
series_with_nulls.iloc[1, series_with_nulls.columns.get_loc('y')] = np.nan
test_fail(lambda: ts.fit_transform(series_with_nulls), contains='y column contains null values')

In [None]:
show_doc(TimeSeries.predict)

Once we have a trained model we can use `TimeSeries.predict` passing the model and the horizon to get the predictions back.

In [None]:
class DummyModel:
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        return X['lag-7'].values

horizon = 7
model = DummyModel()
ts = TimeSeries(**flow_config)
ts.fit_transform(series)
predictions = ts.predict(model, horizon)

grouped_series = series.groupby('unique_id')
expected_preds = grouped_series['y'].tail(7)  # the model predicts the lag-7
last_dates = grouped_series['ds'].max()
expected_dsmin = last_dates + ts.freq
expected_dsmax = last_dates + horizon * ts.freq
grouped_preds = predictions.groupby('unique_id')

assert predictions['y_pred'].equals(expected_preds)
assert grouped_preds['ds'].min().equals(expected_dsmin)
assert grouped_preds['ds'].max().equals(expected_dsmax)