In [None]:
#default_exp core

# Core

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#export
import concurrent.futures
import copy
import inspect
import os
from collections import OrderedDict
from itertools import chain
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import xgboost as xgb
from fastcore.foundation import patch, tuplify
from numba import njit
from window_ops.shift import shift_array

In [None]:
from nbdev import *
from window_ops.expanding import *
from window_ops.ewm import *
from window_ops.rolling import *

from mlforecast.utils import generate_daily_series

## Data format

The required input format is a dataframe with an index named `unique_id` holding a unique identifier for each time series, a column `ds` with the datestamp and a column `y` with the values of the series. Every other column is considered a static feature unless stated otherwise.

In [None]:
# sample data for the examples below
# (presumably 20 daily series with 2 static feature columns each — see `mlforecast.utils.generate_daily_series`)
series = generate_daily_series(20, n_static_features=2)
series

For simplicity we'll just take one time series here.

In [None]:
uids = series.index.unique(level='unique_id')
# selecting with a list of labels keeps the result a dataframe holding only the rows of the first id
serie = series.loc[[uids[0]]]
serie

In [None]:
#exporti
# Maps the name of each date feature to the unsigned integer dtype its values are cast to.
# `preprocessing_flow` and `TimeSeries.update_features` index this dict by feature name,
# so a feature that is not listed here raises a KeyError there.
date_features_dtypes = {
    'year': np.uint16,
    'month': np.uint8,
    'day': np.uint8,
    'hour': np.uint8,
    'minute': np.uint8,
    'second': np.uint8,
    'dayofyear': np.uint16,
    'day_of_year': np.uint16,
    'weekofyear': np.uint8,
    'week': np.uint8,
    'dayofweek': np.uint8,
    'day_of_week': np.uint8,
    'weekday': np.uint8,
    'quarter': np.uint8,
    'daysinmonth': np.uint8,
    'is_month_start': np.uint8,
    'is_month_end': np.uint8,
    'is_quarter_start': np.uint8,
    'is_quarter_end': np.uint8,
    'is_year_start': np.uint8,
    'is_year_end': np.uint8,
}


@njit
def _append_new(data, indptr, new):
    """Build a copy of the grouped data where group `i` gets `new[i]` added at its end.

    Returns the enlarged data array together with the matching group boundaries."""
    n_groups = len(indptr) - 1
    # group i ends up shifted by the i values appended to the groups before it, plus its own
    grown_indptr = indptr.copy()
    grown_indptr[1:] += np.arange(1, n_groups + 1)
    grown_data = np.empty(data.size + new.size, dtype=data.dtype)
    for g in range(n_groups):
        first = grown_indptr[g]
        last = grown_indptr[g + 1] - 1  # slot reserved for the appended value
        grown_data[first:last] = data[indptr[g]:indptr[g + 1]]
        grown_data[last] = new[g]
    return grown_data, grown_indptr

In [None]:
#export
class GroupedArray:
    """A sequence of arrays of possibly different lengths backed by flat storage.

    The values of every group live back to back in the 1d array `data`.
    The 1d array `indptr` holds the group boundaries: group `i` is `data[indptr[i]:indptr[i+1]]`."""

    def __init__(self, data: np.ndarray, indptr: np.ndarray):
        self.data = data
        self.indptr = indptr
        # indptr has one more entry than there are groups
        self.ngroups = len(indptr) - 1

    def __len__(self) -> int:
        return self.ngroups

    def __getitem__(self, idx: int) -> np.ndarray:
        start, stop = self.indptr[idx], self.indptr[idx + 1]
        return self.data[start:stop]

    def take_from_groups(self, idx: Union[int, slice]) -> 'GroupedArray':
        """Applies `idx` to every group and returns the selections as a new `GroupedArray`."""
        pieces = []
        for start, stop in zip(self.indptr[:-1], self.indptr[1:]):
            # index the positions of the group, so idx is relative to the start of the group
            positions = range(start, stop)[idx]
            pieces.append(self.data[positions])
        lengths = np.array([piece.size for piece in pieces])
        boundaries = np.append(0, lengths.cumsum())
        return GroupedArray(np.hstack(pieces), boundaries)

    def append(self, new: np.ndarray) -> 'GroupedArray':
        """Returns a copy where the i-th element of `new` was added at the end of the i-th group.

        Raises a `ValueError` if `new` doesn't have exactly one element per group."""
        if new.size != self.ngroups:
            raise ValueError(f'new must be of size {self.ngroups}')
        return GroupedArray(*_append_new(self.data, self.indptr, new))

    def __repr__(self) -> str:
        return f'GroupedArray(ndata={self.data.size}, ngroups={self.ngroups})'

The `GroupedArray` is used internally for storing the series values and performing transformations.

In [None]:
data = np.arange(10, dtype=np.float32)
indptr = np.array([0, 2, 10])  # group 1: [0, 1], group 2: [2..9]
ga = GroupedArray(data, indptr)

# take the last two observations from every group
last_2 = ga.take_from_groups(slice(-2, None))
np.testing.assert_equal(last_2.data, np.array([0, 1, 8, 9]))
np.testing.assert_equal(last_2.indptr, np.array([0, 2, 4]))

# take the last four observations from every group
# group 1 only has two observations, so it is returned whole
last_4 = ga.take_from_groups(slice(-4, None))
np.testing.assert_equal(last_4.data, np.array([0, 1, 6, 7, 8, 9]))
np.testing.assert_equal(last_4.indptr, np.array([0, 2, 6]))

In [None]:
#exporti
@njit
def _identity(x: np.ndarray) -> np.ndarray:
    """Return the input unchanged.

    Used as the transformation function of the plain lag features, where the
    shifted series itself is the feature (see `TimeSeries.__init__`)."""
    return x


@njit(nogil=True)
def _transform_series(data, indptr, updates_only, lag, func, *args) -> np.ndarray:
    """Shifts every group in data by `lag` and computes `func(shifted, *args)` on it.

    `data` holds the values of all the groups and `indptr` their boundaries,
    i.e. group `i` is `data[indptr[i]:indptr[i+1]]`.

    If `updates_only` is true only the last value of the result of each group is kept,
    so the output has one element per group. Otherwise the output has the same size
    as `data` and holds the complete result of every group in the same positions."""
    n_series = len(indptr) - 1
    if updates_only:
        # one output slot per group, with the same dtype as data
        out = np.empty_like(data[:n_series])
        for i in range(n_series):
            lagged = shift_array(data[indptr[i]:indptr[i+1]], lag)
            out[i] = func(lagged, *args)[-1]        
    else:
        out = np.empty_like(data)
        for i in range(n_series):
            lagged = shift_array(data[indptr[i]:indptr[i+1]], lag)
            # the result of each group is written to the positions of its inputs
            out[indptr[i]:indptr[i+1]] = func(lagged, *args)
    return out


def _build_transform_name(lag, tfm, *args) -> str:
    """Creates a name for a transformation based on `lag`, the name of the function and its arguments."""
    if lag == 0:
        return f'lag-{args[0]}'
    tfm_name = f'{tfm.__name__}_lag-{lag}'
    func_params = list(inspect.signature(tfm).parameters.items())[1:]  # remove input array argument
    changed_params = [f'{name}-{value}' for value, (name, param) in zip(args, func_params) if param.default != value]
    if changed_params:
        tfm_name += '_' + '_'.join(changed_params)
    return tfm_name

In [None]:
#export
class TimeSeries:
    """Utility class for storing and transforming time series data.

    The target values of all the series are stored in a single `GroupedArray` (`ga`).
    The transformations to compute on them are kept in `transforms`, an ordered mapping
    from the name of the feature to a tuple `(lag, function, *args)`."""

    def __init__(self,
                 series_df: pd.DataFrame,
                 freq: str = 'D',
                 lags: List[int] = [],
                 lag_transforms: Dict[int, List[Tuple]] = {},
                 date_features: List[str] = [],
                 static_features: Optional[List[str]] = None,
                 num_threads: Optional[int] = None):
        """Store the series and define the transformations to compute on them.

        `series_df`: dataframe with the index levels `unique_id` and `ds` and the column `y`.
            It is sorted by its index if it isn't already.
        `freq`: frequency of the series, converted to a pandas offset.
        `lags`: lags of the target to use as features.
        `lag_transforms`: mapping from lag to the transformations to apply on the target
            shifted by that lag. Each transformation is either a function or a tuple
            `(function, *args)`.
        `date_features`: names of the datestamp attributes to use as features.
        `static_features`: columns to keep as static features. By default every column
            other than `y` is kept.
        `num_threads`: number of threads used to compute the transformations.
            Falls back to `os.cpu_count()` when it is `None` or 0."""
        if not series_df.index.is_monotonic_increasing:
            series_df = series_df.sort_index()
        data = series_df.y.values
        if data.dtype not in (np.float32, np.float64):
            data = data.astype(np.float32)
        sizes = series_df.groupby('unique_id').size().values
        cumsizes = sizes.cumsum()
        indptr = np.append(0, cumsizes)
        self.ga = GroupedArray(data, indptr)
        self.uids = series_df.index.unique(level='unique_id')
        # cumsizes - 1 are the positions of the last row of every series
        self.last_dates = series_df.index.get_level_values('ds')[cumsizes - 1]
        self.freq = pd.tseries.frequencies.to_offset(freq)
        self.static_features = series_df.iloc[cumsizes - 1].reset_index('ds', drop=True).drop(columns='y')
        if static_features is not None:
            self.static_features = self.static_features[static_features]
        self.num_threads = num_threads or os.cpu_count()
        # store a copy: keeping a reference to the argument would share the default list
        # (or the caller's list) between instances, so mutating it would affect all of them
        self.date_features = list(date_features)

        self.transforms: Dict[str, Tuple[Any, ...]] = OrderedDict()
        for lag in lags:
            self.transforms[f'lag-{lag}'] = (lag, _identity)
        for lag, tfms in lag_transforms.items():
            for tfm_args in tfms:
                # a bare function is turned into a 1-tuple, so both forms are handled alike
                tfm, *args = tuplify(tfm_args)
                tfm_name = _build_transform_name(lag, tfm, *args)
                self.transforms[tfm_name] = (lag, tfm, *args)

        # state of the prediction loop, see `update_features` and `update_y`
        self.y_pred: List[np.ndarray] = []
        self.curr_dates = self.last_dates
        self.test_dates: List[pd.DatetimeIndex] = []

    @property
    def n_series(self):
        """Number of series stored."""
        return self.ga.ngroups

    @property
    def features(self):
        """Names of the computed features: the transformations first, then the date features."""
        return list(self.transforms.keys()) + self.date_features

    def __repr__(self):
        return f'TimeSeries(n_series={self.n_series}, freq={self.freq}, transforms={self.transforms.keys()}, date_features={self.date_features})'

    def _apply_transforms(self, updates_only: bool = False):
        """Compute every transformation sequentially and return them in a dict keyed by name.

        With `updates_only` every lag is reduced by one and only the last value of each
        series is computed (see `_transform_series`)."""
        results = {}
        offset = 1 if updates_only else 0
        for tfm_name, (lag, tfm, *args) in self.transforms.items():
            results[tfm_name] = _transform_series(self.ga.data, self.ga.indptr, updates_only, lag - offset, tfm, *args)
        return results

    def _apply_multithreaded_transforms(self, updates_only: bool = False):
        """Same as `_apply_transforms`, submitting each transformation to a pool of `num_threads` threads.

        The results are stored as the tasks complete, so the order of the keys of the
        returned dict is not guaranteed to follow the order of `transforms`."""
        future_to_result = {}
        results = {}
        offset = 1 if updates_only else 0
        with concurrent.futures.ThreadPoolExecutor(self.num_threads) as executor:
            for tfm_name, (lag, tfm, *args) in self.transforms.items():
                future = executor.submit(_transform_series, self.ga.data, self.ga.indptr, updates_only, lag - offset, tfm, *args)
                future_to_result[future] = tfm_name
            for future in concurrent.futures.as_completed(future_to_result):
                tfm_name = future_to_result[future]
                # result() re-raises any exception raised by the transformation
                results[tfm_name] = future.result()
        return results

The `TimeSeries` class has to extract the values of each series and store them in a contiguous 1d numpy array. For this to work, the input dataframe must be sorted by `unique_id` and `ds`, so we set both as indices and make sure they're sorted.

In [None]:
# add ds as a second index level next to unique_id and check that the resulting index is sorted
# NOTE: this rebinds `serie`, so the cell can't be re-run on its own (ds is no longer a column afterwards)
serie = serie.set_index('ds', append=True)
serie.index.is_monotonic_increasing

The `TimeSeries` class takes care of defining the transformations to be performed (`lags`, `lag_transforms` and `date_features`) as well as storing the necessary data to update them. 

In [None]:
lags = [7]
# transformations can be given as a bare function or as a tuple with the function and its arguments
lag_transforms = {
    1: [
        expanding_mean, 
        (rolling_mean, 7)
    ]
}
date_features = ['dayofweek']

ts = TimeSeries(serie, lags=lags, lag_transforms=lag_transforms, date_features=date_features)

test_eq(ts.uids, ['id_00'])
test_eq(ts.last_dates, [serie.index.get_level_values('ds').max()])
test_eq(ts.date_features, date_features)
# the lags come first, followed by the lag transforms in the order they were defined
test_eq(ts.transforms, {'lag-7': (7, _identity), 
                        'expanding_mean_lag-1': (1, expanding_mean), 
                        'rolling_mean_lag-1_window_size-7': (1, rolling_mean, 7)})
# the static features are taken from the last row of each serie
test_eq(ts.static_features, serie.tail(1).reset_index('ds', drop=True).drop(columns='y'))

In [None]:
#export
@patch
def compute_transforms(self: TimeSeries) -> Dict[str, np.ndarray]:
    """Compute the transformations defined in the constructor.

    Returns a dictionary from the name of each transformation to its values.
    The transformations are computed sequentially when `num_threads` is 1 or
    there's a single transformation, otherwise multithreading is used."""
    run_sequentially = self.num_threads == 1 or len(self.transforms) == 1
    if not run_sequentially:
        return self._apply_multithreaded_transforms()
    return self._apply_transforms()

After we instantiate a `TimeSeries` class we can call `compute_transforms` to get the values of all the transformations. These are returned in a dictionary where the keys are the names that were assigned to each transformation and the values are the results of the transformation applied to the time series.

In [None]:
y = serie.y.values
lag_1 = shift_array(y, 1)

# the results must be the same for the sequential (1 thread) and the multithreaded versions
for num_threads in (1, 2):
    ts = TimeSeries(serie, lags=lags, lag_transforms=lag_transforms, num_threads=num_threads)
    transforms = ts.compute_transforms()

    np.testing.assert_equal(transforms['lag-7'], shift_array(y, 7))
    np.testing.assert_equal(transforms['expanding_mean_lag-1'], expanding_mean(lag_1))
    np.testing.assert_equal(transforms['rolling_mean_lag-1_window_size-7'], rolling_mean(lag_1, 7))

In [None]:
#export
@patch
def update_y(self: TimeSeries, new: np.ndarray) -> None:
    """Registers `new` as the latest predictions and appends its values to the stored series.

    `new` must have one value per serie."""
    if not len(self.y_pred):
        # bind a fresh list on the first update, so that a shallow copy of the object
        # (like the one made in `predictions_flow`) doesn't append to the list of the original
        self.y_pred = []
    self.y_pred.append(new)
    self.ga = self.ga.append(np.asarray(new))

In [None]:
ts = TimeSeries(serie)
# despite its name, this holds the size of every group before the updates
max_size = np.diff(ts.ga.indptr)
ts.update_y([1])
ts.update_y([2])

# every update adds one value to each group
test_eq(np.diff(ts.ga.indptr), max_size + 2)
test_eq(ts.ga.data[-1], 2)

In [None]:
#export
@patch
def update_features(self: TimeSeries) -> pd.DataFrame:
    """Advance the current dates by one period and compute the features for them.

    The transformations are computed on the latest values of the target, so this is meant
    to be alternated with `update_y`. Returns a dataframe indexed by `unique_id` and `ds`
    holding the static features followed by the computed features."""
    at_training_end = self.curr_dates.equals(self.last_dates)
    if at_training_end:
        # first step of a prediction loop: start from fresh objects
        self.curr_dates = self.last_dates.copy()
        self.test_dates = []
    self.curr_dates += self.freq
    self.test_dates.append(self.curr_dates)

    run_sequentially = self.num_threads == 1 or len(self.transforms) == 1
    compute = self._apply_transforms if run_sequentially else self._apply_multithreaded_transforms
    features = compute(updates_only=True)

    for name in self.date_features:
        raw_vals = getattr(self.curr_dates, name).values
        features[name] = raw_vals.astype(date_features_dtypes[name])

    # columns=self.features fixes the order of the columns regardless of the order of the dict
    features_df = pd.DataFrame(features, columns=self.features, index=self.uids)
    results_df = self.static_features.join(features_df)
    results_df['ds'] = self.curr_dates
    return results_df.set_index('ds', append=True)

In [None]:
ts = TimeSeries(serie, lags=lags, lag_transforms=lag_transforms, date_features=date_features)
updates = ts.update_features()

# these have an offset because we can now "see" our last y value
last_date = serie.index.get_level_values('ds').max()
expected_idx = pd.MultiIndex.from_tuples([(ts.uids[0], last_date + ts.freq)], names=['unique_id', 'ds'])
expected = pd.DataFrame({
    'lag-7': shift_array(y, 6)[-1],
    'expanding_mean_lag-1': expanding_mean(y)[-1],
    'rolling_mean_lag-1_window_size-7': rolling_mean(y, 7)[-1],
    'dayofweek': np.uint8([getattr(last_date + pd.tseries.offsets.Day(), 'dayofweek')])},
    index=expected_idx
)
# use the `columns` keyword: passing the axis positionally to `drop` was deprecated
# in pandas 1.3 and removed in pandas 2.0
statics = serie.tail(1).drop(columns='y').reset_index('ds', drop=True)
assert updates.equals(statics.join(expected))

# the current dates moved one period forward
test_eq(ts.curr_dates, serie.index.get_level_values('ds')[[-1]] + pd.tseries.offsets.Day())

In [None]:
#export
@patch
def get_predictions(self: TimeSeries) -> pd.DataFrame:
    """Build a dataframe with the predictions made so far.

    The result is indexed by `unique_id` and has the columns `ds` and `y_pred`,
    with the rows of each serie stored together."""
    n_preds = len(self.y_pred)
    repeated_uids = chain.from_iterable([uid] * n_preds for uid in self.uids)
    idx = pd.Index(repeated_uids, name='unique_id')
    # test_dates and y_pred have one entry per step with one value per serie;
    # flattening in column-major order puts all the steps of each serie together
    dates = np.array(self.test_dates).ravel('F')
    preds = np.array(self.y_pred).ravel('F')
    return pd.DataFrame({'ds': dates, 'y_pred': preds}, index=idx)

In [None]:
#export
def preprocessing_flow(df: pd.DataFrame,
                       freq: str = 'D',
                       lags: List[int] = [],
                       lag_transforms: Dict[int, List[Tuple]] = {},
                       date_features: List[str] = [],
                       dropna: bool = True,
                       keep_last_n: Optional[int] = None,
                       num_threads: Optional[int] = os.cpu_count()) -> Tuple[TimeSeries, pd.DataFrame]:
    """Standard preprocessing flow.

    Builds a `TimeSeries` from `df` and returns it along with a copy of `df`, sorted by
    `unique_id` and `ds`, holding the computed features as extra columns.

    If `dropna` is true the rows with null values are removed from the returned dataframe.
    If `keep_last_n` is given, only that many of the last values of each serie are kept in
    the returned `TimeSeries`."""
    sorted_df = df.set_index('ds', append=True).sort_index()
    series = TimeSeries(sorted_df, freq, lags, lag_transforms, date_features,
                        num_threads=num_threads)
    features_df = sorted_df.reset_index('ds')

    computed = series.compute_transforms()  # type: ignore
    for name in series.transforms:
        features_df[name] = computed[name]

    if dropna:
        # features_df was created by the reset_index above, so the input isn't modified
        features_df.dropna(inplace=True)

    for name in date_features:
        raw_vals = getattr(features_df.ds.dt, name).values
        features_df[name] = raw_vals.astype(date_features_dtypes[name])

    if keep_last_n is not None:
        last_n = slice(-keep_last_n, None)
        series.ga = series.ga.take_from_groups(last_n)

    return series, features_df

In [None]:
config = dict(
    freq='D',
    lags=[7, 14],
    lag_transforms={
        1: [
            (expanding_mean,),
            (expanding_std,),
        ],
        2: [
            (rolling_mean, 7),
            (rolling_mean, 14),
        ]
    },
    date_features=['dayofweek', 'month', 'year'],
    keep_last_n=15,
    num_threads=2
)
ts, df = preprocessing_flow(series, **config)

expected_lags = ['lag-7', 'lag-14']
expected_transforms = ['expanding_mean_lag-1', 'expanding_std_lag-1', 'rolling_mean_lag-2_window_size-7', 'rolling_mean_lag-2_window_size-14']
expected_date_features = ['dayofweek', 'month', 'year']

test_eq(ts.features, expected_lags + expected_transforms + expected_date_features)
# the dataframe has the static features first, followed by the computed features
test_eq(ts.static_features.columns.tolist() + ts.features, df.columns.drop(['ds', 'y']).tolist())
# the rolling mean of size 14 over the lag 2 is the feature that needs the most history:
# each serie loses 2 rows because of the lag and 13 more to fill the window of size 14
test_eq(df.shape[0], series.shape[0] - (2 + 13) * ts.n_series)
# only the last keep_last_n values of each serie are kept
test_eq(ts.ga.data.size, ts.ga.ngroups * config['keep_last_n'])

In [None]:
#export
def predictions_flow(series: TimeSeries,
                     model,
                     horizon: int) -> pd.DataFrame:
    """Compute the predictions of `model` for the next `horizon` periods of every serie.

    The predictions are made one step at a time: the features are updated, the model
    predicts the next value and that value is used to update the target for the next step.
    The features are wrapped in an `xgb.DMatrix` when `model` is an `xgb.Booster`.

    Works on a shallow copy of `series` and returns the result of its `get_predictions`."""
    forecaster = copy.copy(series)
    needs_dmatrix = isinstance(model, xgb.Booster)
    for _ in range(horizon):
        step_features = forecaster.update_features()  # type: ignore
        model_input = xgb.DMatrix(step_features) if needs_dmatrix else step_features
        forecaster.update_y(model.predict(model_input))  # type: ignore
    return forecaster.get_predictions()  # type: ignore

In [None]:
class DummyModel:
    """Model used for testing that predicts the value of the `lag-7` feature."""
    
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Return the values of the `lag-7` column of `X`."""
        return X['lag-7'].values
    
horizon = 7
model = DummyModel()
predictions = predictions_flow(ts, model, horizon)
predictions

In [None]:
grouped_series = series.groupby('unique_id')
# the model predicts the lag 7, so with a horizon of 7 the predictions
# are the last 7 values of each serie
expected_preds = grouped_series['y'].tail(7)
# the predictions span from one period after the last date of each serie up to `horizon` periods after it
expected_dsmin = grouped_series['ds'].max() + ts.freq
expected_dsmax = grouped_series['ds'].max() + horizon * ts.freq

grouped_preds = predictions.groupby('unique_id')
assert predictions['y_pred'].equals(expected_preds)
assert grouped_preds['ds'].min().equals(expected_dsmin)
assert grouped_preds['ds'].max().equals(expected_dsmax)