In [None]:
#|default_exp forecast

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

# Forecast

> Full pipeline encapsulation

In [None]:
#|export
import warnings
from typing import Callable, Iterable, List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.base import clone

from mlforecast.core import (
    DateFeature,
    Differences,
    Freq,
    LagTransforms,
    Lags,
    Models,
    TimeSeries,
)
from mlforecast.utils import backtest_splits

In [None]:
#|hide
from fastcore.test import test_warns
from nbdev import show_doc
from sklearn import set_config

In [None]:
#|hide
set_config(display='text')
warnings.simplefilter('ignore', UserWarning)

In [None]:
#|export
class MLForecast:
    # Pipeline object that chains the feature engineering (delegated to `TimeSeries`),
    # the model training and the forecasting step.
    def __init__(
        self,
        models: Models,
        freq: Optional[Freq] = None,
        lags: Optional[Lags] = None,
        lag_transforms: Optional[LagTransforms] = None,
        date_features: Optional[Iterable[DateFeature]] = None,
        differences: Optional[Differences] = None,
        num_threads: int = 1,
    ):
        """Create forecast object

        Parameters
        ----------
        models : regressor or list of regressors
            Models that will be trained and used to compute the forecasts.
        freq : str or int, optional (default=None)
            Pandas offset alias, e.g. 'D', 'W-THU' or integer denoting the frequency of the series.
        lags : list of int, optional (default=None)
            Lags of the target to use as features.
        lag_transforms : dict of int to list of functions, optional (default=None)
            Mapping of target lags to their transformations.
        date_features : list of str or callable, optional (default=None)
            Features computed from the dates. Can be pandas date attributes or functions that will take the dates as input.
        differences : list of int, optional (default=None)
            Differences to take of the target before computing the features. These are restored at the forecasting step.
        num_threads : int (default=1)
            Number of threads to use when computing the features.
        """
        if not isinstance(models, list):
            models = [models]
        # Store clones so that the estimators provided by the caller are never fitted or modified here.
        self.models = [clone(m) for m in models]
        # All the feature engineering and the forecasting loop are delegated to this object.
        self.ts = TimeSeries(freq, lags, lag_transforms, date_features, differences, num_threads)

    def __repr__(self):
        return (
            f'{self.__class__.__name__}(models=[{", ".join(m.__class__.__name__ for m in self.models)}], '
            f'freq={self.freq}, '
            f'lag_features={list(self.ts.transforms.keys())}, '
            f'date_features={self.ts.date_features}, '
            f'num_threads={self.ts.num_threads})'
        )

    @property
    def freq(self):
        """Frequency of the series, as stored in the underlying `TimeSeries` object."""
        return self.ts.freq

    def preprocess(
        self,
        data: pd.DataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> pd.DataFrame:
        """Add the features to `data`.

        Parameters
        ----------
        data : pandas DataFrame
            Series data in long format.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.

        Returns
        -------
        result : pandas DataFrame.
            `data` plus added features.
        """
        return self.ts.fit_transform(data, id_col, time_col, target_col, static_features, dropna, keep_last_n)

    def fit_models(
        self,
        X: pd.DataFrame,
        y: Union[np.ndarray, pd.Series],
    ) -> 'MLForecast':
        """Manually train models. Use this if you called `MLForecast.preprocess` beforehand.

        Parameters
        ----------
        X : pandas DataFrame
            Features.
        y : numpy array or pandas Series.
            Target.

        Returns
        -------
        self : MLForecast
            Forecast object with trained models.
        """
        # A fresh list of fitted clones is built on every call, `self.models` stays unfitted.
        self.models_ = []
        for model in self.models:
            self.models_.append(clone(model).fit(X, y))
        return self

    def fit(
        self,
        data: pd.DataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> 'MLForecast':
        """Apply the feature engineering and train the models.

        Parameters
        ----------
        data : pandas DataFrame
            Series data in long format.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.

        Returns
        -------
        self : MLForecast
            Forecast object with series values and trained models.
        """
        series_df = self.preprocess(data, id_col, time_col, target_col, static_features, dropna, keep_last_n)
        # Everything except the timestamps and the target is used as a feature.
        X, y = series_df.drop(columns=[time_col, target_col]), series_df[target_col].values
        # Drop the reference to the full frame before training.
        del series_df
        return self.fit_models(X, y)

    def predict(
        self,
        horizon: int,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        predict_fn: Optional[Callable] = None,
        **predict_fn_kwargs,
    ) -> pd.DataFrame:
        """Compute the predictions for the next `horizon` steps.

        Parameters
        ----------
        horizon : int
            Number of periods to predict.
        dynamic_dfs : list of pandas DataFrame, optional (default=None)
            Future values of the dynamic features, e.g. prices.
        predict_fn : callable, optional (default=None)
            Custom function to compute predictions.
            This function will receive: model, new_x, dynamic_dfs, features_order and kwargs,
            and should return an array with the predictions, where:
                model : regressor
                    Fitted model.
                new_x : pandas DataFrame
                    Current values of the features.
                dynamic_dfs : list of pandas DataFrame
                    Future values of the dynamic features
                features_order : list of str
                    Column names in the order in which they were used to train the model.
                **kwargs
                    Other keyword arguments passed to `MLForecast.predict`.
        **predict_fn_kwargs
            Additional arguments passed to predict_fn

        Returns
        -------
        result : pandas DataFrame
            Predictions for each series and timestep, with one column per model.

        Raises
        ------
        ValueError
            If the models haven't been fitted yet.
        """
        if not hasattr(self, 'models_'):
            raise ValueError('No fitted models found. You have to call fit or preprocess + fit_models.')
        return self.ts.predict(
            self.models_, horizon, dynamic_dfs, predict_fn, **predict_fn_kwargs
        )

    def cross_validation(
        self,
        data: pd.DataFrame,
        n_windows: int,
        window_size: int,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        predict_fn: Optional[Callable] = None,
        **predict_fn_kwargs,
    ) -> pd.DataFrame:
        """Perform time series cross validation.
        Creates `n_windows` splits where each window has `window_size` test periods,
        trains the models, computes the predictions and merges the actuals.

        Note that this refits the object on every window, so after calling it the stored
        series and `models_` correspond to the last window. The fitted models of every
        window are kept in `cv_models_`.

        Parameters
        ----------
        data : pandas DataFrame
            Series data in long format.
        n_windows : int
            Number of windows to evaluate. Must be at least 1.
        window_size : int
            Number of test periods in each window.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.
        dynamic_dfs : list of pandas DataFrame, optional (default=None)
            Future values of the dynamic features, e.g. prices.
        predict_fn : callable, optional (default=None)
            Custom function to compute predictions.
            This function will receive: model, new_x, dynamic_dfs, features_order and kwargs,
            and should return an array with the predictions, where:
                model : regressor
                    Fitted model.
                new_x : pandas DataFrame
                    Current values of the features.
                dynamic_dfs : list of pandas DataFrame
                    Future values of the dynamic features
                features_order : list of str
                    Column names in the order in which they were used to train the model.
                **kwargs
                    Other keyword arguments passed to `MLForecast.predict`.
        **predict_fn_kwargs
            Additional arguments passed to predict_fn

        Returns
        -------
        result : pandas DataFrame
            Predictions for each window with the series id, timestamp, last train date, target value and predictions from each model.

        Raises
        ------
        ValueError
            If `n_windows` is lower than 1.
        """
        # Without at least one window there is nothing to concatenate below, so fail
        # early with a clear message and before touching any stored state.
        if n_windows < 1:
            raise ValueError('n_windows must be at least 1.')
        results = []
        self.cv_models_ = []
        # The splits and the joins below work with the series id in the index.
        if id_col != 'index':
            data = data.set_index(id_col)

        # Integer timestamps advance one unit at a time, otherwise use the configured frequency.
        if np.issubdtype(data[time_col].dtype.type, np.integer):
            freq = 1
        else:
            freq = self.freq

        for train_end, train, valid in backtest_splits(data, n_windows, window_size, freq, time_col):
            self.fit(train, 'index', time_col, target_col, static_features, dropna, keep_last_n)
            self.cv_models_.append(self.models_)
            y_pred = self.predict(
                window_size, dynamic_dfs, predict_fn, **predict_fn_kwargs
            )
            # Align predictions and actuals on (series id, timestamp).
            y_pred = y_pred.set_index(time_col, append=True)
            result = valid.set_index(time_col, append=True)[[target_col]].copy()
            result = result.join(y_pred).reset_index(time_col)
            result['cutoff'] = train_end
            results.append(result)

        out = pd.concat(results)
        # At this point `y_pred` only has the models' columns, since the timestamps were moved to the index.
        out = out[[time_col, 'cutoff', target_col, *y_pred.columns]]
        if id_col != 'index':
            out = out.reset_index()
        return out

In [None]:
show_doc(MLForecast)

---

### MLForecast

>      MLForecast (models:Union[sklearn.base.BaseEstimator,List[sklearn.base.Bas
>                  eEstimator]], freq:Union[int,str,NoneType]=None,
>                  lags:Optional[Iterable[int]]=None, lag_transforms:Optional[Di
>                  ct[int,List[Union[Callable,Tuple[Callable,Any]]]]]=None,
>                  date_features:Optional[Iterable[Union[str,Callable]]]=None,
>                  differences:Optional[Iterable[int]]=None, num_threads:int=1)

Create forecast object

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| models | typing.Union[sklearn.base.BaseEstimator, typing.List[sklearn.base.BaseEstimator]] |  | Models that will be trained and used to compute the forecasts. |
| freq | typing.Union[int, str, NoneType] | None | Pandas offset alias, e.g. 'D', 'W-THU' or integer denoting the frequency of the series. |
| lags | typing.Optional[typing.Iterable[int]] | None | Lags of the target to use as features. |
| lag_transforms | typing.Optional[typing.Dict[int, typing.List[typing.Union[typing.Callable, typing.Tuple[typing.Callable, typing.Any]]]]] | None | Mapping of target lags to their transformations. |
| date_features | typing.Optional[typing.Iterable[typing.Union[str, typing.Callable]]] | None | Features computed from the dates. Can be pandas date attributes or functions that will take the dates as input. |
| differences | typing.Optional[typing.Iterable[int]] | None | Differences to take of the target before computing the features. These are restored at the forecasting step. |
| num_threads | int | 1 | Number of threads to use when computing the features. |

The `MLForecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing the predictions). It tries to mimic the scikit-learn API.

In [None]:
#| export
class Forecast(MLForecast):
    def __init__(
        self,
        models: Models,
        freq: Optional[Freq] = None,
        lags: Optional[Lags] = None,
        lag_transforms: Optional[LagTransforms] = None,
        date_features: Optional[Iterable[DateFeature]] = None,
        differences: Optional[Differences] = None,
        num_threads: int = 1,
    ):
        """Deprecated alias of `MLForecast`.

        Emits a `DeprecationWarning` and forwards all the arguments, unchanged, to `MLForecast`.
        """
        warning_msg = (
            'The Forecast class is deprecated and will be removed in a future version, '
            'please use the MLForecast class instead.'
        )
        # stacklevel=2 attributes the warning to the code that instantiates `Forecast`
        # instead of to this module. Python's default filters ignore a DeprecationWarning
        # unless it's attributed to `__main__`, so without it users would never see the message.
        warnings.warn(warning_msg, DeprecationWarning, stacklevel=2)
        super().__init__(models, freq, lags, lag_transforms, date_features, differences, num_threads)

In [None]:
#| hide
test_warns(lambda: Forecast([]))

## Example
This shows an example with just 4 series of the M4 dataset. If you want to run it yourself on all of them, you can refer to [this notebook](https://www.kaggle.com/code/lemuz90/m4-competition).

In [None]:
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from datasetsforecast.m4 import M4, M4Info
from sklearn.metrics import mean_squared_error
from window_ops.ewm import ewm_mean
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

from mlforecast.utils import generate_daily_series, generate_prices_for_series

In [None]:
group = 'Hourly'
await M4.async_download('data', group=group)
df, *_ = M4.load(directory='data', group=group)
df['ds'] = df['ds'].astype('int')
ids = df['unique_id'].unique()
random.seed(0)
sample_ids = random.choices(ids, k=4)
sample_df = df[df['unique_id'].isin(sample_ids)]
sample_df

We now split this data into train and validation.

In [None]:
info = M4Info[group]
horizon = info.horizon
valid = sample_df.groupby('unique_id').tail(horizon)
train = sample_df.drop(valid.index)
train.shape, valid.shape

### Creating the Forecast object

The forecast object encapsulates the feature engineering + training the models + forecasting. When we initialize it we define:

* The models we want to train
* The series frequency. This is added to the last dates seen in train for the forecast step. If the time column contains integer values we can leave it empty or set it to 1.
* The feature engineering:
    * Lags to use as features
    * Transformations on the lags
    * Date features
    * Differences to apply to the target before computing the features, which are then restored when forecasting.
* Number of threads to use when computing the features.

In [None]:
fcst = MLForecast(
    models=lgb.LGBMRegressor(random_state=0),
    differences=[24],
    lags=[24 * (i+1) for i in range(7)],
    lag_transforms={
        48: [(ewm_mean, 0.3)],
    },
    num_threads=1,
)
fcst

Once we have this setup we can compute the features and fit the model.

In [None]:
show_doc(MLForecast.fit)

In [None]:
fcst.fit(train, id_col='unique_id', time_col='ds', target_col='y');

Once we've run this we're ready to compute our predictions.

In [None]:
show_doc(MLForecast.predict)

In [None]:
predictions = fcst.predict(horizon)

We can take a look at a couple of results.

In [None]:
results = valid.merge(predictions, on=['unique_id', 'ds']).set_index('unique_id')
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 10))
for uid, axi in zip(sample_ids, ax.flat):
    results.loc[uid].set_index('ds').plot(ax=axi, title=uid)

If you want to take a look at the data that will be used to train the models you can call `MLForecast.preprocess`.

In [None]:
show_doc(MLForecast.preprocess)

In [None]:
prep_df = fcst.preprocess(train, id_col='unique_id', time_col='ds', target_col='y')
prep_df

If we do this we then have to call `MLForecast.fit_models`, since this only stores the series information.

In [None]:
show_doc(MLForecast.fit_models)

In [None]:
X, y = prep_df.drop(columns=['ds', 'y']), prep_df['y']
fcst.fit_models(X, y)

In [None]:
predictions2 = fcst.predict(horizon)
pd.testing.assert_frame_equal(predictions, predictions2)

### Cross validation

If we would like to know how good our forecast will be for a specific model and set of features then we can perform cross validation. What cross validation does is take our data and split it into two parts, where the first part is used for training and the second one for validation. Since the data is time dependent we usually take the last *x* observations from our data as the validation set.

This process is implemented in `MLForecast.cross_validation`, which takes our data and performs the process described above `n_windows` times, where each window has `window_size` validation samples in it. For example, if we have 100 samples and we want to perform 2 backtests each of size 14, the splits will be as follows:

1. Train: 1 to 72. Validation: 73 to 86.
2. Train: 1 to 86. Validation: 87 to 100.

In [None]:
show_doc(MLForecast.cross_validation)

In [None]:
cv_results = fcst.cross_validation(
    train,
    n_windows=4,
    window_size=horizon,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
)
cv_results

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 10))

for uid, axi in zip(sample_ids, ax.flat):
    subset = cv_results[cv_results['unique_id'].eq(uid)].drop(columns=['unique_id', 'cutoff'])
    subset.set_index('ds').plot(ax=axi, title=uid)

### Dynamic features

We're going to use a synthetic dataset from this point onwards to demonstrate some other functionalities regarding external regressors.

In [None]:
series = generate_daily_series(100, equal_ends=True, n_static_features=2, static_as_categorical=False)
series

As we saw in the previous example, the required columns are the series identifier, time and target. Whatever extra columns you have, like `static_0` and `static_1` here, are considered to be static and are replicated when constructing the features for the next timestamp. You can disable this by passing `static_features` to `MLForecast.preprocess` or `MLForecast.fit`, which will only keep the columns you define there as static. Keep in mind that they will still be used for training, so you'll have to provide them to `MLForecast.predict` through the `dynamic_dfs` argument.

By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features like prices or a calendar with holidays you can pass them as a list to the `dynamic_dfs` argument of `MLForecast.predict`, which will call `pd.DataFrame.merge` on each of them in order.

Here's an example:

Suppose that we have a `product_id` column and we have a catalog for prices based on that `product_id` and the date.

In [None]:
dynamic_series = series.rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(dynamic_series)
prices_catalog

And you have already merged these prices into your series dataframe.

In [None]:
# Left-join the prices on the columns shared with the catalog and
# restore the series identifier as the index.
series_with_prices = (
    dynamic_series
    .reset_index()
    .merge(prices_catalog, how='left')
    .set_index('unique_id')
)
series_with_prices

This dataframe will be passed to `MLForecast.fit` (or `MLForecast.preprocess`), however since the price is dynamic we have to tell that method that only `static_0` and `product_id` are static and we'll have to update `price` in every timestep, which basically involves merging the updated features with the prices catalog.

In [None]:
def even_day(dates):
    """Flag the dates that fall on an even day of the month."""
    day_of_month = dates.day
    return day_of_month % 2 == 0

models = [
    lgb.LGBMRegressor(n_jobs=1, random_state=0),
    xgb.XGBRegressor(n_jobs=1, random_state=0),
]
fcst = MLForecast(
    models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month', even_day],
    num_threads=2,
)
fcst.fit(
    series_with_prices,
    id_col='index',
    time_col='ds',
    target_col='y',
    static_features=['static_0', 'product_id'],
)

The features used for training are stored in `MLForecast.ts.features_order_`. As you can see, `price` was used for training.

In [None]:
fcst.ts.features_order_

So in order to update the price in each timestep we just call `MLForecast.predict` with our forecast horizon and pass the prices catalog as a dynamic dataframe.

In [None]:
preds = fcst.predict(7, dynamic_dfs=[prices_catalog])
preds

In [None]:
#|hide
preds2 = fcst.predict(7, dynamic_dfs=[prices_catalog])

pd.testing.assert_frame_equal(preds, preds2)

In [None]:
#|hide
# Check that integer timestamps and custom column names produce the same
# forecasts as the standard layout (dates + id in the index).
non_std_series = series.copy()
# Replace the dates with a per-series integer counter (0, 1, 2, ...).
non_std_series['ds'] = non_std_series.groupby('unique_id').cumcount()
# Move the id from the index to a column and give every column a non standard name.
non_std_series = non_std_series.reset_index().rename(columns={'unique_id': 'some_id', 'ds': 'time', 'y': 'value'})
# None of these features depend on the dates, so both setups should build the same features.
flow_params = dict(
    models=models,
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    num_threads=2,
)
# No freq is given here because the time column holds integers.
fcst = MLForecast(**flow_params)
non_std_preds = fcst.fit(non_std_series, id_col='some_id', time_col='time', target_col='value').predict(7)
# Rename the index so that it matches the one from the standard predictions.
non_std_preds.index.name = 'unique_id'
fcst = MLForecast(freq='D', **flow_params)
preds = fcst.fit(series, id_col='index', time_col='ds', target_col='y').predict(7)
# The time columns differ by construction (dates vs integers), so only the models' outputs are compared.
pd.testing.assert_frame_equal(preds.drop(columns='ds'), non_std_preds.drop(columns='time'))

### Custom predictions
As you may have noticed `MLForecast.predict` can take a `predict_fn` and `predict_fn_kwargs`. By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features you can pass them as a list to `MLForecast.predict` in the `dynamic_dfs` argument. However, if you want to do something else, you can define your own function which will take:

* The trained model.
* The updated features (static + transformations + date features).
* A list of dataframes with the dynamic features.
* The order of the features the model was trained on.
* Additional keyword arguments passed to `MLForecast.predict`.

Here's an example:

Suppose that we want to scale our predictions so that our series are updated with these scaled values. We can achieve that with the following.

In [None]:
def scaling_predict_fn(
    model,
    new_x,
    dynamic_dfs,
    features_order,
    scale_factor,
) -> np.ndarray:
    """Predict with `model` and multiply the result by `scale_factor`.

    `new_x` is reordered to `features_order` before predicting.
    `dynamic_dfs` is part of the expected signature but isn't used here.
    """
    ordered_features = new_x[features_order]
    return scale_factor * model.predict(ordered_features)

And now we just pass this function to `MLForecast.predict`.

In [None]:
fcst = MLForecast(models, freq='D', lags=[7])
fcst.fit(series, id_col='index', time_col='ds', target_col='y')

scale_factor = 1.1
preds = fcst.predict(2, predict_fn=scaling_predict_fn, scale_factor=scale_factor)
preds

In [None]:
#|hide
# Reproduce the 2-step forecast from the previous cell by hand, to verify that the
# scaled predictions are the ones that get fed back into the series.
# NOTE(review): `_predict_setup`, `_update_features` and `_update_y` are private
# `TimeSeries` helpers; presumably they reset the forecasting state, build the features
# for the next timestep and append new target values, respectively. Confirm in mlforecast.core.
fcst.ts._predict_setup()

# `preds` has two steps per series, so head(1) is the first step and tail(1) the second one.
for attr in ('head', 'tail'):
    new_x = fcst.ts._update_features().drop(columns='ds')
    original_preds = fcst.models_[0].predict(new_x)
    
    expected = scale_factor * original_preds
    # Predictions of the first model for this step, one value per series.
    actual = getattr(preds.groupby('unique_id')[models[0].__class__.__name__], attr)(1).values
    np.testing.assert_equal(expected, actual)
    
    # Feed the scaled values back, so that the next step's features are built on top of them.
    fcst.ts._update_y(actual)

In [None]:
#|hide
# Check that `MLForecast.cross_validation` matches a backtest performed by hand
# with `backtest_splits` + `fit` + `predict`.
n_windows = 2
window_size = 14
fcst = MLForecast(lgb.LGBMRegressor(), freq='D', lags=[7, 14])
# The time column holds integers here, so cross_validation builds the splits
# with a frequency of 1 instead of the 'D' defined above.
backtest_results = fcst.cross_validation(
    non_std_series,
    n_windows,
    window_size,
    id_col='some_id',
    time_col='time',
    target_col='value',
    static_features=['static_0', 'static_1'],
)
# Bring both the results and the input data to the standard names, with the id in the index.
renamer = {'some_id': 'unique_id', 'time': 'ds', 'value': 'y'}
backtest_results = backtest_results.rename(columns=renamer).set_index('unique_id')
renamed = non_std_series.rename(columns=renamer).set_index('unique_id')
manual_results = []
# Same splits as above: integer timestamps, hence the frequency of 1.
for cutoff, train, valid in backtest_splits(renamed, n_windows, window_size, 1):
    fcst.fit(
        train,
        id_col='index',
        time_col='ds',
        target_col='y',
        static_features=['static_0', 'static_1'],
    )
    pred = fcst.predict(window_size)
    res = valid[['ds', 'y']].copy()
    res['cutoff'] = cutoff
    # Same column order as the one returned by cross_validation.
    res = res[['ds', 'cutoff', 'y']].copy()
    # Attach the predictions to the actuals of this window.
    manual_results.append(res.merge(pred, on=['unique_id', 'ds'], how='left'))
manual_results = pd.concat(manual_results)
pd.testing.assert_frame_equal(backtest_results, manual_results)