In [None]:
#|default_exp distributed.forecast

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

In [None]:
#|hide
from fastcore.test import test_warns
from nbdev import show_doc
from sklearn import set_config

# DistributedMLForecast

> Distributed pipeline encapsulation

**This interface is only tested on Linux**

In [None]:
#|export
import warnings
from typing import Callable, Iterable, List, Optional

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client, default_client
from sklearn.base import clone

from mlforecast.core import (
    DateFeature,
    Differences,
    Freq,
    LagTransforms,
    Lags,
    Models,
    TimeSeries,
    _name_models,
)
from mlforecast.distributed.core import DistributedTimeSeries
from mlforecast.utils import backtest_splits

In [None]:
#|hide
set_config(display='text')
warnings.simplefilter('ignore', UserWarning)

In [None]:
#|export
class DistributedMLForecast:
    """Distributed pipeline encapsulation."""

    def __init__(
        self,
        models: Models,
        freq: Optional[Freq] = None,
        lags: Optional[Lags] = None,
        lag_transforms: Optional[LagTransforms] = None,
        date_features: Optional[Iterable[DateFeature]] = None,
        differences: Optional[Differences] = None,
        num_threads: int = 1,
        client: Optional[Client] = None,
    ):
        """Create distributed forecast object

        Parameters
        ----------
        models : regressor, list of regressors or dict of str to regressor
            Models that will be trained and used to compute the forecasts.
            A single regressor is wrapped in a list; lists are converted to a dict
            whose keys are derived from the class names of the models.
        freq : str or int, optional (default=None)
            Pandas offset alias, e.g. 'D', 'W-THU' or integer denoting the frequency of the series.
        lags : list of int, optional (default=None)
            Lags of the target to use as features.
        lag_transforms : dict of int to list of functions, optional (default=None)
            Mapping of target lags to their transformations.
        date_features : list of str or callable, optional (default=None)
            Features computed from the dates. Can be pandas date attributes or functions that will take the dates as input.
        differences : list of int, optional (default=None)
            Differences to take of the target before computing the features. These are restored at the forecasting step.
        num_threads : int (default=1)
            Number of threads to use when computing the features.
        client : dask distributed client, optional (default=None)
            Client to use for computing data and training the models.
            If not provided, `dask.distributed.default_client()` is used.
        """
        if not isinstance(models, (dict, list)):
            models = [models]
        if isinstance(models, list):
            # build the name -> model mapping from the class names
            model_names = _name_models([m.__class__.__name__ for m in models])
            models = dict(zip(model_names, models))
        self.models = models
        self.client = client or default_client()
        self.dts = DistributedTimeSeries(
            TimeSeries(
                freq, lags, lag_transforms, date_features, differences, num_threads
            ),
            self.client,
        )

    def __repr__(self) -> str:
        return (
            f'{self.__class__.__name__}(models=[{", ".join(self.models.keys())}], '
            f"freq={self.freq}, "
            f"lag_features={list(self.dts._base_ts.transforms.keys())}, "
            f"date_features={self.dts._base_ts.date_features}, "
            f"num_threads={self.dts._base_ts.num_threads}, "
            f"client={self.client})"
        )

    @property
    def freq(self):
        """Frequency held by the underlying time series object."""
        return self.dts._base_ts.freq

    def preprocess(
        self,
        data: dd.DataFrame,
        id_col: str = 'index',
        time_col: str = 'ds',
        target_col: str = 'y',
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> dd.DataFrame:
        """Add the features to `data`.

        If `id_col` is one of the columns of `data` it is set as the index
        (a warning is issued because this is a slow operation).

        Parameters
        ----------
        data : dask DataFrame
            Series data in long format.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.

        Returns
        -------
        result : dask DataFrame.
            `data` plus added features.
        """
        if id_col in data:
            warnings.warn('It is recommended to have id_col as the index, since setting the index is a slow operation.')
            data = data.set_index(id_col)
            # from here on the ids live in the index
            id_col = 'index'
        return self.dts.fit_transform(data, id_col, time_col, target_col, static_features, dropna, keep_last_n)

    def fit_models(
        self,
        X: dd.DataFrame,
        y: dd.Series,
    ) -> 'DistributedMLForecast':
        """Manually train models. Use this if you called `DistributedMLForecast.preprocess` beforehand.

        Each model is cloned before fitting, so the objects in `self.models`
        are left untouched. The `model_` attribute of every fitted clone is
        stored in `self.models_` under the model's name.

        Parameters
        ----------
        X : dask DataFrame
            Features.
        y : dask Series.
            Target.

        Returns
        -------
        self : DistributedMLForecast
            Forecast object with trained models.
        """
        self.models_ = {}
        for name, model in self.models.items():
            self.models_[name] = clone(model).fit(X, y).model_
        return self

    def fit(
        self,
        data: dd.DataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> 'DistributedMLForecast':
        """Apply the feature engineering and train the models.

        Parameters
        ----------
        data : dask DataFrame
            Series data in long format.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.

        Returns
        -------
        self : DistributedMLForecast
            Forecast object with series values and trained models.
        """
        train_ddf = self.preprocess(data, id_col, time_col, target_col, static_features, dropna, keep_last_n)
        # everything except the timestamps and the target is used as a feature
        X, y = train_ddf.drop(columns=[time_col, target_col]), train_ddf[target_col]
        self.fit_models(X, y)
        return self

    def predict(
        self,
        horizon: int,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        before_predict_callback: Optional[Callable] = None,
        after_predict_callback: Optional[Callable] = None,
    ) -> dd.DataFrame:
        """Compute the predictions for the next `horizon` steps.

        Parameters
        ----------
        horizon : int
            Number of periods to predict.
        dynamic_dfs : list of pandas DataFrame, optional (default=None)
            Future values of the dynamic features, e.g. prices.
        before_predict_callback : callable, optional (default=None)
            Function to call on the features before computing the predictions.
                This function will take the input dataframe that will be passed to the model for predicting and should return a dataframe with the same structure.
                The series identifier is on the index.
        after_predict_callback : callable, optional (default=None)
            Function to call on the predictions before updating the targets.
                This function will take a pandas Series with the predictions and should return another one with the same structure.
                The series identifier is on the index.

        Returns
        -------
        result : dask DataFrame
            Predictions for each series and timestep, with one column per model.

        Raises
        ------
        AttributeError
            If no trained models are available (`models_` has not been set).
        """
        if not hasattr(self, 'models_'):
            # same exception type as the bare attribute lookup, with an actionable message
            raise AttributeError(
                f'{self.__class__.__name__} has no trained models. '
                'Call `fit` or `fit_models` before calling `predict`.'
            )
        return self.dts.predict(
            self.models_,
            horizon,
            dynamic_dfs,
            before_predict_callback,
            after_predict_callback,
        )

    def cross_validation(
        self,
        data: dd.DataFrame,
        n_windows: int,
        window_size: int,
        id_col: str,
        time_col: str,
        target_col: str,
        step_size: Optional[int] = None,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        before_predict_callback: Optional[Callable] = None,
        after_predict_callback: Optional[Callable] = None,
    ) -> dd.DataFrame:
        """Perform time series cross validation.
        Creates `n_windows` splits where each window has `window_size` test periods,
        trains the models, computes the predictions and merges the actuals.

        The models are refit on every window. The trained models of each window
        are collected in `self.cv_models_` and, after this method returns,
        `self.models_` holds the models of the last window.

        Parameters
        ----------
        data : dask DataFrame
            Series data in long format.
        n_windows : int
            Number of windows to evaluate.
        window_size : int
            Number of test periods in each window.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        step_size : int, optional (default=None)
            Step size between each cross validation window. If None it will be equal to `window_size`.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.
        dynamic_dfs : list of pandas DataFrame, optional (default=None)
            Future values of the dynamic features, e.g. prices.
        before_predict_callback : callable, optional (default=None)
            Function to call on the features before computing the predictions.
                This function will take the input dataframe that will be passed to the model for predicting and should return a dataframe with the same structure.
                The series identifier is on the index.
        after_predict_callback : callable, optional (default=None)
            Function to call on the predictions before updating the targets.
                This function will take a pandas Series with the predictions and should return another one with the same structure.
                The series identifier is on the index.

        Returns
        -------
        result : dask DataFrame
            Predictions for each window with the series id, timestamp, last train date, target value and predictions from each model.
        """
        results = []
        self.cv_models_ = []
        if id_col != 'index':
            data = data.set_index(id_col)

        def renames(df):
            # work internally with the standard names: ds, y and unique_id (index)
            mapper = {time_col: 'ds', target_col: 'y'}
            df = df.rename(columns=mapper, copy=False)
            df.index.name = 'unique_id'
            return df

        def merge_fn(res, pred):
            # attach the predictions of every model to the actuals of the window
            return res.merge(pred, on=['unique_id', 'ds'], how='left')

        data = data.map_partitions(renames)

        # integer timestamps advance one unit at a time, otherwise use the configured frequency
        if np.issubdtype(data['ds'].dtype.type, np.integer):
            freq = 1
        else:
            freq = self.freq
        for train_end, train, valid in backtest_splits(data, n_windows, window_size, freq, step_size):
            self.fit(train, 'index', 'ds', 'y', static_features, dropna, keep_last_n)
            self.cv_models_.append(self.models_)
            y_pred = self.predict(
                window_size, dynamic_dfs, before_predict_callback, after_predict_callback,
            )
            result = valid[['ds', 'y']].copy()
            result['cutoff'] = train_end
            meta = {**result.dtypes.to_dict(), **y_pred.dtypes.to_dict()}
            result = result.map_partitions(merge_fn, y_pred, align_dataframes=False, meta=meta)
            if id_col != 'index':
                result = result.reset_index()
            # restore the names provided by the user
            result = result.rename(columns={'ds': time_col, 'y': target_col, 'unique_id': id_col})
            results.append(result)

        return dd.concat(results)

In [None]:
#| hide
#| export
class DistributedForecast(DistributedMLForecast):
    """Deprecated alias of `DistributedMLForecast`.

    Takes the same arguments as `DistributedMLForecast` and forwards them
    unchanged, emitting a `DeprecationWarning` on instantiation.
    """

    def __init__(
        self,
        models: Models,
        freq: Optional[Freq] = None,
        lags: Optional[Lags] = None,
        lag_transforms: Optional[LagTransforms] = None,
        date_features: Optional[Iterable[DateFeature]] = None,
        differences: Optional[Differences] = None,
        num_threads: int = 1,
        client: Optional[Client] = None,
    ):
        warning_msg = (
            'The DistributedForecast class is deprecated and will be removed in a future version, '
            'please use the DistributedMLForecast class instead.'
        )
        # stacklevel=2 attributes the warning to the code instantiating the class
        # instead of this line, so it points users at the call they need to change.
        warnings.warn(warning_msg, DeprecationWarning, stacklevel=2)
        super().__init__(models, freq, lags, lag_transforms, date_features, differences, num_threads, client)

In [None]:
show_doc(DistributedMLForecast)

---

### DistributedMLForecast

>      DistributedMLForecast (models:Union[sklearn.base.BaseEstimator,List[sklea
>                             rn.base.BaseEstimator],Dict[str,sklearn.base.BaseE
>                             stimator]], freq:Union[int,str,NoneType]=None,
>                             lags:Optional[Iterable[int]]=None, lag_transforms:
>                             Optional[Dict[int,List[Union[Callable,Tuple[Callab
>                             le,Any]]]]]=None, date_features:Optional[Iterable[
>                             Union[str,Callable]]]=None,
>                             differences:Optional[Iterable[int]]=None,
>                             num_threads:int=1,
>                             client:Optional[distributed.client.Client]=None)

Distributed pipeline encapsulation.

The `DistributedMLForecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing predictions) and applies them in a distributed way.

## Example
This shows an example with simulated data, for a real world example in a remote cluster you can check the [M5 distributed example](https://www.kaggle.com/lemuz90/m5-mlforecast-distributed).

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

from mlforecast.utils import backtest_splits, generate_daily_series, generate_prices_for_series
from mlforecast.distributed.models.lgb import LGBMForecast
from mlforecast.distributed.models.xgb import XGBForecast

Compared to `MLForecast`, using `DistributedMLForecast` requires the following:

1. You need to set up a `dask.distributed.Client`. If this client is connected to a remote cluster then the process will run there.
2. Your data needs to be a `dask.dataframe.DataFrame`.
3. You need to use a model that implements distributed training (either XGBForecast or LGBMForecast).

### Client setup

Here we define a client that connects to a `dask.distributed.LocalCluster`, however it could be any other kind of cluster.

In [None]:
client = Client(n_workers=2, threads_per_worker=1)

In [None]:
#| hide
test_warns(lambda: DistributedForecast([]))

### Data setup

The data is given as a `dask.dataframe.DataFrame`. You need to make sure that each time series is contained in a single partition, and it is recommended that you have as many partitions as you have workers. If you have more partitions than workers make sure to set `num_threads=1` to avoid nested parallelism.

The required input format is the same as for `MLForecast`, except that it's a `dask.dataframe.DataFrame` instead of a `pandas.DataFrame`.

In [None]:
series = generate_daily_series(100, n_static_features=2, equal_ends=True, static_as_categorical=False)
partitioned_series = dd.from_pandas(series, npartitions=10)
partitioned_series

Unnamed: 0_level_0,ds,y,static_0,static_1
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id_00,datetime64[ns],float64,int64,int64
id_10,...,...,...,...
...,...,...,...,...
id_89,...,...,...,...
id_99,...,...,...,...


### Models
In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `dask`. The current implementations are in `LGBMForecast` and `XGBForecast` which are just wrappers around `lightgbm.dask.DaskLGBMRegressor` and `xgboost.dask.DaskXGBRegressor` that add a `model_` property to get the trained model from them and send it to every worker to perform the predictions step.

In [None]:
models = [XGBForecast(random_state=0), LGBMForecast(random_state=0)]

### Training
Once we have our models we instantiate a `DistributedMLForecast` object defining our features.

In [None]:
fcst = DistributedMLForecast(
    models=models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,
)
fcst

DistributedMLForecast(models=[XGBForecast, LGBMForecast], freq=<Day>, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:44271' processes=2 threads=2, memory=15.50 GiB>)

Here we say that:

* Our series have daily frequency.
* We want to use lag 7 as a feature
* We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 14
* We want to use dayofweek and month as date features.
* We want to perform the preprocessing and the forecasting steps using 1 thread, because we have 10 partitions and 2 workers.

From this point we have two options:

1. Compute the features and fit our models.
2. Compute the features and get them back as a dataframe to do some custom splitting or adding additional features, then training the models.

#### 1. Using all the data

In [None]:
show_doc(DistributedMLForecast.fit)

---

### DistributedMLForecast.fit

>      DistributedMLForecast.fit (data:dask.dataframe.core.DataFrame,
>                                 id_col:str, time_col:str, target_col:str,
>                                 static_features:Optional[List[str]]=None,
>                                 dropna:bool=True,
>                                 keep_last_n:Optional[int]=None)

Apply the feature engineering and train the models.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | DataFrame |  | Series data in long format. |
| id_col | str |  | Column that identifies each serie. If 'index' then the index is used. |
| time_col | str |  | Column that identifies each timestep, its values can be timestamps or integers. |
| target_col | str |  | Column that contains the target. |
| static_features | typing.Optional[typing.List[str]] | None | Names of the features that are static and will be repeated when forecasting. |
| dropna | bool | True | Drop rows with missing values produced by the transformations. |
| keep_last_n | typing.Optional[int] | None | Keep only these many records from each serie for the forecasting step. Can save time and memory if your features allow it. |
| **Returns** | **DistributedMLForecast** |  | **Forecast object with series values and trained models.** |

Calling `fit` on our data computes the features independently for each partition and performs distributed training.

In [None]:
fcst.fit(partitioned_series, id_col='index', time_col='ds', target_col='y')

  client.wait_for_workers(n_workers)
[20:01:39] task [xgboost.dask-0]:tcp://127.0.0.1:43841 got new rank 0
[20:01:40] task [xgboost.dask-1]:tcp://127.0.0.1:39221 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 44029...
[LightGBM] [Info] Binding port 44029 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 54951...
[LightGBM] [Info] Binding port 54951 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Local rank: 1, total number of machines: 2


DistributedMLForecast(models=[XGBForecast, LGBMForecast], freq=<Day>, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:44271' processes=2 threads=2, memory=15.50 GiB>)

### Forecasting

In [None]:
show_doc(DistributedMLForecast.predict)

---

### DistributedMLForecast.predict

>      DistributedMLForecast.predict (horizon:int,
>                                     dynamic_dfs:Optional[List[pandas.core.fram
>                                     e.DataFrame]]=None, before_predict_callbac
>                                     k:Optional[Callable]=None, after_predict_c
>                                     allback:Optional[Callable]=None)

Compute the predictions for the next `horizon` steps.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| horizon | int |  | Number of periods to predict. |
| dynamic_dfs | typing.Optional[typing.List[pandas.core.frame.DataFrame]] | None | Future values of the dynamic features, e.g. prices. |
| before_predict_callback | typing.Optional[typing.Callable] | None | Function to call on the features before computing the predictions.<br>    This function will take the input dataframe that will be passed to the model for predicting and should return a dataframe with the same structure.<br>    The series identifier is on the index. |
| after_predict_callback | typing.Optional[typing.Callable] | None | Function to call on the predictions before updating the targets.<br>    This function will take a pandas Series with the predictions and should return another one with the same structure.<br>    The series identifier is on the index.   |
| **Returns** | **DataFrame** |  | **Predictions for each serie and timestep, with one column per model.** |

Once we have our fitted models we can compute the predictions for the next 7 timesteps.

In [None]:
preds = fcst.predict(7)
preds

Unnamed: 0_level_0,ds,XGBForecast,LGBMForecast
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id_00,datetime64[ns],float32,float64
id_10,...,...,...
...,...,...,...
id_89,...,...,...
id_99,...,...,...


In [None]:
#|hide
preds = preds.compute()
preds2 = fcst.predict(7).compute()
pd.testing.assert_frame_equal(preds, preds2)

In [None]:
##|hide
# NOTE(review): the directive above has a doubled '#'; if this cell is meant to be
# hidden from the docs it should probably read '#|hide' -- confirm.
# Check that integer timestamps and non-default column names produce the same
# predictions as the standard layout (ids on the index, 'ds' and 'y' columns).
non_std_series = partitioned_series.copy()
# replace the dates with an integer counter within each series
non_std_series['ds'] = non_std_series.map_partitions(lambda part: part.groupby('unique_id').cumcount())
# move the ids to a regular column and use custom names for id, time and target
non_std_series = non_std_series.reset_index().rename(columns={'ds': 'time', 'y': 'value', 'unique_id': 'some_id'})
flow_params = dict(
    models=[XGBForecast(random_state=0)],
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    num_threads=1,
)
# reference: standard layout with a daily frequency
fcst = DistributedMLForecast(freq='D', **flow_params)
fcst.fit(partitioned_series, id_col='index', time_col='ds', target_col='y')
preds = fcst.predict(7).compute()
# no freq is given here since the timestamps are integers
fcst2 = DistributedMLForecast(**flow_params)
# only the features are computed for fcst2; its models are copied from fcst below
fcst2.preprocess(non_std_series, id_col='some_id', time_col='time', target_col='value')
fcst2.models_ = fcst.models_  # distributed training can end up with different fits
non_std_preds = fcst2.predict(7).compute()
non_std_preds.index.name = 'unique_id'
# the time columns differ by construction (dates vs integers), so only the predictions are compared
pd.testing.assert_frame_equal(preds.drop(columns='ds'), non_std_preds.drop(columns='time'))

  client.wait_for_workers(n_workers)
[20:01:43] task [xgboost.dask-0]:tcp://127.0.0.1:43841 got new rank 0
[20:01:43] task [xgboost.dask-1]:tcp://127.0.0.1:39221 got new rank 1


#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `preprocess` with our data.

In [None]:
show_doc(DistributedMLForecast.preprocess)

---

### DistributedMLForecast.preprocess

>      DistributedMLForecast.preprocess (data:dask.dataframe.core.DataFrame,
>                                        id_col:str='index', time_col:str='ds',
>                                        target_col:str='y', static_features:Opt
>                                        ional[List[str]]=None,
>                                        dropna:bool=True,
>                                        keep_last_n:Optional[int]=None)

Add the features to `data`.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | DataFrame |  | Series data in long format. |
| id_col | str | index | Column that identifies each serie. If 'index' then the index is used. |
| time_col | str | ds | Column that identifies each timestep, its values can be timestamps or integers. |
| target_col | str | y | Column that contains the target. |
| static_features | typing.Optional[typing.List[str]] | None | Names of the features that are static and will be repeated when forecasting. |
| dropna | bool | True | Drop rows with missing values produced by the transformations. |
| keep_last_n | typing.Optional[int] | None | Keep only these many records from each serie for the forecasting step. Can save time and memory if your features allow it. |
| **Returns** | **DataFrame** |  | **`data` plus added features.** |

In [None]:
features_ddf = fcst.preprocess(partitioned_series)
features_ddf.head()

Unnamed: 0_level_0,ds,y,static_0,static_1,lag7,expanding_mean_lag1,rolling_mean_lag7_window_size14
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id_00,2000-10-25,49.766844,79,45,50.694639,25.001367,26.32006
id_00,2000-10-26,3.918347,79,45,3.88778,26.180675,26.313387
id_00,2000-10-27,9.437778,79,45,11.512774,25.168751,26.398056
id_00,2000-10-28,17.923574,79,45,18.038498,24.484796,26.425272
id_00,2000-10-29,26.754645,79,45,24.222859,24.211411,26.305563


This is useful if we want to inspect the data the model will be trained on. If we do this we must call `fit_models` to train our models.

In [None]:
X, y = features_ddf.drop(columns=['ds', 'y']), features_ddf['y']
fcst.fit_models(X, y)

  client.wait_for_workers(n_workers)
[20:01:45] task [xgboost.dask-0]:tcp://127.0.0.1:43841 got new rank 0
[20:01:46] task [xgboost.dask-1]:tcp://127.0.0.1:39221 got new rank 1


DistributedMLForecast(models=[XGBForecast], freq=<Day>, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=[], num_threads=1, client=<Client: 'tcp://127.0.0.1:44271' processes=2 threads=2, memory=15.50 GiB>)

In [None]:
#| hide
fcst.models_ = fcst2.models_

In [None]:
preds2 = fcst.predict(7).compute()
pd.testing.assert_frame_equal(preds, preds2)

#### Dynamic features

By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features like prices or a calendar with holidays you can pass them as a list to the `dynamic_dfs` argument of `DistributedMLForecast.predict`, which will call `pd.DataFrame.merge` on each of them in order.

Here's an example:

Suppose that we have a `product_id` column and we have a catalog for prices based on that `product_id` and the date.

In [None]:
dynamic_series = series.rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(dynamic_series)
prices_catalog

Unnamed: 0,ds,product_id,price
0,2000-06-09,1,0.548814
1,2000-06-10,1,0.715189
2,2000-06-11,1,0.602763
3,2000-06-12,1,0.544883
4,2000-06-13,1,0.423655
...,...,...,...
20180,2001-05-17,99,0.223520
20181,2001-05-18,99,0.446104
20182,2001-05-19,99,0.044783
20183,2001-05-20,99,0.483216


And you have already merged these prices into your series dataframe.

In [None]:
# rename the static feature so it matches the `product_id` column of the prices catalog
dynamic_series = partitioned_series.rename(columns={'static_1': 'product_id'})
# move the ids out of the index before merging (presumably so they are kept by the merge)
dynamic_series = dynamic_series.reset_index()
# no `on` is given, so the merge uses the columns that both dataframes share
series_with_prices = dynamic_series.merge(prices_catalog, how='left')
# put the ids back on the index; sorted=True tells dask the column is already sorted
series_with_prices = series_with_prices.set_index('unique_id', sorted=True)
series_with_prices.head()

Unnamed: 0_level_0,ds,y,static_0,product_id,price
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_00,2000-10-05,3.981198,79,45,0.570826
id_00,2000-10-06,10.327401,79,45,0.260562
id_00,2000-10-07,17.657474,79,45,0.274048
id_00,2000-10-08,25.89879,79,45,0.433878
id_00,2000-10-09,34.49404,79,45,0.653738


This dataframe will be passed to `DistributedMLForecast.fit` (or `DistributedMLForecast.preprocess`), however since the price is dynamic we have to tell that method that only `static_0` and `product_id` are static and we'll have to update `price` in every timestep, which basically involves merging the updated features with the prices catalog.

In [None]:
fcst = DistributedMLForecast(
    models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,
)
# `price` is left out of static_features on purpose: it changes over time
# and is supplied through `dynamic_dfs` when predicting.
fcst.fit(
    series_with_prices,
    id_col='index',
    time_col='ds',
    target_col='y',
    static_features=['static_0', 'product_id'],
)

  client.wait_for_workers(n_workers)
[20:01:47] task [xgboost.dask-0]:tcp://127.0.0.1:43841 got new rank 0
[20:01:47] task [xgboost.dask-1]:tcp://127.0.0.1:39221 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 42619...
[LightGBM] [Info] Trying to bind port 38827...
[LightGBM] [Info] Binding port 38827 succeeded
[LightGBM] [Info] Binding port 42619 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 1, total number of machines: 2


DistributedMLForecast(models=[XGBForecast, LGBMForecast], freq=<Day>, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:44271' processes=2 threads=2, memory=15.50 GiB>)

So in order to update the price in each timestep we just call `DistributedMLForecast.predict` with our forecast horizon and pass the prices catalog as a dynamic dataframe.

In [None]:
# Forecast seven steps ahead; the prices catalog is passed as a dynamic dataframe.
forecast_horizon = 7
preds = fcst.predict(forecast_horizon, dynamic_dfs=[prices_catalog])
preds.compute()

Unnamed: 0_level_0,ds,XGBForecast,LGBMForecast
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id_00,2001-05-15,42.222416,42.691265
id_00,2001-05-16,50.424877,50.089973
id_00,2001-05-17,1.957672,1.973238
id_00,2001-05-18,9.915949,10.105702
id_00,2001-05-19,18.617931,18.607521
...,...,...,...
id_99,2001-05-17,43.245510,44.233036
id_99,2001-05-18,2.138254,2.065753
id_99,2001-05-19,9.049370,9.062783
id_99,2001-05-20,15.213605,15.279421


#### Custom predictions

If you want to do something like scaling the predictions you can define a function and pass it to `DistributedMLForecast.predict` as described in <a href="/mlforecast/forecast.html#Custom-predictions">Custom predictions</a>.

### Cross validation
Refer to `MLForecast.cross_validation`.

In [None]:
show_doc(DistributedMLForecast.cross_validation)

---

### DistributedMLForecast.cross_validation

>      DistributedMLForecast.cross_validation (data:pandas.core.frame.DataFrame,
>                                              n_windows:int, window_size:int,
>                                              id_col:str, time_col:str,
>                                              target_col:str,
>                                              step_size:Optional[int]=None, sta
>                                              tic_features:Optional[List[str]]=
>                                              None, dropna:bool=True,
>                                              keep_last_n:Optional[int]=None, d
>                                              ynamic_dfs:Optional[List[pandas.c
>                                              ore.frame.DataFrame]]=None, befor
>                                              e_predict_callback:Optional[Calla
>                                              ble]=None, after_predict_callback
>                                              :Optional[Callable]=None)

Perform time series cross validation.
Creates `n_windows` splits where each window has `window_size` test periods, 
trains the models, computes the predictions and merges the actuals.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | DataFrame |  | Series data in long format. |
| n_windows | int |  | Number of windows to evaluate. |
| window_size | int |  | Number of test periods in each window. |
| id_col | str |  | Column that identifies each serie. If 'index' then the index is used. |
| time_col | str |  | Column that identifies each timestep, its values can be timestamps or integers. |
| target_col | str |  | Column that contains the target. |
| step_size | typing.Optional[int] | None | Step size between each cross validation window. If None it will be equal to `window_size`. |
| static_features | typing.Optional[typing.List[str]] | None | Names of the features that are static and will be repeated when forecasting. |
| dropna | bool | True | Drop rows with missing values produced by the transformations. |
| keep_last_n | typing.Optional[int] | None | Keep only these many records from each serie for the forecasting step. Can save time and memory if your features allow it. |
| dynamic_dfs | typing.Optional[typing.List[pandas.core.frame.DataFrame]] | None | Future values of the dynamic features, e.g. prices. |
| before_predict_callback | typing.Optional[typing.Callable] | None | Function to call on the features before computing the predictions.<br>    This function will take the input dataframe that will be passed to the model for predicting and should return a dataframe with the same structure.<br>    The series identifier is on the index. |
| after_predict_callback | typing.Optional[typing.Callable] | None | Function to call on the predictions before updating the targets.<br>    This function will take a pandas Series with the predictions and should return another one with the same structure.<br>    The series identifier is on the index.                |
| **Returns** | **dask DataFrame** |  | **Predictions for each window with the series id, timestamp, last train date, target value and predictions from each model.** |

In [None]:
# Two cross validation windows, each holding out 14 test periods.
n_windows, window_size = 2, 14

cv_results = fcst.cross_validation(
    data=partitioned_series,
    n_windows=n_windows,
    window_size=window_size,
    id_col='index',
    time_col='ds',
    target_col='y',
)
cv_results

  client.wait_for_workers(n_workers)
[20:01:54] task [xgboost.dask-0]:tcp://127.0.0.1:43841 got new rank 0
[20:01:54] task [xgboost.dask-1]:tcp://127.0.0.1:39221 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 39961...
[LightGBM] [Info] Binding port 39961 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 43493...
[LightGBM] [Info] Binding port 43493 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 1, total number of machines: 2


  client.wait_for_workers(n_workers)
[20:01:57] task [xgboost.dask-0]:tcp://127.0.0.1:43841 got new rank 0
[20:01:57] task [xgboost.dask-1]:tcp://127.0.0.1:39221 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 57949...
[LightGBM] [Info] Binding port 57949 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 40245...
[LightGBM] [Info] Binding port 40245 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 1, total number of machines: 2


Unnamed: 0_level_0,ds,y,cutoff,XGBForecast,LGBMForecast
npartitions=20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],float64,datetime64[ns],float32,float64
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
# Average the actuals and each model's predictions per date. `cutoff` is a
# timestamp marking the end of the training window, so averaging it is not
# meaningful; drop it before aggregating.
agg_results = (
    cv_results.compute()
    .drop(columns='cutoff')
    .groupby('ds')
    .mean()
)
agg_results.head()

Unnamed: 0_level_0,y,XGBForecast,LGBMForecast
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-04-17,16.123231,16.188124,16.184217
2001-04-18,15.21392,15.153634,15.135103
2001-04-19,16.985699,17.114965,17.134767
2001-04-20,18.06834,18.045998,18.009906
2001-04-21,18.200609,18.134045,18.115785


We can also compute the error for each model.

In [None]:
def mse_from_dask_dataframe(ddf):
    """Compute the mean squared error of every model column in `ddf`.

    Every column other than 'ds', 'y' and 'cutoff' is treated as a model's
    predictions and compared against 'y'. The per-model means are submitted
    together through the notebook-level `client` and gathered into a dict
    keyed by model name.
    """
    model_names = ddf.columns.drop(['ds', 'y', 'cutoff'])
    pending_mses = {
        name: (ddf['y'] - ddf[name]).pow(2).mean()
        for name in model_names
    }
    futures = client.compute(pending_mses)
    return client.gather(futures)

# Show each model's MSE rounded to two decimals.
model_mses = mse_from_dask_dataframe(cv_results)
{name: round(mse, 2) for name, mse in model_mses.items()}

{'XGBForecast': 0.89, 'LGBMForecast': 0.92}

In [None]:
#|hide
# Consistency check: the output of `cross_validation` must equal what we get by
# iterating over the backtest splits by hand and predicting with the models
# stored for each window.
fcst = DistributedMLForecast(XGBForecast(random_state=0), lags=[7, 14])
backtest_results = fcst.cross_validation(
    non_std_series,
    n_windows,
    window_size,
    id_col='some_id',
    time_col='time',
    target_col='value',
    static_features=['static_0', 'static_1'],    
).compute()
# Map the non-standard column names to unique_id/ds/y so both sides are comparable.
renamer = {'some_id': 'unique_id', 'time': 'ds', 'value': 'y'}
backtest_results = backtest_results.rename(columns=renamer).set_index('unique_id')
renamed = non_std_series.rename(columns=renamer).set_index('unique_id')
# Models kept by the cross validation run, indexed by window below.
cv_models = fcst.cv_models_
manual_results = []
for i, (cutoff, train, valid) in enumerate(backtest_splits(renamed, n_windows, window_size, 1)):
    # Rebuild the features from this window's training data only (no refit).
    fcst.preprocess(train)
    # Reuse the models stored for window i (presumably the ones trained on it
    # during cross validation -- confirm against `cross_validation`).
    fcst.models_ = cv_models[i]
    pred = fcst.predict(window_size).compute()
    res = valid[['ds', 'y']].compute()
    res['cutoff'] = cutoff
    # Left join keeps every validation row, attaching the predictions to the actuals.
    res = res.merge(pred, on=['unique_id', 'ds'], how='left')
    manual_results.append(res)
manual_results = pd.concat(manual_results)
pd.testing.assert_frame_equal(backtest_results, manual_results)

  client.wait_for_workers(n_workers)
[20:02:01] task [xgboost.dask-0]:tcp://127.0.0.1:43841 got new rank 0
[20:02:01] task [xgboost.dask-1]:tcp://127.0.0.1:39221 got new rank 1
  client.wait_for_workers(n_workers)
[20:02:02] task [xgboost.dask-0]:tcp://127.0.0.1:43841 got new rank 0
[20:02:02] task [xgboost.dask-1]:tcp://127.0.0.1:39221 got new rank 1


In [None]:
# Close the Dask client now that the examples are done.
client.close()