In [None]:
#|default_exp distributed.fugue

In [None]:
#|hide
import warnings

from nbdev import show_doc
from sklearn import set_config

In [None]:
#|hide
# Silence FutureWarning messages so they do not clutter the rendered cell outputs.
warnings.simplefilter('ignore', FutureWarning)
# Display scikit-learn estimators using their plain-text representation.
set_config(display='text')

# DistributedMLForecast

> Distributed pipeline encapsulation

**This interface is only tested on Linux**

In [None]:
#|export
import copy
from collections import namedtuple
from typing import Any, Callable, Dict, Iterable, List, Optional, Union

import cloudpickle
try:
    import dask.dataframe as dd
    DASK_INSTALLED = True
except ModuleNotFoundError:
    DASK_INSTALLED = False
import fugue
import fugue.api as fa
import pandas as pd
try:
    from pyspark.ml.feature import VectorAssembler
    from pyspark.sql import DataFrame as SparkDataFrame
    SPARK_INSTALLED = True
except ModuleNotFoundError:
    SPARK_INSTALLED = False
from sklearn.base import BaseEstimator, clone

from mlforecast.core import (
    DateFeature,
    Differences,
    Freq,
    LagTransforms,
    Lags,
    TimeSeries,
    _name_models,
)

In [None]:
#|exporti
# Describes a single cross validation split: the total number of windows, the number
# of test periods per window, the step between consecutive windows and the
# zero-based position of the current window.
WindowInfo = namedtuple('WindowInfo', 'n_windows window_size step_size i_window')

In [None]:
#|export
class DistributedMLForecast:
    """Multi backend distributed pipeline"""

    def __init__(
        self,
        models,
        freq: Optional[Freq] = None,
        lags: Optional[Lags] = None,
        lag_transforms: Optional[LagTransforms] = None,
        date_features: Optional[Iterable[DateFeature]] = None,
        differences: Optional[Differences] = None,
        num_threads: int = 1,
        engine = None,
    ):
        """Create distributed forecast object

        Parameters
        ----------
        models : regressor, list of regressors or dict from str to regressor
            Models that will be trained and used to compute the forecasts.
            If a dict is provided its keys are used as the model names,
            otherwise the names are derived from the class names.
        freq : str or int, optional (default=None)
            Pandas offset alias, e.g. 'D', 'W-THU' or integer denoting the frequency of the series.
        lags : list of int, optional (default=None)
            Lags of the target to use as features.
        lag_transforms : dict of int to list of functions, optional (default=None)
            Mapping of target lags to their transformations.
        date_features : list of str or callable, optional (default=None)
            Features computed from the dates. Can be pandas date attributes or functions that will take the dates as input.
        differences : list of int, optional (default=None)
            Differences to take of the target before computing the features. These are restored at the forecasting step.
        num_threads : int (default=1)
            Number of threads to use when computing the features.
        engine : fugue execution engine, optional (default=None)
            Dask Client, Spark Session, etc to use for the distributed computation.
            If None will use default depending on input type.
        """
        # normalize a single estimator into a list so that it gets a name below
        if not isinstance(models, (dict, list)):
            models = [models]
        if isinstance(models, list):
            model_names = _name_models([m.__class__.__name__ for m in models])
            models_with_names = dict(zip(model_names, models))
        else:
            models_with_names = models
        self.models = models_with_names
        # template that is deep-copied and fitted independently on every partition
        self._base_ts = TimeSeries(
            freq, lags, lag_transforms, date_features, differences, num_threads
        )
        self.engine = engine

    def __repr__(self) -> str:
        return (
            f'{self.__class__.__name__}(models=[{", ".join(self.models.keys())}], '
            f"freq={self._base_ts.freq}, "
            f"lag_features={list(self._base_ts.transforms.keys())}, "
            f"date_features={self._base_ts.date_features}, "
            f"num_threads={self._base_ts.num_threads}, "
            f"engine={self.engine})"
        )

    @staticmethod
    def _preprocess_partition(
        part: pd.DataFrame,
        base_ts: TimeSeries,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        window_info: Optional[WindowInfo] = None,
        ts_only: bool = False,
    ) -> List[List[Any]]:
        """Fit a copy of `base_ts` on a single partition.

        Returns a single row with three cloudpickle-serialized objects: the fitted
        time series object, the training set with the computed features and the
        validation set. The last two are None when `ts_only` is True, and the
        validation set is None when `window_info` is None.
        """
        # each partition gets its own copy so the template is never modified
        ts = copy.deepcopy(base_ts)
        if ts_only:
            ts._fit(
                part,
                id_col=id_col,
                time_col=time_col,
                target_col=target_col,
                static_features=static_features,
                keep_last_n=keep_last_n,
            )
            return [[cloudpickle.dumps(ts), cloudpickle.dumps(None), cloudpickle.dumps(None)]]
        if window_info is None:
            train = part
            valid = None
        else:
            n_windows, window_size, step_size, i_window = window_info
            if step_size is None:
                step_size = window_size
            # number of periods at the end of each serie that are reserved for testing
            test_size = window_size + step_size * (n_windows - 1)
            # distance from the last timestamp of each serie to the end of this window's training set
            offset = test_size - i_window * step_size
            max_dates = part.groupby(id_col)[time_col].transform('max')
            train_ends = max_dates - offset * base_ts.freq
            valid_ends = train_ends + window_size * base_ts.freq
            train_mask = part[time_col].le(train_ends)
            valid_mask = part[time_col].gt(train_ends) & part[time_col].le(valid_ends)
            train = part[train_mask]
            valid_keep_cols = part.columns
            if static_features is not None:
                # Index.drop returns a new Index, so the result has to be kept.
                valid_keep_cols = valid_keep_cols.drop(static_features)
            valid = part.loc[valid_mask, valid_keep_cols]
        transformed = ts.fit_transform(
            train,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            static_features=static_features,
            dropna=dropna,
            keep_last_n=keep_last_n,
        )
        return [[cloudpickle.dumps(ts), cloudpickle.dumps(transformed), cloudpickle.dumps(valid)]]

    @staticmethod
    def _retrieve_df(items: List[List[Any]]) -> Iterable[pd.DataFrame]:
        """Yield the deserialized training set of each partition result."""
        for _, serialized_train, _ in items:
            yield cloudpickle.loads(serialized_train)

    def _preprocess_partitions(
        self,
        data: fugue.AnyDataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        window_info: Optional[WindowInfo] = None,
    ) -> fugue.AnyDataFrame:
        """Apply `_preprocess_partition` to `data` through fugue.

        Returns a fugue dataframe with the binary columns `ts`, `train` and `valid`.
        """
        return fa.transform(
            data,
            DistributedMLForecast._preprocess_partition,
            params={
                'base_ts': self._base_ts,
                'id_col': id_col,
                'time_col': time_col,
                'target_col': target_col,
                'static_features': static_features,
                'dropna': dropna,
                'keep_last_n': keep_last_n,
                'window_info': window_info,
            },
            schema='ts:binary,train:binary,valid:binary',
            engine=self.engine,
            as_fugue=True,
        )

    def _preprocess(
        self,
        data: fugue.AnyDataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        window_info: Optional[WindowInfo] = None,
    ) -> fugue.AnyDataFrame:
        """Compute the features and store the settings and partition results on `self`.

        The stored settings and `self.partition_results` are later used by `predict`.
        """
        self.id_col = id_col
        self.time_col = time_col
        self.target_col = target_col
        self.static_features = static_features
        self.dropna = dropna
        self.keep_last_n = keep_last_n
        self.partition_results = self._preprocess_partitions(
            data=data,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            static_features=static_features,
            dropna=dropna,
            keep_last_n=keep_last_n,
            window_info=window_info,
        )
        # the output has the input columns followed by the new features, all declared as doubles
        base_schema = str(fa.get_schema(data))
        features_schema = ','.join(f'{feat}:double' for feat in self._base_ts.features)
        res = fa.transform(
            self.partition_results,
            DistributedMLForecast._retrieve_df,
            schema=f'{base_schema},{features_schema}',
            engine=self.engine,
        )
        return fa.get_native_as_df(res)

    def preprocess(
        self,
        data: fugue.AnyDataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> fugue.AnyDataFrame:
        """Add the features to `data`.

        Parameters
        ----------
        data : dask or spark DataFrame.
            Series data in long format.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.

        Returns
        -------
        result : same type as input
            data with added features.
        """
        return self._preprocess(
            data,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            static_features=static_features,
            dropna=dropna,
            keep_last_n=keep_last_n,
        )

    def _fit(
        self,
        data: fugue.AnyDataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        window_info: Optional[WindowInfo] = None,
    ) -> 'DistributedMLForecast':
        """Compute the features and train every model on them.

        The trained models are stored in `self.models_`, keyed by model name.

        Raises
        ------
        ValueError
            If a model isn't one of the supported types for the backend of `data`.
        NotImplementedError
            If `data` is neither a spark nor a dask dataframe.
        """
        prep = self._preprocess(
            data,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            static_features=static_features,
            dropna=dropna,
            keep_last_n=keep_last_n,
            window_info=window_info,
        )
        non_feature_cols = {id_col, time_col, target_col}
        features = [x for x in prep.columns if x not in non_feature_cols]
        self.models_ = {}
        if SPARK_INSTALLED and isinstance(data, SparkDataFrame):
            # the spark model libraries are optional, so they're imported only when needed
            try:
                import lightgbm as lgb
                from synapse.ml.lightgbm import LightGBMRegressor as SynapseLGBMRegressor
                LGBM_INSTALLED = True
            except ModuleNotFoundError:
                LGBM_INSTALLED = False
            try:
                import xgboost as xgb
                from xgboost.spark import SparkXGBRegressor  # type: ignore
                XGB_INSTALLED = True
            except ModuleNotFoundError:
                XGB_INSTALLED = False

            featurizer = VectorAssembler(inputCols=features, outputCol="features")
            train_data = featurizer.transform(prep)[target_col, "features"]
            for name, model in self.models.items():
                # the trained distributed model is converted to a local one
                # built from the serialized booster
                if LGBM_INSTALLED and isinstance(model, SynapseLGBMRegressor):
                    trained_model = model.setLabelCol(target_col).fit(train_data)
                    model_str = trained_model.getNativeModel()
                    local_model = lgb.Booster(model_str=model_str)
                elif XGB_INSTALLED and isinstance(model, SparkXGBRegressor):
                    model.setParams(label_col=target_col)
                    trained_model = model.fit(train_data)
                    model_str = trained_model.get_booster().save_raw('ubj')
                    local_model = xgb.XGBRegressor()
                    local_model.load_model(model_str)
                else:
                    raise ValueError('Only LightGBMRegressor from SynapseML and SparkXGBRegressor are supported in spark.')
                self.models_[name] = local_model
        elif DASK_INSTALLED and isinstance(data, dd.DataFrame):
            try:
                from mlforecast.distributed.models.lgb import LGBMForecast
                LGBM_INSTALLED = True
            except ModuleNotFoundError:
                LGBM_INSTALLED = False
            try:
                from mlforecast.distributed.models.xgb import XGBForecast
                XGB_INSTALLED = True
            except ModuleNotFoundError:
                XGB_INSTALLED = False
            X, y = prep[features], prep[target_col]
            for name, model in self.models.items():
                if not ((LGBM_INSTALLED and isinstance(model, LGBMForecast)) or (XGB_INSTALLED and isinstance(model, XGBForecast))):
                    raise ValueError('Models must be either LGBMForecast or XGBForecast with dask backend.')
                # train a clone so the estimator provided by the user is left untouched
                self.models_[name] = clone(model).fit(X, y).model_
        else:
            raise NotImplementedError('Only spark and dask engines are supported.')
        return self

    def fit(
        self,
        data: fugue.AnyDataFrame,
        id_col: str,
        time_col: str,
        target_col: str,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> 'DistributedMLForecast':
        """Apply the feature engineering and train the models.

        Parameters
        ----------
        data : dask or spark DataFrame
            Series data in long format.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.

        Returns
        -------
        self : DistributedMLForecast
            Forecast object with series values and trained models.
        """
        return self._fit(
            data,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            static_features=static_features,
            dropna=dropna,
            keep_last_n=keep_last_n,
        )

    @staticmethod
    def _predict(
        items: List[List[Any]],
        models,
        horizon,
        dynamic_dfs,
        before_predict_callback,
        after_predict_callback,
    ) -> Iterable[pd.DataFrame]:
        """Yield the forecasts computed from each partition result.

        When a partition result carries a validation set, its dynamic features (if any)
        replace `dynamic_dfs` and the validation set is left-merged onto the forecasts.
        """
        for serialized_ts, _, serialized_valid in items:
            valid = cloudpickle.loads(serialized_valid)
            ts = cloudpickle.loads(serialized_ts)
            # use a local name so an override for one item never carries over to the next one
            item_dynamic_dfs = dynamic_dfs
            if valid is not None:
                dynamic_features = valid.columns.drop(
                    [ts.id_col, ts.time_col, ts.target_col]
                )
                if not dynamic_features.empty:
                    item_dynamic_dfs = [valid.drop(columns=ts.target_col)]
            res = ts.predict(
                models=models,
                horizon=horizon,
                dynamic_dfs=item_dynamic_dfs,
                before_predict_callback=before_predict_callback,
                after_predict_callback=after_predict_callback,
            ).reset_index()
            if valid is not None:
                res = res.merge(valid, how='left')
            yield res

    def predict(
        self,
        horizon: int,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        before_predict_callback: Optional[Callable] = None,
        after_predict_callback: Optional[Callable] = None,
        new_data: Optional[fugue.AnyDataFrame] = None,
    ) -> fugue.AnyDataFrame:
        """Compute the predictions for the next `horizon` steps.

        Parameters
        ----------
        horizon : int
            Number of periods to predict.
        dynamic_dfs : list of pandas DataFrame, optional (default=None)
            Future values of the dynamic features, e.g. prices.
        before_predict_callback : callable, optional (default=None)
            Function to call on the features before computing the predictions.
                This function will take the input dataframe that will be passed to the model for predicting and should return a dataframe with the same structure.
                The series identifier is on the index.
        after_predict_callback : callable, optional (default=None)
            Function to call on the predictions before updating the targets.
                This function will take a pandas Series with the predictions and should return another one with the same structure.
                The series identifier is on the index.
        new_data : dask or spark DataFrame, optional (default=None)
            Series data of new observations for which forecasts are to be generated.
                This dataframe should have the same structure as the one used to fit the model, including any features and time series data.
                If `new_data` is not None, the method will generate forecasts for the new observations.

        Returns
        -------
        result : dask or spark DataFrame
            Predictions for each series and timestep, with one column per model.
        """
        model_names = self.models.keys()
        models_schema = ','.join(f'{model_name}:double' for model_name in model_names)
        # NOTE(review): the ids are declared as strings and the times as datetimes,
        # integer time columns may need a different schema -- confirm.
        schema = f'{self.id_col}:string,{self.time_col}:datetime,' + models_schema
        # `_n_windows` is set while `cross_validation` is running, in that case the
        # actuals are merged into the forecasts so the target must be part of the schema
        if getattr(self, '_n_windows', None) is not None:
            schema += f',{self.target_col}:double'
        if new_data is not None:
            partition_results = self._preprocess_partitions(
                data=new_data,
                id_col=self.id_col,
                time_col=self.time_col,
                target_col=self.target_col,
                static_features=self.static_features,
                dropna=self.dropna,
                keep_last_n=self.keep_last_n,
            )
        else:
            partition_results = self.partition_results
        res = fa.transform(
            partition_results,
            DistributedMLForecast._predict,
            params={
                'models': self.models_,
                'horizon': horizon,
                'dynamic_dfs': dynamic_dfs,
                'before_predict_callback': before_predict_callback,
                'after_predict_callback': after_predict_callback,
            },
            schema=schema,
            engine=self.engine,
        )
        return fa.get_native_as_df(res)

    def cross_validation(
        self,
        data: fugue.AnyDataFrame,
        n_windows: int,
        window_size: int,
        id_col: str,
        time_col: str,
        target_col: str,
        step_size: Optional[int] = None,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        before_predict_callback: Optional[Callable] = None,
        after_predict_callback: Optional[Callable] = None,
    ) -> fugue.AnyDataFrame:
        """Perform time series cross validation.
        Creates `n_windows` splits where each window has `window_size` test periods,
        trains the models, computes the predictions and merges the actuals.

        Parameters
        ----------
        data : dask or spark DataFrame
            Series data in long format.
        n_windows : int
            Number of windows to evaluate. Must be at least 1.
        window_size : int
            Number of test periods in each window.
        id_col : str
            Column that identifies each series. If 'index' then the index is used.
        time_col : str
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str
            Column that contains the target.
        step_size : int, optional (default=None)
            Step size between each cross validation window. If None it will be equal to `window_size`.
        static_features : list of str, optional (default=None)
            Names of the features that are static and will be repeated when forecasting.
        dropna : bool (default=True)
            Drop rows with missing values produced by the transformations.
        keep_last_n : int, optional (default=None)
            Keep only these many records from each series for the forecasting step. Can save time and memory if your features allow it.
        before_predict_callback : callable, optional (default=None)
            Function to call on the features before computing the predictions.
                This function will take the input dataframe that will be passed to the model for predicting and should return a dataframe with the same structure.
                The series identifier is on the index.
        after_predict_callback : callable, optional (default=None)
            Function to call on the predictions before updating the targets.
                This function will take a pandas Series with the predictions and should return another one with the same structure.
                The series identifier is on the index.

        Returns
        -------
        result : dask or spark DataFrame
            Predictions for each window with the series id, timestamp, target value and predictions from each model.

        Raises
        ------
        ValueError
            If `n_windows` is lower than 1.
        """
        if n_windows < 1:
            raise ValueError('`n_windows` must be at least 1.')
        self.cv_models_ = []
        results = []
        # `predict` adds the target column to its output schema only while `_n_windows`
        # is set, so it's set for the duration of the loop and restored afterwards
        # to keep regular forecasts free of the target column.
        previous_n_windows = getattr(self, '_n_windows', None)
        self._n_windows = n_windows
        try:
            for i in range(n_windows):
                window_info = WindowInfo(n_windows, window_size, step_size, i)
                self._fit(
                    data,
                    id_col=id_col,
                    time_col=time_col,
                    target_col=target_col,
                    static_features=static_features,
                    dropna=dropna,
                    keep_last_n=keep_last_n,
                    window_info=window_info,
                )
                self.cv_models_.append(self.models_)
                preds = self.predict(
                    window_size,
                    before_predict_callback=before_predict_callback,
                    after_predict_callback=after_predict_callback,
                )
                results.append(preds)
        finally:
            self._n_windows = previous_n_windows
        if len(results) == 1:
            return results[0]
        # fa.union takes the extra dataframes as separate positional arguments
        return fa.union(results[0], results[1], *results[2:])

In [None]:
# Render the documentation of the class (signature and docstring) in the docs.
show_doc(DistributedMLForecast)

---

### DistributedMLForecast

>      DistributedMLForecast (models,
>                             freq:Union[int,str,pandas._libs.tslibs.offsets.Bas
>                             eOffset,NoneType]=None,
>                             lags:Optional[Iterable[int]]=None, lag_transforms:
>                             Optional[Dict[int,List[Union[Callable,Tuple[Callab
>                             le,Any]]]]]=None, date_features:Optional[Iterable[
>                             Union[str,Callable]]]=None,
>                             differences:Optional[Iterable[int]]=None,
>                             num_threads:int=1, engine=None)

Multi backend distributed pipeline

The `DistributedMLForecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing predictions) and applies them in a distributed way.

## Example
This shows an example with simulated data.

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dask.distributed import Client
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

from mlforecast.utils import backtest_splits, generate_daily_series, generate_prices_for_series
from mlforecast.distributed.models.lgb import LGBMForecast
from mlforecast.distributed.models.xgb import XGBForecast

The different things that you need to use `DistributedMLForecast` (as opposed to `MLForecast`) are:

1. You need to set up a cluster. We currently support dask and spark (ray is on the roadmap).
2. Your data needs to be a distributed collection. We currently support dask and spark dataframes.
3. You need to use a model that implements distributed training in your framework of choice, e.g. SynapseML for LightGBM in spark.

### Dask

#### Client setup

In [None]:
client = Client(n_workers=2, threads_per_worker=1)

Here we define a client that connects to a `dask.distributed.LocalCluster`; however, it could be any other kind of cluster.

### Data setup

For dask, the data must be a `dask.dataframe.DataFrame`. You need to make sure that each time series is in only one partition, and it is recommended that you have as many partitions as you have workers. If you have more partitions than workers, make sure to set `num_threads=1` to avoid nested parallelism.

The required input format is the same as for `MLForecast`, except that it's a `dask.dataframe.DataFrame` instead of a `pandas.DataFrame`.

In [None]:
warnings.simplefilter('ignore', FutureWarning)

In [None]:
series = generate_daily_series(100, n_static_features=2, equal_ends=True, static_as_categorical=False)
partitioned_series = dd.from_pandas(series, npartitions=10).map_partitions(lambda df: df.reset_index())
partitioned_series['unique_id'] = partitioned_series['unique_id'].astype(str)
partitioned_series

Unnamed: 0_level_0,unique_id,ds,y,static_0,static_1
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_00,object,datetime64[ns],float64,int64,int64
id_10,...,...,...,...,...
...,...,...,...,...,...
id_89,...,...,...,...,...
id_99,...,...,...,...,...


#### Models
In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `dask`. The current implementations are in `LGBMForecast` and `XGBForecast` which are just wrappers around `lightgbm.dask.DaskLGBMRegressor` and `xgboost.dask.DaskXGBRegressor` that add a `model_` property to get the trained model from them and send it to every worker to perform the predictions step.

In [None]:
models = [XGBForecast(random_state=0), LGBMForecast(random_state=0)]

### Training
Once we have our models we instantiate a `DistributedMLForecast` object defining our features.

In [None]:
fcst = DistributedMLForecast(
    models=models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,
    engine=client,
)
fcst

DistributedMLForecast(models=[XGBForecast, LGBMForecast], freq=<Day>, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=1, engine=<Client: 'tcp://127.0.0.1:44923' processes=2 threads=2, memory=15.50 GiB>)

Here we say that:

* Our series have daily frequency.
* We want to use lag 7 as a feature
* We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 14
* We want to use dayofweek and month as date features.
* We want to perform the preprocessing and the forecasting steps using 1 thread, because we have 10 partitions and 2 workers.

From this point we have two options:

1. Compute the features and fit our models.
2. Compute the features and get them back as a dataframe to do some custom splitting or adding additional features, then training the models.

#### 1. Using all the data

In [None]:
show_doc(DistributedMLForecast.fit)

---

### DistributedMLForecast.fit

>      DistributedMLForecast.fit (data:~AnyDataFrame, id_col:str, time_col:str,
>                                 target_col:str,
>                                 static_features:Optional[List[str]]=None,
>                                 dropna:bool=True,
>                                 keep_last_n:Optional[int]=None)

Apply the feature engineering and train the models.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | AnyDataFrame |  | Series data in long format. |
| id_col | str |  | Column that identifies each serie. If 'index' then the index is used. |
| time_col | str |  | Column that identifies each timestep, its values can be timestamps or integers. |
| target_col | str |  | Column that contains the target. |
| static_features | Optional | None | Names of the features that are static and will be repeated when forecasting. |
| dropna | bool | True | Drop rows with missing values produced by the transformations. |
| keep_last_n | Optional | None | Keep only these many records from each serie for the forecasting step. Can save time and memory if your features allow it. |
| **Returns** | **DistributedMLForecast** |  | **Forecast object with series values and trained models.** |

Calling `fit` on our data computes the features independently for each partition and performs distributed training.

In [None]:
fcst.fit(partitioned_series, id_col='unique_id', time_col='ds', target_col='y')

### Forecasting

In [None]:
show_doc(DistributedMLForecast.predict)

---

### DistributedMLForecast.predict

>      DistributedMLForecast.predict (horizon:int,
>                                     dynamic_dfs:Optional[List[pandas.core.fram
>                                     e.DataFrame]]=None, before_predict_callbac
>                                     k:Optional[Callable]=None, after_predict_c
>                                     allback:Optional[Callable]=None, new_data:
>                                     Optional[pandas.core.frame.DataFrame]=None
>                                     )

Compute the predictions for the next `horizon` steps.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| horizon | int |  | Number of periods to predict. |
| dynamic_dfs | Optional | None | Future values of the dynamic features, e.g. prices. |
| before_predict_callback | Optional | None | Function to call on the features before computing the predictions.<br>    This function will take the input dataframe that will be passed to the model for predicting and should return a dataframe with the same structure.<br>    The series identifier is on the index. |
| after_predict_callback | Optional | None | Function to call on the predictions before updating the targets.<br>    This function will take a pandas Series with the predictions and should return another one with the same structure.<br>    The series identifier is on the index. |
| new_data | Optional | None | Series data of new observations for which forecasts are to be generated.<br>    This dataframe should have the same structure as the one used to fit the model, including any features and time series data.<br>    If `new_data` is not None, the method will generate forecasts for the new observations.                 |
| **Returns** | **AnyDataFrame** |  | **Predictions for each serie and timestep, with one column per model.** |

Once we have our fitted models we can compute the predictions for the next 7 timesteps.

In [None]:
preds = fcst.predict(7)
preds

Unnamed: 0_level_0,unique_id,ds,XGBForecast,LGBMForecast
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id_00,object,datetime64[ns],float64,float64
id_10,...,...,...,...
...,...,...,...,...
id_89,...,...,...,...
id_99,...,...,...,...


In [None]:
#|hide
preds = preds.compute()
preds2 = fcst.predict(7).compute()
preds3 = fcst.predict(7, new_data=partitioned_series).compute()
pd.testing.assert_frame_equal(preds, preds2)
pd.testing.assert_frame_equal(preds, preds3)

In [None]:
#|hide
non_std_series = partitioned_series.copy()
non_std_series['ds'] = non_std_series.map_partitions(lambda part: part.groupby('unique_id').cumcount())
non_std_series = non_std_series.rename(columns={'ds': 'time', 'y': 'value', 'unique_id': 'some_id'})
flow_params = dict(
    models=[XGBForecast(random_state=0)],
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    num_threads=1,
)
fcst = DistributedMLForecast(freq='D', **flow_params)
fcst.fit(partitioned_series, id_col='unique_id', time_col='ds', target_col='y')
preds = fcst.predict(7).compute()
fcst2 = DistributedMLForecast(**flow_params)
fcst2.preprocess(non_std_series, id_col='some_id', time_col='time', target_col='value')
fcst2.models_ = fcst.models_  # distributed training can end up with different fits
non_std_preds = fcst2.predict(7).compute()
pd.testing.assert_frame_equal(
    preds.drop(columns='ds'),
    non_std_preds.drop(columns='time').rename(columns={'some_id': 'unique_id'})
)

#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `preprocess` with our data.

In [None]:
show_doc(DistributedMLForecast.preprocess)

---

### DistributedMLForecast.preprocess

>      DistributedMLForecast.preprocess (data:~AnyDataFrame, id_col:str,
>                                        time_col:str, target_col:str, static_fe
>                                        atures:Optional[List[str]]=None,
>                                        dropna:bool=True,
>                                        keep_last_n:Optional[int]=None)

Add the features to `data`.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | AnyDataFrame |  | Series data in long format. |
| id_col | str |  | Column that identifies each serie. If 'index' then the index is used. |
| time_col | str |  | Column that identifies each timestep, its values can be timestamps or integers. |
| target_col | str |  | Column that contains the target. |
| static_features | Optional | None | Names of the features that are static and will be repeated when forecasting. |
| dropna | bool | True | Drop rows with missing values produced by the transformations. |
| keep_last_n | Optional | None | Keep only these many records from each serie for the forecasting step. Can save time and memory if your features allow it. |
| **Returns** | **AnyDataFrame** |  | **data with added features.** |

In [None]:
features_ddf = fcst.preprocess(partitioned_series, id_col='unique_id', time_col='ds', target_col='y')
features_ddf.head()

Unnamed: 0,unique_id,ds,y,static_0,static_1,lag7,expanding_mean_lag1,rolling_mean_lag7_window_size14
20,id_00,2000-10-25,49.766844,79,45,50.694639,25.001367,26.32006
21,id_00,2000-10-26,3.918347,79,45,3.88778,26.180675,26.313387
22,id_00,2000-10-27,9.437778,79,45,11.512774,25.168751,26.398056
23,id_00,2000-10-28,17.923574,79,45,18.038498,24.484796,26.425272
24,id_00,2000-10-29,26.754645,79,45,24.222859,24.211411,26.305563


This is useful if we want to inspect the data the model will be trained on. If we do this, we must manually train our models and add a local version of them to the `models_` attribute.

In [None]:
X, y = features_ddf.drop(columns=['unique_id', 'ds', 'y']), features_ddf['y']
model = XGBForecast(random_state=0).fit(X, y)
fcst.models_ = {'XGBForecast': model.model_}
fcst.predict(7)

In [None]:
#| hide
fcst.models_ = fcst2.models_
preds2 = fcst.predict(7).compute()
pd.testing.assert_frame_equal(preds, preds2)

#### Dynamic features
By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features like prices or a calendar with holidays you can pass them as a list to the `dynamic_dfs` argument of `DistributedMLForecast.predict`, which will call `pd.DataFrame.merge` on each of them in order.

Here's an example:

Suppose that we have a `product_id` column and we have a catalog for prices based on that `product_id` and the date.

In [None]:
dynamic_series = series.rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(dynamic_series)
prices_catalog

Unnamed: 0,ds,product_id,price
0,2000-06-09,1,0.548814
1,2000-06-10,1,0.715189
2,2000-06-11,1,0.602763
3,2000-06-12,1,0.544883
4,2000-06-13,1,0.423655
...,...,...,...
20180,2001-05-17,99,0.223520
20181,2001-05-18,99,0.446104
20182,2001-05-19,99,0.044783
20183,2001-05-20,99,0.483216


And you have already merged these prices into your series dataframe.

In [None]:
# Rename the static feature so the series share the `product_id` key with the prices catalog.
dynamic_series = partitioned_series.rename(columns={'static_1': 'product_id'})
# No `on=` is given, so the join uses the columns both frames have in common.
series_with_prices = dynamic_series.merge(prices_catalog, how='left')
series_with_prices.head()

Unnamed: 0,unique_id,ds,y,static_0,product_id,price
0,id_00,2000-10-05,3.981198,79,45,0.570826
1,id_00,2000-10-06,10.327401,79,45,0.260562
2,id_00,2000-10-07,17.657474,79,45,0.274048
3,id_00,2000-10-08,25.89879,79,45,0.433878
4,id_00,2000-10-09,34.49404,79,45,0.653738


This dataframe will be passed to `DistributedMLForecast.fit` (or `DistributedMLForecast.preprocess`), however since the price is dynamic we have to tell that method that only `static_0` and `product_id` are static and we'll have to update `price` in every timestep, which basically involves merging the updated features with the prices catalog.

In [None]:
fcst = DistributedMLForecast(
    models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,
)
# `price` is deliberately left out of `static_features`: it changes over time
# and has to be supplied again at predict time.
fcst.fit(
    series_with_prices,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=['static_0', 'product_id'],
)

So in order to update the price in each timestep we just call `DistributedMLForecast.predict` with our forecast horizon and pass the prices catalog as a dynamic dataframe.

In [None]:
preds = fcst.predict(7, dynamic_dfs=[prices_catalog])
preds.compute()

Unnamed: 0,unique_id,ds,XGBForecast,LGBMForecast
0,id_00,2001-05-15,42.223095,42.709877
1,id_00,2001-05-16,50.528976,49.982335
2,id_00,2001-05-17,2.072457,1.954048
3,id_00,2001-05-18,10.141087,10.343824
4,id_00,2001-05-19,18.437445,18.491415
...,...,...,...,...
72,id_99,2001-05-17,43.931427,44.330152
73,id_99,2001-05-18,2.001306,2.101588
74,id_99,2001-05-19,8.775194,9.288952
75,id_99,2001-05-20,15.422138,15.439463


In [None]:
#| hide
# test we can compute cross validation with
# exogenous variables without adding extra information
# later a more robust test is performed
cv_with_ex = fcst.cross_validation(
    series_with_prices,
    window_size=7,
    n_windows=2,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=['static_0', 'product_id'],
).compute()

#### Custom predictions

If you want to do something like scaling the predictions you can define a function and pass it to `DistributedMLForecast.predict` as described in <a href="/forecast.html#custom-predictions">Custom predictions</a>.

### Cross validation
Refer to `MLForecast.cross_validation`.

In [None]:
show_doc(DistributedMLForecast.cross_validation)

---

### DistributedMLForecast.cross_validation

>      DistributedMLForecast.cross_validation (data:~AnyDataFrame,
>                                              n_windows:int, window_size:int,
>                                              id_col:str, time_col:str,
>                                              target_col:str,
>                                              step_size:Optional[int]=None, sta
>                                              tic_features:Optional[List[str]]=
>                                              None, dropna:bool=True,
>                                              keep_last_n:Optional[int]=None, b
>                                              efore_predict_callback:Optional[C
>                                              allable]=None, after_predict_call
>                                              back:Optional[Callable]=None)

Perform time series cross validation.
Creates `n_windows` splits where each window has `window_size` test periods,
trains the models, computes the predictions and merges the actuals.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | AnyDataFrame |  | Series data in long format. |
| n_windows | int |  | Number of windows to evaluate. |
| window_size | int |  | Number of test periods in each window. |
| id_col | str |  | Column that identifies each serie. If 'index' then the index is used. |
| time_col | str |  | Column that identifies each timestep, its values can be timestamps or integers. |
| target_col | str |  | Column that contains the target. |
| step_size | Optional | None | Step size between each cross validation window. If None it will be equal to `window_size`. |
| static_features | Optional | None | Names of the features that are static and will be repeated when forecasting. |
| dropna | bool | True | Drop rows with missing values produced by the transformations. |
| keep_last_n | Optional | None | Keep only these many records from each serie for the forecasting step. Can save time and memory if your features allow it. |
| before_predict_callback | Optional | None | Function to call on the features before computing the predictions.<br>    This function will take the input dataframe that will be passed to the model for predicting and should return a dataframe with the same structure.<br>    The series identifier is on the index. |
| after_predict_callback | Optional | None | Function to call on the predictions before updating the targets.<br>    This function will take a pandas Series with the predictions and should return another one with the same structure.<br>    The series identifier is on the index. |
| **Returns** | **Iterator** |  | **Predictions for each window with the series id, timestamp, target value and predictions from each model.** |

In [None]:
n_windows = 2
window_size = 14

cv_results = fcst.cross_validation(
    partitioned_series,
    n_windows,
    window_size,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
)
cv_results

In [None]:
#| hide
cv_results_no_refit = fcst.cross_validation(
    partitioned_series,
    n_windows,
    window_size,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    refit=False
)
cv_results_df = cv_results.compute()
cv_results_no_refit_df = cv_results_no_refit.compute()
# test we recover the same "metadata"
models = ['XGBForecast', 'LGBMForecast']
test_eq(
    cv_results_no_refit_df.drop(columns=models),
    cv_results_df.drop(columns=models)
)

TypeError: DistributedMLForecast.cross_validation() got an unexpected keyword argument 'refit'

We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
# Drop the non-numeric id column and the cutoff marker when they are present
# (`errors='ignore'` skips missing labels instead of raising KeyError), so that
# only numeric columns are averaged per date.
agg_results = (
    cv_results.compute()
    .drop(columns=['unique_id', 'cutoff'], errors='ignore')
    .groupby('ds')
    .mean()
)
agg_results.head()

KeyError: "['cutoff'] not found in axis"

We can also compute the error for each model.

In [None]:
def mse_from_dask_dataframe(ddf):
    """Compute the mean squared error of every model column in `ddf`.

    Parameters
    ----------
    ddf : dask dataframe
        Cross validation results holding the actuals in the `y` column and
        one column of predictions per model.

    Returns
    -------
    dict
        Maps each model column name to its gathered MSE value.
    """
    # Every column that is not an identifier, timestamp, target or cutoff is
    # treated as a model. `errors='ignore'` keeps this working when some of
    # these columns (e.g. `cutoff`, or `unique_id` when the id is the index)
    # are absent from the results.
    non_model_cols = ['unique_id', 'ds', 'y', 'cutoff']
    model_names = ddf.columns.drop(non_model_cols, errors='ignore')
    # Build the lazy per-model reductions first, then evaluate them together
    # through the module-level dask `client`.
    mses = {
        model_name: (ddf['y'] - ddf[model_name]).pow(2).mean()
        for model_name in model_names
    }
    return client.gather(client.compute(mses))

{k: round(v, 2) for k, v in mse_from_dask_dataframe(cv_results).items()}

{'XGBForecast': 0.87, 'LGBMForecast': 0.9}

In [None]:
#|hide
def test_cross_validation(data=non_std_series, add_exogenous=False):
    """Check that `cross_validation` matches a manual backtest over the same splits.

    Parameters
    ----------
    data : dask dataframe
        Series using the non-standard column names `some_id`, `time` and `value`.
    add_exogenous : bool (default=False)
        If True, add a dynamic `ex1` column to `data` and feed its future
        values through `dynamic_dfs` in the manual predictions.

    Raises
    ------
    AssertionError
        If the cross validation results differ from the manual results.
    """
    n_windows = 2
    window_size = 14
    fcst = DistributedMLForecast(XGBForecast(random_state=0), lags=[7, 14])
    if add_exogenous:
        # map_partitions returns a new collection; it must be rebound,
        # otherwise `ex1` is never added and the exogenous path is not tested.
        data = data.map_partitions(lambda x: x.assign(ex1=lambda y: np.arange(0, len(y))))
    backtest_results = fcst.cross_validation(
        data,
        n_windows,
        window_size,
        id_col='some_id',
        time_col='time',
        target_col='value',
        static_features=['static_0', 'static_1'],
    ).compute()
    # Bring both sides to the standard column names before comparing.
    renamer = {'some_id': 'unique_id', 'time': 'ds', 'value': 'y'}
    backtest_results = backtest_results.rename(columns=renamer).set_index('unique_id')
    renamed = data.rename(columns=renamer).set_index('unique_id')
    cv_models = fcst.cv_models_
    manual_results = []
    for i, (cutoff, train, valid) in enumerate(backtest_splits(renamed, n_windows, window_size, 1)):
        # NOTE(review): the documented `preprocess` signature requires
        # id_col/time_col/target_col; confirm this call against the current API.
        fcst.preprocess(train)
        # Reuse the models fitted for this window so both paths predict with the same fit.
        fcst.models_ = cv_models[i]
        if add_exogenous:
            # Whatever is left after dropping the target and the static features
            # is passed as the future values of the dynamic features.
            dynamic_dfs = [valid.drop(columns=['y', 'static_0', 'static_1']).reset_index().compute()]
        else:
            dynamic_dfs = None
        pred = fcst.predict(window_size, dynamic_dfs=dynamic_dfs).compute()
        res = valid[['ds', 'y']].compute()
        res['cutoff'] = cutoff
        res = res.merge(pred, on=['unique_id', 'ds'], how='left')
        manual_results.append(res)
    manual_results = pd.concat(manual_results)
    pd.testing.assert_frame_equal(backtest_results, manual_results)
test_cross_validation()
test_cross_validation(add_exogenous=True)

In [None]:
client.close()

## Spark

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = (
    SparkSession.builder.appName("MyApp")
    .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:0.10.2")
    .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
    .getOrCreate()
)
from synapse.ml.lightgbm import LightGBMRegressor as SynapseLGBMRegressor

In [None]:
spark_series = spark.createDataFrame(series).repartitionByRange(4, 'unique_id')

In [None]:
from xgboost.spark import SparkXGBRegressor

In [None]:
fcst = DistributedMLForecast(
    [
        SynapseLGBMRegressor(),
        SparkXGBRegressor()
    ],
    freq='D',
    lags=[1],
    lag_transforms={
        1: [expanding_mean]
    },
    date_features=['dayofweek'],
)

In [None]:
fcst.fit(
    spark_series,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=['static_0', 'product_id'],
)
fcst.predict(14, dynamic_dfs=[prices]).toPandas()

[21:02:53] task 0 got new rank 0                                    (0 + 1) / 1]
                                                                                

Unnamed: 0,unique_id,ds,LightGBMRegressor,SparkXGBRegressor
0,id_00,2001-05-15,42.227353,42.170174
1,id_00,2001-05-16,49.678239,49.665058
2,id_00,2001-05-17,1.425428,2.123648
3,id_00,2001-05-18,10.021719,10.150534
4,id_00,2001-05-19,18.167309,18.219870
...,...,...,...,...
1395,id_99,2001-05-24,43.879492,43.089096
1396,id_99,2001-05-25,1.592245,1.300191
1397,id_99,2001-05-26,8.765065,8.563540
1398,id_99,2001-05-27,15.720301,15.627042


In [None]:
cv_res = fcst.cross_validation(spark_series, n_windows=2, window_size=14, id_col='unique_id', time_col='ds', target_col='y')
res1 = next(cv_res)
res1.toPandas()

[21:03:12] task 0 got new rank 0                                    (0 + 1) / 1]
                                                                                

Unnamed: 0,unique_id,ds,LightGBMRegressor,SparkXGBRegressor
0,id_00,2001-04-17,41.397289,41.412704
1,id_00,2001-04-18,50.008758,49.848236
2,id_00,2001-04-19,1.868972,1.531837
3,id_00,2001-04-20,10.266771,9.750021
4,id_00,2001-04-21,18.296489,17.535915
...,...,...,...,...
1395,id_99,2001-04-26,43.910674,43.634216
1396,id_99,2001-04-27,1.955158,1.962174
1397,id_99,2001-04-28,8.864172,8.601549
1398,id_99,2001-04-29,15.983674,15.883894


In [None]:
res2 = next(cv_res)
res2.toPandas()

[21:03:31] task 0 got new rank 0                                    (0 + 1) / 1]
                                                                                

Unnamed: 0,unique_id,ds,LightGBMRegressor,SparkXGBRegressor
0,id_00,2001-05-01,41.510874,41.991829
1,id_00,2001-05-02,49.580176,49.946548
2,id_00,2001-05-03,1.699062,1.733775
3,id_00,2001-05-04,10.143278,9.953071
4,id_00,2001-05-05,18.001024,17.789112
...,...,...,...,...
1395,id_99,2001-05-10,43.846853,43.703159
1396,id_99,2001-05-11,1.832246,1.999386
1397,id_99,2001-05-12,8.805302,8.618651
1398,id_99,2001-05-13,15.946444,15.728466


## Dask

In [None]:
from dask.distributed import Client

from mlforecast.distributed.models.lgb import LGBMForecast
from mlforecast.distributed.models.xgb import XGBForecast

In [None]:
client = Client(n_workers=2)

In [None]:
dask_series = (
    dd
    .from_pandas(series.set_index('unique_id'), npartitions=4)  # make sure we split by the series identifier
    .map_partitions(lambda df: df.reset_index())
)

In [None]:
fcst = DistributedMLForecast(
    [LGBMForecast(), XGBForecast()],
    freq='D',
    lags=[1],
    lag_transforms={
        1: [expanding_mean]
    },
    date_features=['dayofweek'],
    engine=client,
)
_ = fcst.fit(dask_series, id_col='unique_id', time_col='ds', target_col='y', static_features=['static_0', 'product_id'])

In [None]:
fcst.predict(14, dynamic_dfs=[prices]).compute()

Unnamed: 0,unique_id,ds,LGBMForecast,XGBForecast
0,id_00,2001-05-15,42.328150,42.544678
1,id_00,2001-05-16,49.966854,50.006393
2,id_00,2001-05-17,1.614102,2.404574
3,id_00,2001-05-18,10.246828,10.144515
4,id_00,2001-05-19,18.183167,17.768204
...,...,...,...,...
345,id_99,2001-05-24,42.986614,43.913208
346,id_99,2001-05-25,1.536043,1.769517
347,id_99,2001-05-26,8.651628,8.689207
348,id_99,2001-05-27,15.467835,15.858318


In [None]:
cv_res = fcst.cross_validation(dask_series, n_windows=2, window_size=14, id_col='unique_id', time_col='ds', target_col='y')
res1 = next(cv_res)

In [None]:
res1.compute()

Unnamed: 0,unique_id,ds,LGBMForecast,XGBForecast
0,id_00,2001-04-17,41.101257,41.509098
1,id_00,2001-04-18,49.439778,49.994793
2,id_00,2001-04-19,2.209694,1.885389
3,id_00,2001-04-20,10.016894,9.791873
4,id_00,2001-04-21,18.006730,17.518440
...,...,...,...,...
345,id_99,2001-04-26,44.320700,42.248009
346,id_99,2001-04-27,2.230125,2.526252
347,id_99,2001-04-28,8.579381,8.485373
348,id_99,2001-04-29,15.496969,15.847349


In [None]:
res2 = next(cv_res)

In [None]:
res2.compute()

Unnamed: 0,unique_id,ds,LGBMForecast,XGBForecast
0,id_00,2001-05-01,42.610164,41.107208
1,id_00,2001-05-02,50.179124,50.424072
2,id_00,2001-05-03,1.690276,1.991987
3,id_00,2001-05-04,10.159849,9.895548
4,id_00,2001-05-05,18.321141,17.538578
...,...,...,...,...
345,id_99,2001-05-10,42.920872,43.232620
346,id_99,2001-05-11,1.932801,1.821584
347,id_99,2001-05-12,8.724589,8.611847
348,id_99,2001-05-13,15.355793,15.651299
