In [None]:
#|default_exp distributed.forecast

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

In [None]:
#|hide
import warnings

from nbdev import show_doc
from sklearn import set_config

In [None]:
#|hide
# Show scikit-learn estimators with their plain-text repr.
set_config(display='text')
# Silence every warning raised from this point on.
warnings.filterwarnings('ignore')

# Distributed forecast

> Distributed pipeline encapsulation

**This interface is only tested on Linux**

In [None]:
#|export
from typing import Callable, Dict, List, Optional, Tuple

import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client, default_client

from mlforecast.core import TimeSeries
from mlforecast.forecast import Forecast
from mlforecast.distributed.core import DistributedTimeSeries

In [None]:
#|export
class DistributedForecast(Forecast):
    """Distributed pipeline encapsulation.

    Holds a `DistributedTimeSeries` that computes the features on each
    partition and a list of distributed models that are trained on them.
    """
    
    def __init__(
        self,
        models,  # model or list of mlforecast.distributed.models
        freq: str,  # pandas offset alias, e.g. D, W, M
        lags: Optional[List[int]] = None,  # list of lags to use as features
        lag_transforms: Optional[Dict[int, List[Tuple]]] = None,  # list of transformations to apply to each lag
        date_features: Optional[List[str]] = None,  # list of names of pandas date attributes to use as features, e.g. dayofweek
        num_threads: int = 1,  # number of threads to use when computing lag features
        client: Optional[Client] = None  # dask client to use for computations
    ):
        # `None` stands for "no features of this kind". Building the empty
        # containers here gives every instance its own objects instead of
        # sharing one mutable default across all calls.
        lags = [] if lags is None else lags
        lag_transforms = {} if lag_transforms is None else lag_transforms
        date_features = [] if date_features is None else date_features

        # A single model is accepted and wrapped into a one-element list.
        if not isinstance(models, list):
            models = [models]
        self.models = models
        # Fall back to the default dask client when none is given.
        self.client = client or default_client()
        self.dts = DistributedTimeSeries(
            TimeSeries(freq, lags, lag_transforms, date_features, num_threads),
            self.client,
        )
        # Every model is pointed at the same client as the feature pipeline.
        for model in self.models:
            model.client = self.client
        
    def __repr__(self) -> str:
        """Summarize the models, frequency, features, threads and client."""
        return (
            f'DistributedForecast(models=[{", ".join(m.__class__.__name__ for m in self.models)}], '
            f'freq={self.freq}, '
            f'lag_features={list(self.dts._base_ts.transforms.keys())}, '
            f'date_features={self.dts._base_ts.date_features}, '
            f'num_threads={self.dts._base_ts.num_threads}, '
            f'client={self.client})'
        )
    
    @property
    def freq(self):
        """Frequency stored in the underlying `TimeSeries`."""
        return self.dts._base_ts.freq

    def preprocess(
        self,
        data: dd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,        
    ) -> dd.DataFrame:
        """Computes the transformations on each partition of `data` and
        saves the required information for the forecasting step.
        Returns a dask dataframe with the computed features."""
        return self.dts.fit_transform(data, static_features, dropna, keep_last_n)

    def fit(
        self,
        data: dd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None) -> 'DistributedForecast':
        """Perform the preprocessing and fit the model.

        Every model in `self.models` is fitted on the same features.
        Returns `self`."""
        train_ddf = self.preprocess(data, static_features, dropna, keep_last_n)
        # The features are every column except the timestamp and the target.
        X, y = train_ddf.drop(columns=['ds', 'y']), train_ddf.y
        for model in self.models:
            model.fit(X, y)
        return self

    def predict(
        self,
        horizon: int,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        predict_fn: Optional[Callable] = None,
        **predict_fn_kwargs,
    ) -> dd.DataFrame:
        # The trained model of each wrapper (`model_`) is handed to the
        # distributed time series, which computes the predictions.
        return self.dts.predict(
            [m.model_ for m in self.models], horizon, dynamic_dfs, predict_fn, **predict_fn_kwargs
        )
    
    # Reuse the docstring of `Forecast.predict`.
    predict.__doc__ = Forecast.predict.__doc__

The `DistributedForecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing predictions) and applies them in a distributed way.

## Example
This shows an example with simulated data. For a real-world example on a remote cluster you can check the [M5 distributed example](https://www.kaggle.com/lemuz90/m5-mlforecast-distributed).

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

from mlforecast.utils import backtest_splits, generate_daily_series, generate_prices_for_series
from mlforecast.distributed.models.lgb import LGBMForecast
from mlforecast.distributed.models.xgb import XGBForecast

The different things that you need to use `DistributedForecast` (as opposed to `Forecast`) are:
1. You need to set up a `dask.distributed.Client`. If this client is connected to a remote cluster then the process will run there.
2. Your data needs to be a `dask.dataframe.DataFrame`.
3. You need to use a model that implements distributed training.

### Client setup

Here we define a client that connects to a `dask.distributed.LocalCluster`, however it could be any other kind of cluster.

In [None]:
# Local cluster with 2 workers of one thread each; this becomes the default client.
client = Client(n_workers=2, threads_per_worker=1)

### Data setup

The data is given as a `dask.dataframe.DataFrame`. You need to make sure that each time series is only in one partition, and it is recommended that you have as many partitions as you have workers. If you have more partitions than workers make sure to set `num_threads=1` in `TimeSeries` to avoid having nested parallelism.

The required input format is the same as for `Forecast`, except that it's a `dask.dataframe.DataFrame` instead of a `pandas.DataFrame`.

In [None]:
# 100 synthetic daily series with 2 static features that are not categorical.
series = generate_daily_series(100, n_static_features=2, equal_ends=True, static_as_categorical=False)
# Convert the pandas dataframe to a dask dataframe with 2 partitions (one per worker).
partitioned_series = dd.from_pandas(series, npartitions=2)
partitioned_series

Unnamed: 0_level_0,ds,y,static_0,static_1
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id_00,datetime64[ns],float64,int64,int64
id_50,...,...,...,...
id_99,...,...,...,...


### Models
In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `dask`. The current implementations are in `LGBMForecast` and `XGBForecast` which are just wrappers around `lightgbm.dask.DaskLGBMRegressor` and `xgboost.dask.DaskXGBRegressor` that add a `model_` property to get the trained model from them and send it to every worker to perform the predictions step.

In [None]:
# One LightGBM-based and one XGBoost-based distributed model, both with a fixed seed.
models = [LGBMForecast(random_state=0), XGBForecast(random_state=0)]

### Training
Once we have our model and time series we instantiate a `DistributedForecast` with them.

In [None]:
# No `client` is passed, so the default dask client is used.
fcst = DistributedForecast(
    models=models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]  # tuple form: the function followed by its window size
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,
)
fcst

DistributedForecast(models=[LGBMForecast, XGBForecast], freq=<Day>, lag_features=['lag-7', 'expanding_mean_lag-1', 'rolling_mean_lag-7_window_size-14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:43145' processes=2 threads=2, memory=15.50 GiB>)

Here we say that:

* Our series have daily frequency.
* We want to use lag 7 as a feature
* We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 14
* We want to use dayofweek and month as date features.
* We want to perform the preprocessing and the forecasting steps using 1 thread, because we have 2 partitions and 2 workers with one thread each.

From this point we have two options:

1. Preprocess the data and fit our models using all of it.
2. Preprocess the data and get it back as a dataframe to do some custom splitting or add additional features, and then train the models.

#### 1. Using all the data

In [None]:
# Render the documentation of `DistributedForecast.fit`.
show_doc(DistributedForecast.fit)

---

### DistributedForecast.fit

>      DistributedForecast.fit (data:dask.dataframe.core.DataFrame,
>                               static_features:Optional[List[str]]=None,
>                               dropna:bool=True,
>                               keep_last_n:Optional[int]=None)

Perform the preprocessing and fit the model.

Calling `.fit` on our data computes the features independently for each partition and performs distributed training.

In [None]:
# Preprocess the partitioned series and fit both models on the result.
fcst.fit(partitioned_series)

Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 57043...
[LightGBM] [Info] Binding port 57043 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 44049...
[LightGBM] [Info] Binding port 44049 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Local rank: 1, total number of machines: 2


[21:09:29] task [xgboost.dask]:tcp://127.0.0.1:44639 got new rank 0
[21:09:29] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 1


DistributedForecast(models=[LGBMForecast, XGBForecast], freq=<Day>, lag_features=['lag-7', 'expanding_mean_lag-1', 'rolling_mean_lag-7_window_size-14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:43145' processes=2 threads=2, memory=15.50 GiB>)

### Forecasting

In [None]:
# Render the documentation of `DistributedForecast.predict`.
show_doc(DistributedForecast.predict)

---

### DistributedForecast.predict

>      DistributedForecast.predict (horizon:int,
>                                   dynamic_dfs:Optional[List[pandas.core.frame.
>                                   DataFrame]]=None,
>                                   predict_fn:Optional[Callable]=None,
>                                   **predict_fn_kwargs)

Compute the predictions for the next `horizon` steps.

`predict_fn(model, new_x, dynamic_dfs, features_order, **predict_fn_kwargs)` is called in every timestep, where:
`model` is the trained model.
`new_x` is a dataframe with the same format as the input plus the computed features.
`dynamic_dfs` is a list containing the dynamic dataframes.
`features_order` is the list of column names that were used in the training step.

Once we have our fitted model we can compute the predictions for the next 7 timesteps.

In [None]:
# Forecast the next 7 timesteps; the result is a dask dataframe.
preds = fcst.predict(7)
preds

Unnamed: 0_level_0,ds,LGBMRegressor,XGBRegressor
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id_00,datetime64[ns],float64,float32
id_50,...,...,...
id_99,...,...,...


In [None]:
#|hide
# Predicting twice with the same fitted forecaster must return identical frames.
preds = preds.compute()
preds2 = fcst.predict(7).compute()
pd.testing.assert_frame_equal(preds, preds2)

#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `.preprocess` on our data.

In [None]:
# Render the documentation of `DistributedForecast.preprocess`.
show_doc(DistributedForecast.preprocess)

---

### DistributedForecast.preprocess

>      DistributedForecast.preprocess (data:dask.dataframe.core.DataFrame,
>                                      static_features:Optional[List[str]]=None,
>                                      dropna:bool=True,
>                                      keep_last_n:Optional[int]=None)

Computes the transformations on each partition of `data` and
saves the required information for the forecasting step.
Returns a dask dataframe with the computed features.

In [None]:
# Only compute the features; the models are not trained here.
features_ddf = fcst.preprocess(partitioned_series)
features_ddf.head()

Unnamed: 0_level_0,ds,y,static_0,static_1,lag-7,expanding_mean_lag-1,rolling_mean_lag-7_window_size-14,dayofweek,month
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
id_00,2000-10-25,497.668437,79,45,506.946385,250.013666,263.200596,2,10
id_00,2000-10-26,39.183469,79,45,38.8778,261.80675,263.133868,3,10
id_00,2000-10-27,94.377779,79,45,115.127739,251.68751,263.980563,4,10
id_00,2000-10-28,179.235741,79,45,180.384975,244.847957,264.252723,5,10
id_00,2000-10-29,267.546447,79,45,242.228588,242.114114,263.055629,6,10


This is useful if we want to inspect the data the model will be trained on, add additional features or perform some custom train-valid split. Here we perform an 80-20 split.

In [None]:
rng = np.random.RandomState(0)

def mask_as_series(df):
    """Draw one uniform number per row of `df` and flag the rows below 0.8.

    Returns a boolean `pd.Series` that shares the index of `df`.
    The numbers come from the module-level `rng`.
    """
    n_rows = len(df)
    below_threshold = rng.rand(n_rows) < 0.8
    return pd.Series(below_threshold, index=df.index)

# Build the boolean mask partition by partition, then split rows and columns.
train_mask = features_ddf.map_partitions(mask_as_series)
train = features_ddf[train_mask]
valid = features_ddf[~train_mask]
# Features are every column except the timestamp and the target.
X_train = train.drop(columns=['ds', 'y'])
y_train = train.y
X_valid = valid.drop(columns=['ds', 'y'])
y_valid = valid.y

If we do this we must "manually" train our models by calling `fit` on the ones in `DistributedForecast.models`. Here we only train the second one, the `XGBForecast`.

In [None]:
# Train only the second model (the `XGBForecast`), evaluating RMSE on both splits.
models[1].fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='rmse',
    verbose=False,
)

[21:09:31] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 0
[21:09:31] task [xgboost.dask]:tcp://127.0.0.1:44639 got new rank 1


XGBForecast(base_score=0.5, booster='gbtree', colsample_bylevel=1,
            colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
            grow_policy='depthwise', interaction_constraints='',
            learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
            max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
            monotone_constraints='()', n_jobs=1, num_parallel_tree=1,
            objective='reg:squarederror', predictor='auto', random_state=0,
            reg_alpha=0, reg_lambda=1, sampling_method='uniform',
            scale_pos_weight=1, subsample=1, tree_method='approx',
            validate_parameters=1)

We can see the RMSE by iteration for each set.

In [None]:
# `fcst.models[1]` is the same object as `models[1]` trained in the previous cell.
# NOTE(review): presumably xgboost names the sets by their position in `eval_set`,
# making `validation_0` the training set and `validation_1` the validation set — confirm.
pd.DataFrame({
    k: np.round(fcst.models[1].evals_result_[k]['rmse'], 2)
    for k in ('validation_0', 'validation_1')
})

Unnamed: 0,validation_0,validation_1
0,159.97,155.57
1,112.40,109.38
2,79.17,77.09
3,56.04,54.63
4,40.01,39.09
...,...,...
95,7.58,9.64
96,7.56,9.64
97,7.56,9.64
98,7.55,9.64


#### Dynamic features

By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features like prices or a calendar with holidays you can pass them as a list to the `dynamic_dfs` argument of `DistributedForecast.predict`, which will call `pd.DataFrame.merge` on each of them in order.

Here's an example:

Suppose that we have a `product_id` column and we have a catalog for prices based on that `product_id` and the date.

In [None]:
# Treat `static_1` as the product identifier and generate the prices from that frame.
dynamic_series = series.rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(dynamic_series)
prices_catalog

Unnamed: 0,ds,product_id,price
0,2000-06-09,1,0.548814
1,2000-06-10,1,0.715189
2,2000-06-11,1,0.602763
3,2000-06-12,1,0.544883
4,2000-06-13,1,0.423655
...,...,...,...
20180,2001-05-17,99,0.223520
20181,2001-05-18,99,0.446104
20182,2001-05-19,99,0.044783
20183,2001-05-20,99,0.483216


And you have already merged these prices into your series dataframe.

In [None]:
# From here on `dynamic_series` is the dask version of the renamed series.
dynamic_series = partitioned_series.rename(columns={'static_1': 'product_id'})
# Turn the index into a regular column so it can be restored after the merge.
dynamic_series = dynamic_series.reset_index()
# No `on` is given, so the merge uses the columns that both frames share.
series_with_prices = dynamic_series.merge(prices_catalog, how='left')
# NOTE(review): `sorted=True` declares that `unique_id` is already sorted — confirm the merge keeps that order.
series_with_prices = series_with_prices.set_index('unique_id', sorted=True)
series_with_prices.head()

Unnamed: 0_level_0,ds,y,static_0,product_id,price
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_00,2000-10-05,39.811983,79,45,0.570826
id_00,2000-10-06,103.274013,79,45,0.260562
id_00,2000-10-07,176.574744,79,45,0.274048
id_00,2000-10-08,258.9879,79,45,0.433878
id_00,2000-10-09,344.940404,79,45,0.653738


This dataframe will be passed to `DistributedForecast.fit` (or `DistributedForecast.preprocess`), however since the price is dynamic we have to tell that method that only `static_0` and `product_id` are static and we'll have to update `price` in every timestep, which basically involves merging the updated features with the prices catalog.

In [None]:
# New forecaster with the same configuration as before; it reuses the same `models` list.
fcst = DistributedForecast(
    models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,
)
# `price` is left out of `static_features` so that it is treated as a dynamic feature.
fcst.fit(series_with_prices, static_features=['static_0', 'product_id'])

Finding random open ports for workers


[21:09:32] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 0


DistributedForecast(models=[LGBMForecast, XGBForecast], freq=<Day>, lag_features=['lag-7', 'expanding_mean_lag-1', 'rolling_mean_lag-7_window_size-14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:43145' processes=2 threads=2, memory=15.50 GiB>)

So in order to update the price in each timestep we just call `DistributedForecast.predict` with our forecast horizon and pass the prices catalog as a dynamic dataframe.

In [None]:
# The prices catalog is passed as a dynamic dataframe for the prediction steps.
preds = fcst.predict(7, dynamic_dfs=[prices_catalog])
preds.compute()

Unnamed: 0_level_0,ds,LGBMRegressor,XGBRegressor
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id_00,2001-05-15,425.271062,423.707642
id_00,2001-05-16,499.367199,501.180878
id_00,2001-05-17,19.680575,13.227302
id_00,2001-05-18,103.003798,98.732613
id_00,2001-05-19,186.687089,179.789963
...,...,...,...
id_99,2001-05-17,441.140547,437.574402
id_99,2001-05-18,19.506097,18.790911
id_99,2001-05-19,91.141189,87.097137
id_99,2001-05-20,151.241891,154.764557


#### Custom predictions

If you want to do something like scaling the predictions you can define a function and pass it to `DistributedForecast.predict` as described in <a href="/mlforecast/forecast.html#Custom-predictions">Custom predictions</a>.

### Cross validation
Refer to `Forecast.cross_validation`.

In [None]:
# `cross_validation` is inherited from `Forecast`, so the parent's documentation is rendered.
show_doc(DistributedForecast.cross_validation)

---

### Forecast.cross_validation

>      Forecast.cross_validation (data, n_windows:int, window_size:int,
>                                 static_features:Optional[List[str]]=None,
>                                 dropna:bool=True,
>                                 keep_last_n:Optional[int]=None, dynamic_dfs:Op
>                                 tional[List[pandas.core.frame.DataFrame]]=None
>                                 , predict_fn:Optional[Callable]=None,
>                                 **predict_fn_kwargs)

Creates `n_windows` splits of `window_size` from `data`, trains the model
on the training set, predicts the window and merges the actuals and the predictions
in a dataframe.

Returns a dataframe containing the datestamps, actual values, train ends and predictions.

In [None]:
# Two validation windows of 14 timesteps each.
n_windows = 2
window_size = 14

# The models are trained once per window; the result is a dask dataframe.
backtest_results = fcst.cross_validation(partitioned_series, n_windows, window_size)
backtest_results

Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 34615...
[LightGBM] [Info] Binding port 34615 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 50139...
[LightGBM] [Info] Binding port 50139 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 1, total number of machines: 2


[21:09:34] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 0
[21:09:34] task [xgboost.dask]:tcp://127.0.0.1:44639 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 36165...
[LightGBM] [Info] Binding port 36165 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 45305...
[LightGBM] [Info] Binding port 45305 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 1, total number of machines: 2


[21:09:35] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 0
[21:09:35] task [xgboost.dask]:tcp://127.0.0.1:44639 got new rank 1


Unnamed: 0_level_0,ds,y,cutoff,LGBMRegressor,XGBRegressor
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],float64,datetime64[ns],float64,float32
,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
# Drop the datetime `cutoff` column explicitly before averaging: whether
# `groupby().mean()` discards non-numeric columns depends on the pandas version.
agg_results = backtest_results.compute().drop(columns='cutoff').groupby('ds').mean()
agg_results.head()

Unnamed: 0_level_0,y,LGBMRegressor,XGBRegressor
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-04-17,161.232312,162.078552,162.149048
2001-04-18,152.139197,151.35428,151.376801
2001-04-19,169.856989,171.306728,171.695221
2001-04-20,180.683402,180.020983,180.697113
2001-04-21,182.00609,181.230938,181.487701


We can include some more context by using the values in the training set.

In [None]:
# Keep only the observations that precede the first date in the backtest results.
history = series[series.ds < agg_results.index.min()]
# Aggregate with the mean, as `agg_results` does, so both parts of the
# concatenated `y` column are on the same scale.
agg_history = history.groupby('ds')[['y']].mean().tail(50)
pd.concat([agg_history, agg_results])

Unnamed: 0_level_0,y,LGBMRegressor,XGBRegressor
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-02-26,15560.913051,,
2001-02-27,15849.105590,,
2001-02-28,14695.554047,,
2001-03-01,17110.908354,,
2001-03-02,17845.327523,,
...,...,...,...
2001-05-10,171.394991,171.152135,171.039734
2001-05-11,180.413157,180.644497,180.548401
2001-05-12,181.652667,180.605739,180.978073
2001-05-13,168.138961,169.138480,168.937668


We can also compute the error for each model.

In [None]:
def mse_from_dask_dataframe(ddf):
    """Mean squared error of every prediction column of `ddf` against `y`.

    Every column other than `ds`, `y` and `cutoff` is treated as a model.
    The values are computed through the module-level `client` and returned
    in a dict keyed by column name.
    """
    model_names = ddf.columns.drop(['ds', 'y', 'cutoff'])
    lazy_mses = {
        name: (ddf['y'] - ddf[name]).pow(2).mean()
        for name in model_names
    }
    return client.gather(client.compute(lazy_mses))

{model: round(mse, 2) for model, mse in mse_from_dask_dataframe(backtest_results).items()}

{'LGBMRegressor': 92.11, 'XGBRegressor': 88.83}

In [None]:
#|hide
# Check that `cross_validation` gives the same result as fitting and predicting on each split by hand.
fcst = DistributedForecast(XGBForecast(random_state=0), freq='D', lags=[7, 14])
# Persist the input on the cluster; both the cross validation and the manual loop use it.
persisted_series = client.persist(partitioned_series)
backtest_results = fcst.cross_validation(persisted_series, n_windows, window_size).compute()
manual_results = []
# Note: this loop rebinds the notebook-level names `train` and `valid`.
for cutoff, train, valid in backtest_splits(persisted_series, n_windows, window_size, fcst.freq):
    fcst.fit(train)
    pred = fcst.predict(window_size).compute()
    res = valid[['ds', 'y']].compute()
    res['cutoff'] = cutoff
    # Left merge keeps every actual value of the window.
    manual_results.append(res.merge(pred, on=['unique_id', 'ds'], how='left'))
manual_results = pd.concat(manual_results)
pd.testing.assert_frame_equal(backtest_results, manual_results)

[21:09:36] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 0
[21:09:36] task [xgboost.dask]:tcp://127.0.0.1:44639 got new rank 1
[21:09:37] task [xgboost.dask]:tcp://127.0.0.1:44639 got new rank 0
[21:09:37] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 1
[21:09:38] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 0
[21:09:38] task [xgboost.dask]:tcp://127.0.0.1:44639 got new rank 1
[21:09:38] task [xgboost.dask]:tcp://127.0.0.1:33681 got new rank 0
[21:09:38] task [xgboost.dask]:tcp://127.0.0.1:44639 got new rank 1


In [None]:
# Close the dask client created at the start of the example.
client.close()