In [None]:
#all_distributed
#default_exp distributed.forecast

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
import warnings

from nbdev import show_doc

warnings.simplefilter('ignore', UserWarning)

# Distributed forecast

> Distributed pipeline encapsulation

**This interface is only available on Linux**

In [None]:
#export
from typing import Callable, Generator, List, Optional

import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client, default_client

from mlforecast.core import TimeSeries, simple_predict
from mlforecast.utils import backtest_splits
from mlforecast.distributed.core import DistributedTimeSeries


In [None]:
#export
class DistributedForecast:
    """Encapsulates the distributed pipeline: preprocessing, training and predicting."""

    def __init__(self, model, ts: TimeSeries, client: Optional[Client] = None):
        # Use the provided client if there is one, otherwise dask's default client.
        active_client = client if client else default_client()
        self.model = model
        self.client = active_client
        self.dts = DistributedTimeSeries(ts, active_client)
        # The model is handed the same client that the time series wrapper uses.
        self.model.client = active_client

    def __repr__(self) -> str:
        model, dts = self.model, self.dts
        return f'DistributedForecast(model={model}, dts={dts})'

    def preprocess(
        self,
        data: dd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> dd.DataFrame:
        """Applies the transformations to every partition of `data`.

        The resulting `TimeSeries` objects and the divisions of `data` are kept
        for the forecasting step. Returns a dask dataframe holding the computed features."""
        features_ddf = self.dts.fit_transform(
            data, static_features, dropna, keep_last_n
        )
        return features_ddf

    def fit(
        self,
        data: dd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        **fit_kwargs,
    ) -> 'DistributedForecast':
        """Runs the preprocessing on `data` and trains the model on the result.

        `fit_kwargs` are forwarded to the model's `fit` method. Returns `self`."""
        train_ddf = self.preprocess(data, static_features, dropna, keep_last_n)
        # Every column except the datestamp and the target is used as a feature.
        features = train_ddf.drop(columns=['ds', 'y'])
        target = train_ddf.y
        self.model.fit(features, target, **fit_kwargs)
        return self

    def predict(
        self,
        horizon: int,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        predict_fn: Optional[Callable] = None,
        **predict_fn_kwargs,
    ) -> dd.DataFrame:
        """Computes the predictions for the following `horizon` steps.

        In every timestep `predict_fn(model, new_x, features_order, **predict_fn_kwargs)` is called, where:
        `model` is the trained model.
        `new_x` is a dataframe with the same format as the input plus the computed features.
        `features_order` is the list of column names that were used in the training step.
        """
        trained_model = self.model.model_
        return self.dts.predict(
            trained_model, horizon, dynamic_dfs, predict_fn, **predict_fn_kwargs
        )

    def backtest(
        self,
        data: dd.DataFrame,
        n_windows: int,
        window_size: int,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        predict_fn: Callable = simple_predict,
        **predict_fn_kwargs,
    ) -> Generator[dd.DataFrame, None, None]:
        """Splits `data` in `n_windows` windows of `window_size`, and for each one
        trains the model on the training part, predicts the window and merges
        the actual values with the predictions.

        Returns a generator over the dataframes with the datestamps, actual values
        and predictions. Note that the model is refitted on every window."""
        for train_part, valid_part in backtest_splits(data, n_windows, window_size):
            self.fit(train_part, static_features, dropna, keep_last_n)
            window_preds = self.predict(
                window_size, dynamic_dfs, predict_fn, **predict_fn_kwargs
            )
            actuals = valid_part[['ds', 'y']]
            # Left join keeps every actual value, even if it has no prediction.
            yield actuals.merge(window_preds, on=['unique_id', 'ds'], how='left')


The `DistributedForecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing predictions) and applies them in a distributed way.

## Example
This shows an example with simulated data, for a real world example in a remote cluster you can check the [M5 distributed example](https://www.kaggle.com/lemuz90/m5-mlforecast-distributed).

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

from mlforecast.utils import generate_daily_series, generate_prices_for_series
from mlforecast.distributed.models.lgb import LGBMForecast
from mlforecast.distributed.models.xgb import XGBForecast

The different things that you need to use `DistributedForecast` (as opposed to `Forecast`) are:
1. You need to set up a `dask.distributed.Client`. If this client is connected to a remote cluster then the process will run there.
2. Your data needs to be a `dask.dataframe.DataFrame`.
3. You need to use a model that implements distributed training.

### Client setup

Here we define a client that connects to a `dask.distributed.LocalCluster`, however it could be any other kind of cluster.

In [None]:
client = Client(n_workers=2)

### Data setup

The data is given as a `dask.dataframe.DataFrame`. You need to make sure that each time series is contained in a single partition, and it is recommended that you have as many partitions as you have workers. If you have more partitions than workers, make sure to set `num_threads=1` in `TimeSeries` to avoid nested parallelism.

The required input format is the same as for `Forecast`, except that it's a `dask.dataframe.DataFrame` instead of a `pandas.DataFrame`.

In [None]:
series = generate_daily_series(100, n_static_features=2, equal_ends=True)
for col in series.select_dtypes(include='category'):
    series[col] = series[col].cat.codes  # encode categories for xgboost
partitioned_series = dd.from_pandas(series, npartitions=10)
partitioned_series

### Model
In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `dask`. The current implementations are in `LGBMForecast` and `XGBForecast` which are just wrappers around `lightgbm.dask.DaskLGBMRegressor` and `xgboost.dask.DaskXGBRegressor` that add a `model_` property to get the trained model from them and send it to every worker to perform the predictions step.

In [None]:
model = XGBForecast()

### TimeSeries

As in the local version (`Forecast`) a `TimeSeries` object is required.

In [None]:
ts = TimeSeries(
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,    
)
ts

Here we say that:
* Our series have daily frequency.
* We want to use lag 7 as a feature
* We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 14
* We want to use dayofweek and month as date features.
* We want to perform the preprocessing and the forecasting steps using 1 thread, because we have 10 partitions and 2 workers.

### Training
Once we have our model and time series we instantiate a `DistributedForecast` with them.

In [None]:
fcst = DistributedForecast(model, ts)

From this point we have two options:

1. Preprocess the data and fit our model using all of it.
2. Preprocess the data and get it back as a dataframe to do some custom splitting or add additional features, and then train the model.

#### 1. Using all the data

In [None]:
show_doc(DistributedForecast.fit)

Calling `.fit` on our data computes the features independently for each partition and performs distributed training.

In [None]:
fcst.fit(partitioned_series)

#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `.preprocess` on our data.

In [None]:
show_doc(DistributedForecast.preprocess)

In [None]:
features_ddf = fcst.preprocess(partitioned_series)
features_ddf.head()

This is useful if we want to inspect the data the model will be trained on, add additional features or perform a custom train-valid split. Here we perform an 80-20 split.

In [None]:
def mask_as_series(df, partition_info=None, train_frac=0.8, seed=0):
    """Build a reproducible boolean train mask for one partition.

    The random generator is created inside the function and seeded from `seed`
    and the partition number. Because dask evaluates this lazily on the workers,
    a shared module-level generator would make the mask depend on scheduling
    and on how many times a partition is recomputed, so the train and valid
    sets would not be guaranteed to be complementary.

    Parameters
    ----------
    df : pandas.DataFrame
        A single partition.
    partition_info : dict, optional
        Filled in by `dask.dataframe.map_partitions` for functions that accept
        this keyword. Only its `'number'` entry is used. When it is missing the
        partition number is taken as 0.
    train_frac : float
        Expected fraction of rows marked as training rows.
    seed : int
        Base seed, must be in [0, 2**32 - 1].

    Returns
    -------
    pandas.Series
        Boolean series aligned with `df.index`; True marks a training row.
    """
    partition_number = partition_info['number'] if partition_info else 0
    # The modulo keeps the seed entry valid for RandomState, which rejects
    # negative values (e.g. a placeholder partition number).
    partition_rng = np.random.RandomState([seed, partition_number % 2**32])
    return pd.Series(partition_rng.rand(df.shape[0]) < train_frac, index=df.index)

train_mask = features_ddf.map_partitions(mask_as_series)
train, valid = features_ddf[train_mask], features_ddf[~train_mask]
X_train, y_train = train.drop(columns=['ds', 'y']), train.y
X_valid, y_valid = valid.drop(columns=['ds', 'y']), valid.y

If we do this we must "manually" train our model calling `DistributedForecast.model.fit`.

In [None]:
fcst.model.fit(X_train, y_train,
               eval_set=[(X_train, y_train), (X_valid, y_valid)],
               verbose=0)

In [None]:
for lab in fcst.model.evals_result_.keys():
    plt.plot(fcst.model.evals_result_[lab]['rmse'], label=lab)
plt.legend();

### Forecasting

In [None]:
show_doc(DistributedForecast.predict)

Once we have our fitted model we can compute the predictions for the next 7 timesteps.

In [None]:
preds = fcst.predict(7)
preds

In [None]:
#hide
preds = preds.compute()
preds2 = fcst.predict(7).compute()
np.testing.assert_equal(preds['y_pred'].values, preds2['y_pred'].values)

#### Dynamic features

By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features like prices or a calendar with holidays you can pass them as a list to the `dynamic_dfs` argument of `DistributedForecast.predict`, which will call `pd.DataFrame.merge` on each of them in order.

Here's an example:

Suppose that we have a `product_id` column and we have a catalog for prices based on that `product_id` and the date.

In [None]:
dynamic_series = series.rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(dynamic_series)
prices_catalog

And you have already merged these prices into your series dataframe.

In [None]:
dynamic_series = partitioned_series.rename(columns={'static_1': 'product_id'})
dynamic_series = dynamic_series.reset_index()
series_with_prices = dynamic_series.merge(prices_catalog, how='left')
series_with_prices = series_with_prices.set_index('unique_id', sorted=True)
series_with_prices.head()

This dataframe will be passed to `DistributedForecast.fit` (or `DistributedForecast.preprocess`), however since the price is dynamic we have to tell that method that only `static_0` and `product_id` are static and we'll have to update `price` in every timestep, which basically involves merging the updated features with the prices catalog.

In [None]:
fcst = DistributedForecast(XGBForecast(), ts)
fcst.fit(series_with_prices, static_features=['static_0', 'product_id'])

So in order to update the price in each timestep we just call `DistributedForecast.predict` with our forecast horizon and pass the prices catalog as a dynamic dataframe.

In [None]:
preds = fcst.predict(7, dynamic_dfs=[prices_catalog])
preds.compute()

#### Custom predictions

If you want to do something like scaling the predictions you can define a function and pass it to `DistributedForecast.predict` as described in <a href="/mlforecast/forecast.html#Custom-predictions">Custom predictions</a>.

### Backtesting

If we would like to know how good our forecast will be for a specific model and set of features then we can perform backtesting. What backtesting does is take our data and split it into two parts, where the first part is used for training and the second one for validation. Since the data is time dependent we usually take the last *x* observations from our data as the validation set.

This process is implemented in `DistributedForecast.backtest`, which takes our data and performs the process described above for `n_windows` times where each window is of size `window_size`. For example, if we have 100 samples and we want to perform 2 backtests each of size 14, the splits will be as follows:

1. Train: 1 to 72. Validation: 73 to 86.
2. Train: 1 to 86. Validation: 87 to 100.

In [None]:
show_doc(DistributedForecast.backtest)

In [None]:
n_windows = 2
window_size = 14

fcst = DistributedForecast(model, ts)
backtest_results = fcst.backtest(partitioned_series, n_windows, window_size)

This returns a generator that yields the results of each window one at a time.

In [None]:
window1_result = next(backtest_results)
window1_result

In [None]:
window2_result = next(backtest_results)
results = pd.concat([window1_result.compute(), window2_result.compute()])

We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
agg_results = results.groupby('ds').sum()
agg_results.plot();

We can include some more context by using the values in the training set.

In [None]:
# Keep only the observations that come before the first backtest window.
history = series[series.ds < agg_results.index.min()]
agg_history = history.groupby('ds')[['y']].sum().tail(50)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the frames the same way.
pd.concat([agg_history, agg_results]).plot();

Note that since the backtest results are returned as a generator we can also compute a single statistic on them and not keep the whole results in memory.

In [None]:
def mse_from_dask_dataframe(ddf):
    """Compute the mean squared error between the `y` and `y_pred` columns of `ddf`.

    The squared error is kept in a local variable instead of being written
    back as a new column, so the dataframe passed by the caller is not modified.

    Parameters
    ----------
    ddf : dask.dataframe.DataFrame
        Dataframe with the actual values in `y` and the predictions in `y_pred`.

    Returns
    -------
    The result of calling `.compute()` on the mean of the squared errors.
    """
    sq_err = (ddf['y'] - ddf['y_pred'])**2
    return sq_err.mean().compute()

In [None]:
fcst = DistributedForecast(XGBForecast(), ts)
backtest_results = fcst.backtest(partitioned_series, n_windows, window_size)

losses = [mse_from_dask_dataframe(res) for res in backtest_results]
np.round(losses, 2)

We can try `LGBMForecast` as well

In [None]:
fcst = DistributedForecast(LGBMForecast(), ts)
backtest_results = fcst.backtest(partitioned_series, n_windows, window_size)

losses = [mse_from_dask_dataframe(res) for res in backtest_results]
np.round(losses, 2)

In [None]:
#hide
# Regression test: `DistributedForecast.backtest` must produce the same frames
# as manually looping over `backtest_splits`, fitting, predicting and merging.
model = XGBForecast()
ts = TimeSeries(lags=[7, 14])
fcst = DistributedForecast(model, ts)
# Materialize every window yielded by the generator.
backtest_results = [res.compute() for res in fcst.backtest(partitioned_series, n_windows, window_size)]
manual_results = []
for train, valid in backtest_splits(partitioned_series, n_windows, window_size):
    # A fresh model and TimeSeries are built for every window.
    model = XGBForecast()
    # NOTE(review): this TimeSeries sets num_threads=1 while the one used for
    # `backtest` above does not; presumably that does not change the results - confirm.
    ts = TimeSeries(lags=[7, 14], num_threads=1)
    fcst = DistributedForecast(model, ts).fit(train)
    pred = fcst.predict(window_size).compute()
    res = valid[['ds', 'y']].compute()
    # Same left merge on ['unique_id', 'ds'] that `backtest` performs.
    manual_results.append(res.merge(pred, on=['unique_id', 'ds'], how='left'))
backtest_results = pd.concat(backtest_results)
manual_results = pd.concat(manual_results)
pd.testing.assert_frame_equal(backtest_results, manual_results)

In [None]:
client.close()