In [None]:
#all_distributed

In [None]:
#default_exp distributed.forecast

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
import warnings

warnings.simplefilter('ignore', UserWarning)

# Distributed forecast

> Distributed pipeline encapsulation

## This interface is only available on Linux

In [None]:
#export
from typing import Callable, Dict, Optional

import dask.dataframe as dd
from dask.distributed import Client, default_client

from mlforecast.core import predictions_flow, preprocessing_flow
from mlforecast.forecast import Forecast
from mlforecast.distributed.core import distributed_preprocess


In [None]:
#hide
from nbdev import show_doc

In [None]:
#export
class DistributedForecast(Forecast):
    """Distributed pipeline encapsulation.

    Bundles a model (`LGBMForecast` or `XGBForecast`), a flow configuration and a dask client,
    and exposes the preprocessing, training and prediction steps on top of them."""

    def __init__(self, model, flow_config: Dict, client: Optional[Client] = None):
        self.model = model
        self.flow_config = flow_config
        # Use the client that is currently active when the caller does not provide one.
        self.client = client if client else default_client()
        # The model is handed the same client that this object uses.
        model.client = self.client

    def __repr__(self) -> str:
        model, flow_config = self.model, self.flow_config
        return f'DistributedForecast(model={model}, flow_config={flow_config})'

    def preprocess(self,
                   data: dd.DataFrame,
                   prep_fn: Callable = preprocessing_flow) -> dd.DataFrame:
        """Computes the features of `data` through `distributed_preprocess` using `prep_fn` and `self.flow_config`.

        Stores the divisions of `data` in `self.data_divisions` and the first output of
        `distributed_preprocess` in `self.ts`; both are used later by `predict`.
        Returns the second output, a dask dataframe with the computed features."""
        self.data_divisions = data.divisions
        preprocessed = distributed_preprocess(data, self.flow_config, self.client, prep_fn)
        self.ts, features_ddf = preprocessed
        return features_ddf

    def fit(self,
            data: dd.DataFrame,
            prep_fn: Callable = preprocessing_flow,
            **fit_kwargs) -> 'DistributedForecast':
        """Preprocess `data` and train the model on the resulting features.

        Every column except `ds` and `y` is used as a feature and `y` is the target.
        `fit_kwargs` are forwarded to `self.model.fit`. Returns `self`."""
        features_ddf = self.preprocess(data, prep_fn)
        non_feature_cols = ['ds', 'y']
        X = features_ddf.drop(columns=non_feature_cols)
        y = features_ddf.y
        self.model.fit(X, y, **fit_kwargs)
        return self

    def predict(self,
                horizon: int,
                predict_fn: Callable = predictions_flow,
                **predict_fn_kwargs) -> dd.DataFrame:
        """Forecast the next `horizon` steps by mapping `predict_fn` over the items saved in `self.ts`.

        `predict_fn_kwargs` are forwarded to `predict_fn`.
        Requires `preprocess` (or `fit`) to have been called first, since it reads
        `self.ts` and `self.data_divisions`."""
        # Scatter the trained model with broadcast=True and pass the resulting future to each task.
        broadcasted_model = self.client.scatter(self.model.model_, broadcast=True)
        partition_preds = self.client.map(
            predict_fn,
            self.ts,
            model=broadcasted_model,
            horizon=horizon,
            **predict_fn_kwargs,
        )
        # The head of the first result serves as `meta` for the dask dataframe built below.
        first_rows = self.client.submit(lambda x: x.head(), partition_preds[0])
        meta = first_rows.result()
        return dd.from_delayed(partition_preds, meta=meta, divisions=self.data_divisions)

The `DistributedForecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing predictions) and applies them in a distributed way.

## Example
This example uses simulated data. For a real-world example running on a remote cluster, see the [M5 distributed example](https://www.kaggle.com/lemuz90/m5-mlforecast-distributed).

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean, rolling_std

from mlforecast.distributed.models.lgb import LGBMForecast
from mlforecast.distributed.models.xgb import XGBForecast
from mlforecast.utils import generate_daily_series

Using `DistributedForecast` (as opposed to `Forecast`) differs in the following ways:
1. You need to set up a `dask.distributed.Client`. If this client is connected to a remote cluster then the process will run there.
2. Your data needs to be a `dask.dataframe`.
3. You need to use a model that implements distributed training.

### Client setup

Here we define a client that connects to a `dask.distributed.LocalCluster`, however it could be any other kind of cluster.

In [None]:
client = Client(n_workers=2)

### Data setup

The data is given as a `dask.dataframe`. You need to make sure that each time series is contained in a single partition, and it is recommended that you have as many partitions as you have workers.

The required input format is the same as for `Forecast`, except that it's a `dask.dataframe` instead of a `pandas.DataFrame`.

In [None]:
series = generate_daily_series(100, n_static_features=2, equal_ends=True)
for col in series.select_dtypes(include='category'):
    series[col] = series[col].cat.codes  # encode categories for xgboost
partitioned_series = dd.from_pandas(series, npartitions=2)
partitioned_series

### Model
In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `dask`. The current implementations are in `LGBMForecast` and `XGBForecast` which are just wrappers around `lightgbm.dask.DaskLGBMRegressor` and `xgboost.dask.DaskXGBRegressor` that add a `model_` property to get the trained model from them and send it to every worker to perform the predictions step.

In [None]:
model = XGBForecast()

### Flow configuration

As in the local version (`Forecast`), a flow configuration is required. This is passed as a dictionary that will be unpacked in the call to our preprocessing function, which is `preprocessing_flow` by default.

In [None]:
preprocessing_flow?

In the configuration below we say that:
1. Our series have daily frequency.
2. We want to use lag 7 as a feature.
3. We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 7
4. We want to use dayofweek and month as date features.
5. We want to perform the preprocessing and the forecasting steps using 2 threads.

In [None]:
# Keyword arguments that get unpacked into the preprocessing function.
flow_config = {
    'freq': 'D',
    'lags': [7],
    # Maps each lag to the transformations applied on it;
    # a tuple is a transformation followed by its window size.
    'lag_transforms': {
        1: [expanding_mean],
        7: [(rolling_mean, 7)],
    },
    'date_features': ['dayofweek', 'month'],
    'num_threads': 2,
}

### Training
Once we have our model and flow configuration we instantiate a `DistributedForecast` with them.

In [None]:
fcst = DistributedForecast(model, flow_config)

From this point we have two options:

1. Preprocess the data and fit our model using all of it.
2. Preprocess the data and get it back as a dataframe to do some custom splitting or add additional features, and then train the model.

#### 1. Using all the data

In [None]:
show_doc(DistributedForecast.fit)

Calling `.fit` on our data computes the features independently for each partition and performs distributed training.

In [None]:
fcst.fit(partitioned_series)

#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `.preprocess` on our data.

In [None]:
show_doc(DistributedForecast.preprocess)

In [None]:
features_ddf = fcst.preprocess(partitioned_series)
features_ddf.head()

This is useful if we want to inspect the data the model will be trained on, add additional features or perform some custom train-valid split. Here we perform an 80-20 split.

In [None]:
rng = np.random.RandomState(0)

def mask_as_series(df):
    """Return a boolean series aligned with `df` that is True for roughly 80% of its rows.

    The random generator is created inside the function and seeded from the contents of
    the partition's index. The mask is evaluated lazily and more than once (for the
    selection and for its negation), so it must not depend on the state of a shared
    generator: otherwise the two evaluations could disagree and rows could end up in both
    splits or in neither.
    """
    # Deterministic per-partition seed; `int(...)` first so the modulo is done on a Python int.
    seed = int(pd.util.hash_pandas_object(df.index).sum()) % 2**32
    partition_rng = np.random.RandomState(seed)
    return pd.Series(partition_rng.rand(df.shape[0]) < 0.8, index=df.index)

train_mask = features_ddf.map_partitions(mask_as_series)
train, valid = features_ddf[train_mask], features_ddf[~train_mask]
X_train, y_train = train.drop(columns=['ds', 'y']), train.y
X_valid, y_valid = valid.drop(columns=['ds', 'y']), valid.y

If we do this we must "manually" train our model calling `DistributedForecast.model.fit`.

In [None]:
fcst.model.fit(X_train, y_train, 
               eval_set=[(X_train, y_train), (X_valid, y_valid)],
               verbose=0)

In [None]:
# Plot the rmse curve recorded for each evaluation set passed to `fit`.
for lab, metrics in fcst.model.evals_result_.items():
    plt.plot(metrics['rmse'], label=lab)
plt.legend();

### Forecasting
Once we have our fitted model we can compute the predictions for the next 7 timesteps.

In [None]:
show_doc(DistributedForecast.predict)

In [None]:
preds = fcst.predict(7)
preds

### Backtesting

If we would like to know how good our forecast will be for a specific model and set of features then we can perform backtesting. What backtesting does is take our data and split it in two parts, where the first part is used for training and the second one for validation. Since the data is time dependent we usually take the last *x* observations from our data as the validation set.

This process is implemented in `Forecast.backtest` (and inherited by `DistributedForecast`), which takes our data and performs the process described above `n_windows` times, where each window is of size `window_size`. For example, if we have 100 samples and we want to perform 2 backtests each of size 14, the splits will be as follows:

1. Train: 1 to 72. Validation: 73 to 86.
2. Train: 1 to 86. Validation: 87 to 100.

In [None]:
show_doc(DistributedForecast.backtest)

In [None]:
n_windows = 2
window_size = 14

fcst = DistributedForecast(model, flow_config)
backtest_results = fcst.backtest(partitioned_series, n_windows, window_size)

This returns a generator that yields the results of each window one at a time.

In [None]:
window1_result = next(backtest_results)
window1_result

In [None]:
window2_result = next(backtest_results)
results = pd.concat([window1_result.compute(), window2_result.compute()])

We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
agg_results = results.groupby('ds').sum()
agg_results.plot();

We can include some more context by using the values in the training set.

In [None]:
# Keep only the observations that come before the first backtest date.
history = series[series.ds < agg_results.index.min()]
agg_history = history.groupby('ds')[['y']].sum().tail(50)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the frames the same way.
pd.concat([agg_history, agg_results]).plot();

Note that since the backtest results are returned as a generator we can also compute a single statistic on them and not keep the whole results in memory.

In [None]:
def mse_from_dask_dataframe(ddf):
    """Return the mean squared error between the `y` and `y_pred` columns of `ddf`.

    The squared errors are built as a separate lazy series instead of being assigned
    to a new column, so the dataframe passed by the caller is left untouched.
    The result is computed eagerly (`.compute()`).
    """
    squared_errors = (ddf['y'] - ddf['y_pred'])**2
    return squared_errors.mean().compute()

In [None]:
fcst = DistributedForecast(XGBForecast(), flow_config)
backtest_results = fcst.backtest(partitioned_series, n_windows, window_size)

losses = [mse_from_dask_dataframe(res) for res in backtest_results]
np.round(losses, 2)

We can try `LGBMForecast` as well

In [None]:
fcst = DistributedForecast(LGBMForecast(), flow_config)
backtest_results = fcst.backtest(partitioned_series, n_windows, window_size)

losses = [mse_from_dask_dataframe(res) for res in backtest_results]
np.round(losses, 2)