In [None]:
#default_exp distributed.forecast

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#hide
import warnings

# Ignore every UserWarning emitted from this point on in the notebook session.
warnings.simplefilter('ignore', UserWarning)

# Distributed Forecast

> Distributed pipeline encapsulation

In [None]:
#export
from typing import Callable, Dict, Optional

import dask.dataframe as dd
from dask.distributed import Client, default_client

from mlforecast.core import preprocessing_flow
from mlforecast.distributed.core import distributed_preprocess

In [None]:
import numpy as np
from window_ops.rolling import *
from window_ops.expanding import *
from window_ops.ewm import ewm_mean

from mlforecast.distributed.models import LGBMForecast, XGBForecast
from mlforecast.utils import generate_daily_series

In [None]:
#export
class DistributedForecast:
    """Bundle preprocessing, model training and prediction on top of dask.

    Parameters
    ----------
    model
        Object on which `fit(X, y, **kwargs)` and
        `predict(ts, horizon, divisions)` are called; nothing else is
        required from it by this class.
    flow_config : Dict
        Configuration handed unchanged to `distributed_preprocess`.
    client : Client, optional
        Dask client to use. When it is omitted (or falsy) the result of
        `default_client()` is used instead.
    """

    def __init__(self, model, flow_config: Dict, client: Optional[Client] = None):
        self.model = model
        self.flow_config = flow_config
        # Fall back to the currently registered client when none is supplied.
        self.client = client if client else default_client()

    def preprocess(self, data: dd.DataFrame, prep_fn: Callable = preprocessing_flow) -> dd.DataFrame:
        """Run the distributed preprocessing and return the training frame.

        Stores two attributes as a side effect: `data_divisions` (the
        divisions of `data`) and `ts` (the first value returned by
        `distributed_preprocess`). Both are needed later by `predict`.
        """
        # Keep the input partition boundaries; `predict` passes them to the model.
        self.data_divisions = data.divisions
        ts, train_ddf = distributed_preprocess(data, self.flow_config, self.client, prep_fn)
        self.ts = ts
        return train_ddf

    def fit(self, data: dd.DataFrame, **kwargs) -> 'DistributedForecast':
        """Preprocess `data`, train the model on the result and return `self`.

        Every column of the preprocessed frame except `ds` and `y` is used
        as a feature; `y` is the target. Extra keyword arguments go
        straight to `model.fit`.
        """
        prepared = self.preprocess(data)
        features = prepared.drop(columns=['ds', 'y'])
        target = prepared.y
        self.model.fit(features, target, **kwargs)
        return self

    def predict(self, horizon: int) -> dd.DataFrame:
        """Delegate to `model.predict` for `horizon` steps.

        Relies on `ts` and `data_divisions`, which only exist after
        `preprocess` (and therefore `fit`) has been called.
        """
        ts, divisions = self.ts, self.data_divisions
        return self.model.predict(ts, horizon, divisions)

    def __repr__(self) -> str:
        """Show the wrapped model and the flow configuration."""
        return f'DistributedForecast(model={self.model}, flow_config={self.flow_config})'

The `DistributedForecast` class is a high-level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing predictions) and applies them in a distributed way.

In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `dask`. The current implementations are `LGBMForecast` and `XGBForecast`, which are just wrappers around `DaskLGBMRegressor` and `DaskXGBRegressor`. They add a `model_` property that retrieves the trained model so that it can be sent to every worker to perform the prediction step.

The differences when using `DistributedForecast` (as opposed to `Forecast`) are:
1. You need to set up a `dask.distributed.Client`. If this client is connected to a remote cluster then the process will run there.
2. Your data needs to be a `dask.dataframe`.
3. You need to use a model that implements distributed training.

## Example

### 1. Set up a client

In [None]:
# Create a dask client with 2 workers. `DistributedForecast` is built below
# without an explicit client, so it obtains one through `default_client()`.
client = Client(n_workers=2)

### 2. Set up your data. 

The data is given as a `dask.dataframe`. It is recommended that you make sure that each time series is contained in a single partition and that you have as many partitions as you have workers.

The required input format is the same as for `Forecast`, except that it's a `dask.dataframe` instead of a `pandas.DataFrame`.

In [None]:
# Synthetic input data (presumably 100 daily series, each with 2 static
# features -- confirm against `generate_daily_series`).
series = generate_daily_series(100, n_static_features=2)
# Convert the pandas frame into a dask dataframe with 2 partitions, i.e. one
# per worker of the client created above.
partitioned_series = dd.from_pandas(series, npartitions=2)
partitioned_series

As in the local version (`Forecast`), a flow configuration is required.

### Flow configuration

In [None]:
# Configuration passed to `DistributedForecast`, which forwards it untouched
# to `distributed_preprocess`. The meaning of each key is defined by the
# preprocessing flow; presumably the `lag_transforms` keys are lags and the
# tuples carry a transform plus its extra arguments -- confirm in its docs.
flow_config = {
    'freq': 'D',
    'lags': [7, 14],
    'lag_transforms': {
        1: [expanding_mean],
        7: [(rolling_mean, 7), (rolling_std, 7)],
    },
    'date_features': ['dayofweek', 'month', 'year'],
    'num_threads': 2,
}

### Training

In [None]:
# No client is passed, so the forecaster falls back to `default_client()`.
fcst = DistributedForecast(LGBMForecast(), flow_config)
# `fit` preprocesses the data, trains the model and returns the forecaster
# itself, so the cell output is its `__repr__`.
fcst.fit(partitioned_series)

### Predictions

In [None]:
# Request a horizon of 7; `predict` is annotated as returning a dask
# dataframe, so call `.compute()` on the result to materialize it.
fcst.predict(7)

## Validation

In [None]:
def get_last_n_mask(serie, n):
    """Return a boolean array flagging the last `n` positions of `serie`.

    Parameters
    ----------
    serie : array-like
        Values of one series. Only its shape matters; the contents are
        never read.
    n : int
        Number of trailing positions to flag. Must be non-negative. A
        value larger than the length of `serie` flags every position.

    Returns
    -------
    numpy.ndarray
        Boolean array shaped like `serie` that is `True` on the last `n`
        entries and `False` everywhere else.

    Raises
    ------
    ValueError
        If `n` is negative.
    """
    if n < 0:
        raise ValueError(f'n must be non-negative, got {n}')
    mask = np.full_like(serie, False, dtype=bool)
    # `mask[-0:]` selects the whole array, so the assignment has to be skipped
    # for n == 0; otherwise every position would be flagged instead of none.
    if n > 0:
        mask[-n:] = True
    return mask

# Number of trailing observations of every series held out for validation.
test_size = 14
# Row-aligned flags computed per `unique_id` group by `get_last_n_mask`.
valid_mask = series.groupby('unique_id')['y'].transform(get_last_n_mask, test_size)
# Training set: the rows outside the holdout, split into 2 dask partitions.
train = dd.from_pandas(series[~valid_mask], npartitions=2)
# Holdout targets with `ds` appended to the index, so that predictions indexed
# the same way can be joined onto them.
y_valid = series[valid_mask].set_index('ds', append=True)[['y']]

In [None]:
def eval_preds(train, model, cats2int=False):
    """Train `model` on `train`, then report how it does on the holdout set.

    Reads the notebook-level `flow_config`, `test_size` and `y_valid`.
    Prints the MSE (mean over series of each series' mean squared error)
    and plots the per-date sums of actual values and predictions.
    Nothing is returned.

    Parameters
    ----------
    train
        Training data passed to `DistributedForecast.fit`.
    model
        Model wrapped by `DistributedForecast`.
    cats2int : bool, default False
        When true, a copy of `train` is used in which every categorical
        column is replaced by its integer codes.
    """
    if cats2int:
        # Work on a copy so the caller's frame keeps its categorical columns.
        train = train.copy()
        categorical_cols = train.select_dtypes(include='category')
        for name in categorical_cols:
            train[name] = train[name].cat.codes

    forecaster = DistributedForecast(model, flow_config)
    forecaster.fit(train)
    preds = forecaster.predict(test_size).compute()

    # Align predictions with the holdout on the (existing index, ds) pair.
    indexed_preds = preds.set_index('ds', append=True)
    evals = y_valid.join(indexed_preds)
    evals['sq_err'] = (evals['y'] - evals['y_pred'])**2
    per_series_mse = evals.groupby('unique_id')['sq_err'].mean()
    mse = per_series_mse.mean()
    print(f'MSE: {mse:.1f}')

    # Compare the totals over all series for each date.
    actual_totals = y_valid.groupby('ds').sum()
    predicted_totals = preds.groupby('ds')['y_pred'].sum()
    actual_totals.join(predicted_totals).plot(marker='.', figsize=(16, 6))

In [None]:
# Evaluate `LGBMForecast` on the holdout: prints the MSE and draws the plot.
eval_preds(train, LGBMForecast())

In [None]:
# Same evaluation for `XGBForecast`; `cats2int=True` replaces the categorical
# columns with their integer codes before training.
eval_preds(train, XGBForecast(), cats2int=True)

In [None]:
# Close the client created at the start of the example.
client.close()