In [None]:
#all_distributed

In [None]:
#default_exp distributed.core

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Distributed core

> Building blocks for the distributed pipeline.

In [None]:
#export
import operator
from typing import Dict, List, Optional, Tuple

import dask.dataframe as dd
from dask.distributed import Client, default_client, futures_of, wait

from mlforecast.core import TimeSeries


In [None]:
import pandas as pd
from nbdev import *
from window_ops.rolling import rolling_mean

from mlforecast.utils import generate_daily_series

In [None]:
#exporti
def _fit_transform(init_kwargs, data):
    """Build a `TimeSeries` from `init_kwargs` and fit it on `data`.

    Returns a `(fitted TimeSeries, transformed dataframe)` pair so that both
    pieces can be extracted from a single task.
    """
    series = TimeSeries(**init_kwargs)
    return series, series.fit_transform(data)


def _predict(ts, model, horizon, **kwargs):
    return ts.predict(model, horizon, **kwargs)


In [None]:
#export
class DistributedTimeSeries:
    """Dask-backed counterpart of `TimeSeries`.

    One `TimeSeries` is fitted per partition of a dask dataframe and kept on the
    cluster as a future, so that feature computation and prediction run
    partition-wise on the workers.
    """

    def __init__(
        self,
        freq: str = 'D',
        lags: Optional[List[int]] = None,
        lag_transforms: Optional[Dict[int, List[Tuple]]] = None,
        date_features: Optional[List[str]] = None,
        num_threads: Optional[int] = None,
        client: Optional[Client] = None,
    ):
        """Store the `TimeSeries` configuration and resolve the dask client.

        Parameters
        ----------
        freq, lags, lag_transforms, date_features, num_threads
            Forwarded as-is to every `TimeSeries` that gets created. When
            omitted, `lags` and `date_features` become empty lists and
            `lag_transforms` an empty dict.
        client : Client, optional
            Client used to submit the work. Falls back to `default_client()`,
            which raises if no client has been created yet.
        """
        # Fresh containers per instance: mutable default arguments would be
        # shared by every instance created without these arguments.
        lags = [] if lags is None else lags
        lag_transforms = {} if lag_transforms is None else lag_transforms
        date_features = [] if date_features is None else date_features
        self._init_kwargs = dict(
            freq=freq,
            lags=lags,
            lag_transforms=lag_transforms,
            date_features=date_features,
            num_threads=num_threads,
        )
        self.client = client or default_client()
        # Local, never-fitted instance that only backs `__repr__`.
        self._ts_for_repr = TimeSeries(
            freq, lags, lag_transforms, date_features, num_threads
        )

    def fit_transform(self, data: dd.DataFrame) -> dd.DataFrame:
        """Fit one `TimeSeries` per partition of `data` and return the transformed partitions.

        Side effects: stores the divisions of `data` in `self.data_divisions`
        and the futures of the fitted `TimeSeries` (one per partition, in
        partition order) in `self.ts`.

        Returns a dask dataframe assembled from the per-partition results.
        """
        self.data_divisions = data.divisions
        # Materialise the partitions on the cluster so there is one future per partition.
        data = self.client.persist(data)
        wait(data)
        partition_futures = futures_of(data)
        self.ts = []
        df_futures = []
        for part_future in partition_futures:
            # pure=False gives each submission a fresh key, so a repeated fit is
            # executed again instead of reusing a previous task's result.
            future = self.client.submit(
                _fit_transform, self._init_kwargs, part_future, pure=False
            )
            # Split the (TimeSeries, dataframe) pair into two separate futures.
            ts_future = self.client.submit(operator.itemgetter(0), future)
            df_future = self.client.submit(operator.itemgetter(1), future)
            self.ts.append(ts_future)
            df_futures.append(df_future)
        # Only the schema is needed for `meta`, hence the zero-row head.
        meta = self.client.submit(lambda x: x.head(0), df_futures[0]).result()
        return dd.from_delayed(df_futures, meta=meta)

    def predict(self, model, horizon: int, **kwargs) -> dd.DataFrame:
        """Compute `horizon`-step predictions with `model` on every fitted partition.

        `model` is broadcast to the workers once and each stored `TimeSeries`
        future runs `predict` with it. The result is a dask dataframe that
        reuses the divisions recorded by `fit_transform`.

        NOTE(review): `**kwargs` travel through `Client.submit`, so keyword names
        that `submit` itself accepts (e.g. `key`, `workers`, `retries`) are
        presumably consumed by dask and never reach `TimeSeries.predict`.

        Raises
        ------
        AttributeError
            If `fit_transform` has not been called yet.
        """
        # Fail early, before broadcasting the model, with an actionable message.
        if not hasattr(self, 'ts'):
            raise AttributeError(
                'There are no fitted TimeSeries. Call `fit_transform` before `predict`.'
            )
        model_future = self.client.scatter(model, broadcast=True)
        predictions_futures = [
            self.client.submit(_predict, ts_future, model_future, horizon, **kwargs)
            for ts_future in self.ts
        ]
        # Zero rows are enough to describe the schema; this matches `fit_transform`
        # and avoids pulling actual predictions to the client just for `meta`.
        meta = self.client.submit(lambda x: x.head(0), predictions_futures[0]).result()
        return dd.from_delayed(
            predictions_futures, meta=meta, divisions=self.data_divisions
        )

    def __repr__(self):
        """Return the repr of the equivalent local `TimeSeries` prefixed with `Distributed`."""
        ts_repr = repr(self._ts_for_repr)
        return f'Distributed{ts_repr}'


`DistributedTimeSeries.fit_transform` takes a `dask.dataframe` and fits a `TimeSeries` on each partition independently, generating as many `TimeSeries` objects as there are partitions in the dataframe and returning another `dask.dataframe` with the features included, which can then be used to perform distributed training with `dask`.

It is recommended that you have as many partitions as you have workers, so each worker performs one preprocessing task (optionally using multi-threading).

In [None]:
# Start a dask client with two workers for the examples below.
client = Client(n_workers=2)

In [None]:
# Synthetic dataset used throughout the examples
# (100 is presumably the number of series; verify against `generate_daily_series`).
series = generate_daily_series(100, n_static_features=2)
series

In [None]:
# Turn the pandas frame into a dask dataframe (6 partitions requested).
partitioned_series = dd.from_pandas(series, npartitions=6)
partitioned_series

In [None]:
# Feature configuration shared by the distributed and the local pipelines.
config = {
    'freq': 'D',
    'lags': [7, 14],
    'lag_transforms': {lag: [(rolling_mean, 7)] for lag in (7, 14)},
    'date_features': ['dayofweek'],
    'num_threads': 1,
}

# Distributed run: fit per partition and collect the features into one pandas frame.
dts = DistributedTimeSeries(**config)
distributed_features = dts.fit_transform(partitioned_series).compute()

# Local run over the full dataset must produce exactly the same features.
ts = TimeSeries(**config)
local_features = ts.fit_transform(series)
assert distributed_features.equals(local_features)

# Updating the features of every per-partition TimeSeries must match the local update too.
update_futures = client.map(lambda part_ts: part_ts.update_features(), dts.ts)
distributed_update = pd.concat(client.gather(update_futures))
local_update = ts.update_features()
assert distributed_update.equals(local_update)

In [None]:
class DummyModel:
    """Stand-in model whose prediction is the `lag-7` feature itself."""

    def predict(self, X):
        """Return the values of the `lag-7` column of `X`."""
        lag_7 = X['lag-7']
        return lag_7.values
    
# Forecast 7 steps ahead with the distributed pipeline.
horizon = 7
model = DummyModel()
dts = DistributedTimeSeries(**config)
# The returned features are not needed here; the call fits the per-partition TimeSeries.
dts.fit_transform(partitioned_series)
preds = dts.predict(model, horizon).compute()

In [None]:
# Reference result: the same forecast computed locally on the full pandas frame.
ts = TimeSeries(**config)
ts.fit_transform(series)
local_preds = ts.predict(model, horizon)

In [None]:
# The distributed predictions must match the local ones exactly.
assert preds.equals(local_preds)

In [None]:
# Close the dask client created for these examples.
client.close()