In [None]:
#all_distributed

In [None]:
#default_exp distributed.core

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Distributed core

> Building blocks for the distributed pipeline.

In [None]:
#export
import operator
from typing import Callable, Dict, List, Optional, Tuple

import dask.dataframe as dd
from dask.distributed import Client, Future, default_client, futures_of, wait

from mlforecast.core import preprocessing_flow


In [None]:
import pandas as pd
from nbdev import *
from window_ops.rolling import rolling_mean

from mlforecast.utils import generate_daily_series

In [None]:
#export
def distributed_preprocess(data: dd.DataFrame,
                           config: Dict,
                           client: Optional[Client] = None,
                           flow: Callable = preprocessing_flow) -> Tuple[List[Future], dd.DataFrame]:
    """Runs `flow(partition, **config)` on every partition of `data` using a dask client.

    `data` is the dask dataframe to process, `config` holds the keyword arguments
    forwarded to `flow`, and `client` is the dask client to use (the default client
    is looked up when none is given). `flow` must return an indexable pair whose
    first item is the `TimeSeries` object and whose second item is the dataframe
    with the computed features.

    Returns one future per partition pointing to its `TimeSeries` object, and a
    dask dataframe assembled from the per-partition feature dataframes, suitable
    for training a distributed model."""
    dask_client = client if client else default_client()

    # Materialize the partitions on the cluster first so that each one is backed
    # by a future we can feed to `flow`.
    persisted = dask_client.persist(data)
    wait(persisted)
    flow_outputs = dask_client.map(flow, futures_of(persisted), **config)

    # pure=False is used here in case one of the TimeSeries gets modified in-place
    # and we want to recompute it.
    series_futures = dask_client.map(operator.itemgetter(0), flow_outputs, pure=False)

    frame_futures = dask_client.map(operator.itemgetter(1), flow_outputs)
    # The first rows of the first partition serve as the metadata (columns/dtypes)
    # for the resulting dask dataframe.
    first_frame = frame_futures[0]
    meta = dask_client.submit(lambda x: x.head(), first_frame).result()

    return series_futures, dd.from_delayed(frame_futures, meta=meta)

`distributed_preprocess` takes a `dask.dataframe` and applies the preprocessing function (`preprocessing_flow` by default) to each partition independently. This generates one `TimeSeries` object per partition of the dataframe, plus a new `dask.dataframe` that includes the computed features and can be used to perform distributed training with `dask`.

It is recommended that you have as many partitions as you have workers, so each worker performs one preprocessing task (optionally using multi-threading).

In [None]:
# Create a dask client backed by two workers. `distributed_preprocess` is called
# below without an explicit client, so it picks this one up via `default_client()`.
client = Client(n_workers=2)

In [None]:
# Example input data; presumably 100 synthetic daily series with two static
# features each (see `generate_daily_series` for the exact output).
series = generate_daily_series(100, n_static_features=2)
series

In [None]:
# Split the pandas dataframe into two partitions, one per worker of the client above.
partitioned_series = dd.from_pandas(series, npartitions=2)
partitioned_series

In [None]:
# Keyword arguments for the preprocessing flow; the same settings are used for
# both the distributed run and the local reference run below.
config = {
    'freq': 'D',
    'lags': [7, 14],
    'lag_transforms': {
        7: [(rolling_mean, 7)],
        14: [(rolling_mean, 7)],
    },
    'date_features': ['dayofweek'],
    'num_threads': 2,
    'keep_last_n': 20,
}

In [None]:
# Run the flow through dask: one future per partition plus the lazy training dataframe.
ts_futures, train_ddf = distributed_preprocess(partitioned_series, config)

# Reference run: the same flow applied locally to the full, unpartitioned dataframe.
local_ts, local_df = preprocessing_flow(series, **config)
# The materialized dask dataframe must equal the locally computed one.
assert train_ddf.compute().equals(local_df)
# Call `update_features` remotely on each partition's object, gather and
# concatenate the results, and check them against the local object's output.
next_feats_futures = client.map(lambda x: x.update_features(), ts_futures)
next_feats = pd.concat(client.gather(next_feats_futures))
assert next_feats.equals(local_ts.update_features())

In [None]:
# Close the client created for this example now that the checks have run.
client.close()