In [None]:
#| hide
# Development convenience: load IPython's autoreload extension in mode 2 so that
# edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

# Quick start (distributed)

> Minimal example of distributed training with MLForecast

The `DistributedMLForecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing predictions) and applies them in a distributed way.

To use `DistributedMLForecast` (as opposed to `MLForecast`) you need the following:

1. You need to set up a cluster. We currently support dask, ray and spark.
2. Your data needs to be a distributed collection (dask, ray or spark dataframe).
3. You need to use a model that implements distributed training in your framework of choice, e.g. SynapseML for LightGBM in spark.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

from mlforecast.distributed import DistributedMLForecast
from mlforecast.target_transforms import Differences
from mlforecast.utils import generate_daily_series, generate_prices_for_series

## Dask

In [None]:
import dask.dataframe as dd
from dask.distributed import Client

### Client setup

In [None]:
# two workers with a single thread each
client = Client(
    n_workers=2,
    threads_per_worker=1,
)

Here we define a client that connects to a `dask.distributed.LocalCluster`; however, it could be any other kind of cluster.

### Data setup

For dask, the data must be a `dask.dataframe.DataFrame`. You need to make sure that each time series is in only one partition, and it is recommended that you have as many partitions as you have workers. If you have more partitions than workers, make sure to set `num_threads=1` to avoid nested parallelism.

The required input format is the same as for `MLForecast`, except that it's a `dask.dataframe.DataFrame` instead of a `pandas.DataFrame`.

In [None]:
series = generate_daily_series(
    100,
    min_length=500,
    max_length=1_000,
    n_static_features=2,
    static_as_categorical=False,
    equal_ends=True,
)
npartitions = 10
# Index by the id column before building the dask dataframe to make sure we
# split by the id_col, then restore it as a regular column in every partition.
partitioned_series = (
    dd.from_pandas(series.set_index('unique_id'), npartitions=npartitions)
    .map_partitions(lambda part: part.reset_index())
)
# can't handle categoricals atm, so the ids are cast to str
partitioned_series['unique_id'] = partitioned_series['unique_id'].astype(str)
partitioned_series

Unnamed: 0_level_0,unique_id,ds,y,static_0,static_1
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_00,object,datetime64[ns],float64,int64,int64
id_10,...,...,...,...,...
...,...,...,...,...,...
id_90,...,...,...,...,...
id_99,...,...,...,...,...


### Models
In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `dask`. The current implementations are in `DaskLGBMForecast` and `DaskXGBForecast` which are just wrappers around the native implementations.

In [None]:
from mlforecast.distributed.models.dask.lgb import DaskLGBMForecast
from mlforecast.distributed.models.dask.xgb import DaskXGBForecast

In [None]:
# one dask-native model per framework, both with a fixed random_state
models = [
    DaskXGBForecast(random_state=0),
    DaskLGBMForecast(random_state=0),
]

### Training
Once we have our models we instantiate a `DistributedMLForecast` object defining our features. We can then call `fit` on this object passing our dask dataframe.

In [None]:
fcst = DistributedMLForecast(
    models=models,
    engine=client,
    # the data has more partitions (10) than the client has workers (2),
    # so a single thread is used to avoid nested parallelism
    num_threads=1,
    freq='D',
    date_features=['dayofweek', 'month'],
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)],
    },
)
fcst.fit(partitioned_series)

In [None]:
#| hide
import fugue.api as fa
from fastcore.test import test_eq

In [None]:
#| hide
def test_partition_results_size(fcst_object, expected_n_partitions):
    """Test that the partition_results data has the right size.

    Both the number of partitions (`fa.get_num_partitions`) and the row count
    (`fa.count`) of `fcst_object.partition_results` are compared against
    `expected_n_partitions` with `test_eq`.
    """
    for measure in (fa.get_num_partitions, fa.count):
        test_eq(measure(fcst_object.partition_results), expected_n_partitions)

In [None]:
#| hide
# the fitted object should hold as many partition results as the input has partitions
test_partition_results_size(fcst, npartitions)

In [None]:
#| hide
# test num_partitions works properly
num_partitions_test = 4
# in this case we don't have to specify the column
test_dd = dd.from_pandas(series, npartitions=num_partitions_test)
test_dd['unique_id'] = test_dd['unique_id'].astype(str)
fcst_np = DistributedMLForecast(
    models=models,
    engine=client,
    num_threads=1,
    num_partitions=num_partitions_test,
    freq='D',
    date_features=['dayofweek', 'month'],
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)],
    },
)
fcst_np.fit(test_dd)
test_partition_results_size(fcst_np, num_partitions_test)
preds_np = (
    fcst_np.predict(7)
    .compute()
    .sort_values(['unique_id', 'ds'])
    .reset_index(drop=True)
)
preds = (
    fcst.predict(7)
    .compute()
    .sort_values(['unique_id', 'ds'])
    .reset_index(drop=True)
)
# only the ids and dates are compared between the two fitted objects
pd.testing.assert_frame_equal(
    preds[['unique_id', 'ds']],
    preds_np[['unique_id', 'ds']],
)

Once we have our fitted models we can compute the predictions for the next 7 timesteps.

### Forecasting

In [None]:
# forecast the next 7 timesteps
preds = fcst.predict(7)
# materialize the result to display its first rows
preds.compute().head()

Unnamed: 0,unique_id,ds,DaskXGBForecast,DaskLGBMForecast
0,id_00,2002-09-27,18.676165,17.691819
1,id_00,2002-09-28,90.782455,90.198168
2,id_00,2002-09-29,169.503098,163.52241
3,id_00,2002-09-30,241.540359,244.411795
4,id_00,2002-10-01,315.643768,313.694593


In [None]:
#|hide
# Predictions must be identical across repeated calls and when the training
# data is passed again through `new_df`.
# The computed frame gets its own name: rebinding `preds` to the pandas result
# would make this cell fail on a second run (a pandas frame has no `.compute`).
preds_df = preds.compute()
preds2 = fcst.predict(7).compute()
preds3 = fcst.predict(7, new_df=partitioned_series).compute()
pd.testing.assert_frame_equal(preds_df, preds2)
pd.testing.assert_frame_equal(preds_df, preds3)

### Cross validation

In [None]:
# cross validation over 3 windows with h=14 (presumably the horizon of each window)
cv_res = fcst.cross_validation(
    partitioned_series,
    n_windows=3,
    h=14,
)
cv_res

In [None]:
# materialize the cross validation result to display its first rows
cv_res.compute().head()

Unnamed: 0,unique_id,ds,DaskXGBForecast,DaskLGBMForecast,cutoff,y
0,id_00,2002-08-16,19.199099,18.868631,2002-08-15,11.878591
1,id_00,2002-08-17,93.734985,92.715766,2002-08-15,75.108162
2,id_00,2002-08-18,163.924606,167.22973,2002-08-15,175.278407
3,id_00,2002-08-19,245.957672,241.534768,2002-08-15,226.062025
4,id_00,2002-08-20,309.519073,306.687081,2002-08-15,318.433401


In [None]:
#| hide
from mlforecast.distributed.forecast import WindowInfo

In [None]:
#| hide
# input_size: after preprocessing with a window that sets `input_size`,
# the longest series is expected to have exactly that many rows
input_size = 100
window_info = WindowInfo(
    n_windows=1,
    window_size=10,
    step_size=None,
    i_window=0,
    input_size=input_size,
)
reduced_train = fcst._preprocess(
    partitioned_series,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    dropna=False,
    window_info=window_info,
)
max_rows_per_series = reduced_train.groupby('unique_id').size().compute().max()
assert max_rows_per_series == input_size

In [None]:
#| hide
cv_res_no_refit = fcst.cross_validation(
    partitioned_series,
    n_windows=3,
    h=14,
    refit=False
)
cv_results_df = cv_res.compute()
cv_results_no_refit_df = cv_res_no_refit.compute()
# test we recover the same "metadata", i.e. every column except the predictions.
# The prediction columns get a dedicated name so the `models` list of
# estimators defined earlier is not overwritten with strings.
model_cols = ['DaskXGBForecast', 'DaskLGBMForecast']
test_eq(
    cv_results_no_refit_df.drop(columns=model_cols),
    cv_results_df.drop(columns=model_cols)
)

In [None]:
#|hide
# Same data with an integer time column and custom column names: the
# predictions must match the ones obtained with the standard names.
non_std_series = partitioned_series.copy()
non_std_series['ds'] = non_std_series.map_partitions(
    lambda df: df.groupby('unique_id').cumcount()
)
non_std_series = non_std_series.rename(
    columns={'ds': 'time', 'y': 'value', 'unique_id': 'some_id'}
)
flow_params = {
    'models': [DaskXGBForecast(random_state=0)],
    'lags': [7],
    'lag_transforms': {
        1: [expanding_mean],
        7: [(rolling_mean, 14)],
    },
    'num_threads': 1,
}
fcst = DistributedMLForecast(freq='D', **flow_params)
fcst.fit(partitioned_series)
preds = fcst.predict(7).compute()
fcst2 = DistributedMLForecast(freq=1, **flow_params)
fcst2.preprocess(
    non_std_series,
    id_col='some_id',
    time_col='time',
    target_col='value',
)
# reuse the fitted models, since distributed training can end up with different fits
fcst2.models_ = fcst.models_
non_std_preds = fcst2.predict(7).compute()
pd.testing.assert_frame_equal(
    preds.drop(columns='ds'),
    non_std_preds.drop(columns='time').rename(columns={'some_id': 'unique_id'}),
)

In [None]:
# close the dask client now that the Dask section is finished
client.close()

## Spark

### Session setup

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = (
    SparkSession
    .builder
    # request the SynapseML package (used for LightGBM in spark) and an extra
    # repository to resolve it from
    .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:0.10.2")
    .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven")
    .getOrCreate()
)

### Data setup
For spark, the data must be a `pyspark DataFrame`. You need to make sure that each time series is in only one partition (which you can do using `repartitionByRange`, for example), and it is recommended that you have as many partitions as you have workers. If you have more partitions than workers, make sure to set `num_threads=1` to avoid nested parallelism.

The required input format is the same as for `MLForecast`, i.e. it should have at least an id column, a time column and a target column.

In [None]:
numPartitions = 4
series = generate_daily_series(
    100,
    n_static_features=2,
    static_as_categorical=False,
    equal_ends=True,
)
# range-partition on the id column, which is how each series is kept in a single partition
spark_series = (
    spark.createDataFrame(series)
    .repartitionByRange(numPartitions, 'unique_id')
)

### Models
In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `spark`. The current implementations are in `SparkLGBMForecast` and `SparkXGBForecast` which are just wrappers around the native implementations.

In [None]:
from mlforecast.distributed.models.spark.lgb import SparkLGBMForecast

# LightGBM is always included; XGBoost is added only if its spark modules can be imported
models = [SparkLGBMForecast()]
try:
    # the first import acts as an availability check: the name is not used in this cell
    from xgboost.spark import SparkXGBRegressor
    from mlforecast.distributed.models.spark.xgb import SparkXGBForecast
    models.append(SparkXGBForecast())
except ModuleNotFoundError:  # py < 38
    # best effort: continue with the LightGBM model only
    pass

### Training

In [None]:
fcst = DistributedMLForecast(
    models,
    freq='D',
    date_features=['dayofweek'],
    lags=[1],
    lag_transforms={1: [expanding_mean]},
)
# the static columns are declared explicitly when fitting
fcst.fit(spark_series, static_features=['static_0', 'static_1'])

In [None]:
#| hide
# the fitted object should hold as many partition results as the spark data has partitions
test_partition_results_size(fcst, numPartitions)

                                                                                

In [None]:
#| hide
# test num_partitions works properly
num_partitions_test = 10
test_spark_df = spark.createDataFrame(series)
fcst_np = DistributedMLForecast(
    models=models,
    num_threads=1,
    num_partitions=num_partitions_test,
    freq='D',
    date_features=['dayofweek', 'month'],
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)],
    },
)
fcst_np.fit(test_spark_df)
test_partition_results_size(fcst_np, num_partitions_test)
preds_np = (
    fcst_np.predict(7)
    .toPandas()
    .sort_values(['unique_id', 'ds'])
    .reset_index(drop=True)
)
preds = (
    fcst.predict(7)
    .toPandas()
    .sort_values(['unique_id', 'ds'])
    .reset_index(drop=True)
)
# only the ids and dates are compared between the two fitted objects
pd.testing.assert_frame_equal(
    preds[['unique_id', 'ds']],
    preds_np[['unique_id', 'ds']],
)

### Forecasting

In [None]:
# forecast the next 14 timesteps
preds = fcst.predict(14)

In [None]:
# collect the forecasts into pandas to display the first rows
preds.toPandas().head()

  series = series.astype(t, copy=False)


Unnamed: 0,unique_id,ds,SparkLGBMForecast,SparkXGBForecast
0,id_00,2001-05-15,422.139843,421.606537
1,id_00,2001-05-16,497.180212,505.575836
2,id_00,2001-05-17,13.062478,15.462178
3,id_00,2001-05-18,100.601041,102.123245
4,id_00,2001-05-19,180.707848,182.308197


### Cross validation

In [None]:
# cross validation over 3 windows with h=14; the result is collected into pandas
cv_res = fcst.cross_validation(
    spark_series,
    n_windows=3,
    h=14,
).toPandas()

In [None]:
# first rows of the cross validation results
cv_res.head()

Unnamed: 0,unique_id,ds,SparkLGBMForecast,SparkXGBForecast,cutoff,y
0,id_04,2001-04-03,206.226409,202.242142,2001-04-02,216.937502
1,id_00,2001-04-03,415.538504,420.034576,2001-04-02,429.217687
2,id_00,2001-04-07,180.093252,179.349228,2001-04-02,192.303211
3,id_12,2001-04-07,143.923572,145.31871,2001-04-02,155.071484
4,id_19,2001-04-15,19.385093,74.153099,2001-04-02,14.420419


In [None]:
# stop the spark session now that the Spark section is finished
spark.stop()

## Ray

### Session setup

In [None]:
import ray
from ray.cluster_utils import Cluster

In [None]:
# head node with 2 CPUs
ray_cluster = Cluster(initialize_head=True, head_node_args={"num_cpus": 2})
ray.init(
    address=ray_cluster.address,
    ignore_reinit_error=True,
)
# add mock node to simulate a cluster
mock_node = ray_cluster.add_node(num_cpus=2)

### Data setup
For ray, the data must be a `ray.data.Dataset` (for example, one created with `ray.data.from_pandas`). It is recommended that you have as many partitions as you have workers. If you have more partitions than workers, make sure to set `num_threads=1` to avoid nested parallelism.

The required input format is the same as for `MLForecast`, i.e. it should have at least an id column, a time column and a target column.

In [None]:
series = generate_daily_series(
    100,
    n_static_features=2,
    static_as_categorical=False,
    equal_ends=True,
)
# we need noncategory unique_id
series['unique_id'] = series['unique_id'].astype(str)
ray_series = ray.data.from_pandas(series)

### Models
The ray integration lets you use `lightgbm` (`RayLGBMRegressor`) and `xgboost` (`RayXGBRegressor`).

In [None]:
from mlforecast.distributed.models.ray.lgb import RayLGBMForecast
from mlforecast.distributed.models.ray.xgb import RayXGBForecast

In [None]:
# one model per supported framework, both with default settings
models = [RayLGBMForecast(), RayXGBForecast()]

### Training

To control the number of partitions used with Ray, pass `num_partitions` to `DistributedMLForecast`.

In [None]:
# number of partitions passed to `DistributedMLForecast` below
num_partitions = 4

In [None]:
fcst = DistributedMLForecast(
    models,
    # Use num_partitions to reduce overhead
    num_partitions=num_partitions,
    freq='D',
    date_features=['dayofweek'],
    lags=[1],
    lag_transforms={1: [expanding_mean]},
)
# the static columns are declared explicitly when fitting
fcst.fit(ray_series, static_features=['static_0', 'static_1'])

In [None]:
#| hide
# the fitted object should hold `num_partitions` partition results
test_partition_results_size(fcst, num_partitions)

In [None]:
#| hide
# test num_partitions works properly
# In this case `num_partitions` is not set, so we test that
# the default behavior for ray datasets works as expected
fcst_np = DistributedMLForecast(
    models=models,
    num_threads=1,
    freq='D',
    date_features=['dayofweek', 'month'],
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)],
    },
)
fcst_np.fit(ray_series)
# we don't use test_partition_results_size
# since the number of objects is different
# from the number of partitions
test_eq(fa.count(fcst_np.partition_results), 100)  # number of series
preds_np = (
    fcst_np.predict(7)
    .to_pandas()
    .sort_values(['unique_id', 'ds'])
    .reset_index(drop=True)
)
preds = (
    fcst.predict(7)
    .to_pandas()
    .sort_values(['unique_id', 'ds'])
    .reset_index(drop=True)
)
# only the ids and dates are compared between the two fitted objects
pd.testing.assert_frame_equal(
    preds[['unique_id', 'ds']],
    preds_np[['unique_id', 'ds']],
)

### Forecasting

In [None]:
# forecast the next 14 timesteps and collect the result into pandas
preds = fcst.predict(14).to_pandas()

In [None]:
# first rows of the forecasts
preds.head()

Unnamed: 0,unique_id,ds,RayLGBMForecast,RayXGBForecast
0,id_00,2001-05-15,422.139843,419.180908
1,id_00,2001-05-16,497.180212,502.074249
2,id_00,2001-05-17,13.062478,16.981802
3,id_00,2001-05-18,100.601041,102.311279
4,id_00,2001-05-19,180.707848,181.406143


### Cross validation

In [None]:
# cross validation over 3 windows with h=14; the result is collected into pandas
cv_res = fcst.cross_validation(
    ray_series,
    n_windows=3,
    h=14,
).to_pandas()

In [None]:
# first rows of the cross validation results
cv_res.head()

Unnamed: 0,unique_id,ds,RayLGBMForecast,RayXGBForecast,cutoff,y
0,id_01,2001-05-01,124.758319,122.131401,2001-04-30,117.876479
1,id_01,2001-05-02,145.041,149.217972,2001-04-30,153.394375
2,id_01,2001-05-03,178.838681,178.600784,2001-04-30,175.337772
3,id_01,2001-05-04,27.212783,10.926006,2001-04-30,13.202898
4,id_01,2001-05-05,56.624979,38.081158,2001-04-30,30.20309


In [None]:
# Disconnect the driver first, then tear down the test cluster (head node and
# mock node) created with `Cluster(...)`, so that no ray processes are left
# running after the notebook finishes.
ray.shutdown()
ray_cluster.shutdown()