In [None]:
#|default_exp distributed.forecast

In [None]:
#|hide
%load_ext autoreload
%autoreload 2

In [None]:
#|hide
from nbdev import show_doc
from sklearn import set_config

# Distributed forecast

> Distributed pipeline encapsulation

**This interface is only tested on Linux**

In [None]:
#|export
import warnings
from typing import Callable, Dict, List, Optional, Tuple

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask.distributed import Client, default_client
from sklearn.base import clone

from mlforecast import Forecast, TimeSeries
from mlforecast.distributed.core import DistributedTimeSeries
from mlforecast.utils import backtest_splits

In [None]:
#|hide
set_config(display='text')
warnings.filterwarnings('ignore')

In [None]:
#|export
class DistributedForecast:
    """Distributed pipeline encapsulation.

    Holds clones of the provided models together with a `DistributedTimeSeries`
    that computes the features on each partition, and exposes the preprocessing,
    training, forecasting and cross validation steps on dask dataframes."""

    def __init__(
        self,
        models,  # model or list of mlforecast.distributed.models
        freq: Optional[str] = None,  # pandas offset alias, e.g. D, W, M. Don't set if you're using integer times.
        lags: Optional[List[int]] = None,  # list of lags to use as features (no lags if not provided)
        lag_transforms: Optional[Dict[int, List[Tuple]]] = None,  # list of transformations to apply to each lag (none if not provided)
        date_features: Optional[List[str]] = None,  # list of names of pandas date attributes to use as features, e.g. dayofweek (none if not provided)
        num_threads: int = 1,  # number of threads to use when computing lag features
        client: Optional[Client] = None  # dask client to use for computations
    ):
        if not isinstance(models, list):
            models = [models]
        # a single clone is enough to avoid modifying the estimators provided by the caller
        self.models = [clone(m) for m in models]
        # fall back to the already running client when none is given
        self.client = client or default_client()
        # fresh containers per instance, mutable defaults would be shared between instances
        lags = [] if lags is None else lags
        lag_transforms = {} if lag_transforms is None else lag_transforms
        date_features = [] if date_features is None else date_features
        self.dts = DistributedTimeSeries(
            TimeSeries(freq, lags, lag_transforms, date_features, num_threads),
            self.client,
        )

    def __repr__(self) -> str:
        base_ts = self.dts._base_ts
        model_names = ", ".join(m.__class__.__name__ for m in self.models)
        return (
            f'DistributedForecast(models=[{model_names}], '
            f'freq={self.freq}, '
            f'lag_features={list(base_ts.transforms.keys())}, '
            f'date_features={base_ts.date_features}, '
            f'num_threads={base_ts.num_threads}, '
            f'client={self.client})'
        )

    @property
    def freq(self):
        """Frequency stored in the underlying `TimeSeries`."""
        return self.dts._base_ts.freq

    def preprocess(
        self,
        data: dd.DataFrame,  # time series
        id_col: str = 'index',  # column that identifies each series, it's recommended to have this as the index.
        time_col: str = 'ds',  # column with the timestamps
        target_col: str = 'y',  # column with the series values
        static_features: Optional[List[str]] = None,  # column names of the features that don't change in time
        dropna: bool = True,  # drop rows with missing values created by lags
        keep_last_n: Optional[int] = None,  # keep only this many observations of each series for computing the updates
    ) -> dd.DataFrame:
        """Computes the transformations on each partition of `data` and
        saves the required information for the forecasting step.
        Returns a dask dataframe with the computed features."""
        if id_col in data:
            # the id is a regular column, so it's moved to the index before computing the features
            warnings.warn('It is recommended to have id_col as the index, since setting the index is a slow operation.')
            data = data.set_index(id_col)
            id_col = 'index'
        return self.dts.fit_transform(data, id_col, time_col, target_col, static_features, dropna, keep_last_n)

    def fit(
        self,
        data: dd.DataFrame,  # time series
        id_col: str = 'index',  # column that identifies each series, it's recommended to have this as the index.
        time_col: str = 'ds',  # column with the timestamps
        target_col: str = 'y',  # column with the series values
        static_features: Optional[List[str]] = None,  # column names of the features that don't change in time
        dropna: bool = True,  # drop rows with missing values created by lags
        keep_last_n: Optional[int] = None,  # keep only this many observations of each series for computing the updates
    ) -> 'DistributedForecast':
        """Perform the preprocessing and fit the model."""
        train_ddf = self.preprocess(data, id_col, time_col, target_col, static_features, dropna, keep_last_n)
        X, y = train_ddf.drop(columns=[time_col, target_col]), train_ddf[target_col]
        self.models_ = []
        for model in self.models:
            # train a fresh copy so that `self.models` stays unfitted and fit can be called repeatedly
            model = clone(model)
            model.client = self.client
            self.models_.append(model.fit(X, y))
        return self

    def predict(
        self,
        horizon: int,  # number of timesteps to forecast
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,  # future values for dynamic features
        predict_fn: Optional[Callable] = None,  # custom function to compute predictions
        **predict_fn_kwargs,  # additional arguments passed to predict_fn
    ) -> dd.DataFrame:
        # only the trained models (`model_`) are handed over to the distributed time series
        return self.dts.predict(
            [m.model_ for m in self.models_], horizon, dynamic_dfs, predict_fn, **predict_fn_kwargs
        )

    # the docstring is shared with the local interface
    predict.__doc__ = Forecast.predict.__doc__

    def cross_validation(
        self,
        data: dd.DataFrame,  # time series
        n_windows: int,  # number of windows to evaluate
        window_size: int,  # test size in each window
        id_col: str = 'index',  # column that identifies each series, can also be the index.
        time_col: str = 'ds',  # column with the timestamps
        target_col: str = 'y',  # column with the series values
        static_features: Optional[List[str]] = None,  # column names of the features that don't change in time
        dropna: bool = True,  # drop rows with missing values created by lags
        keep_last_n: Optional[int] = None,  # keep only this many observations of each series for computing the updates
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,  # future values for dynamic features
        predict_fn: Optional[Callable] = None,  # custom function to compute predictions
        **predict_fn_kwargs,  # additional arguments passed to predict_fn
    ) -> dd.DataFrame:
        """Creates `n_windows` splits of `window_size` from `data`, trains the model
        on the training set, predicts the window and merges the actuals and the predictions
        in a dataframe.

        Returns a dataframe containing the datestamps, actual values, train ends and predictions."""
        results = []
        self.cv_models_ = []
        if id_col != 'index':
            data = data.set_index(id_col)

        def renames(df):
            # standard names are used internally and reverted before returning the results
            df = df.rename(columns={time_col: 'ds', target_col: 'y'}, copy=False)
            # assign a renamed index instead of setting `df.index.name`, because the
            # index object can still be shared with the input partition
            df.index = df.index.rename('unique_id')
            return df

        def merge_fn(res, pred):
            return res.merge(pred, on=['unique_id', 'ds'], how='left')

        data = data.map_partitions(renames)

        # integer timestamps advance one unit at a time, otherwise the configured frequency is used
        if np.issubdtype(data['ds'].dtype.type, np.integer):
            freq = 1
        else:
            freq = self.freq
        for train_end, train, valid in backtest_splits(data, n_windows, window_size, freq):
            self.fit(train, 'index', 'ds', 'y', static_features, dropna, keep_last_n)
            # fit builds a new `models_` list on every call, so each window keeps its own models
            self.cv_models_.append(self.models_)
            y_pred = self.predict(
                window_size, dynamic_dfs, predict_fn, **predict_fn_kwargs
            )
            result = valid[['ds', 'y']].copy()
            result['cutoff'] = train_end

            meta = {**result.dtypes.to_dict(), **y_pred.dtypes.to_dict()}
            result = result.map_partitions(merge_fn, y_pred, align_dataframes=False, meta=meta)
            if id_col != 'index':
                result = result.reset_index()
            result = result.rename(columns={'ds': time_col, 'y': target_col, 'unique_id': id_col})
            results.append(result)

        return dd.concat(results)

The `DistributedForecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing predictions) and applies them in a distributed way.

## Example
This shows an example with simulated data, for a real world example in a remote cluster you can check the [M5 distributed example](https://www.kaggle.com/lemuz90/m5-mlforecast-distributed).

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

from mlforecast.utils import backtest_splits, generate_daily_series, generate_prices_for_series
from mlforecast.distributed.models.lgb import LGBMForecast
from mlforecast.distributed.models.xgb import XGBForecast

Using `DistributedForecast` differs from using `Forecast` in the following ways:

1. You need to set up a `dask.distributed.Client`. If this client is connected to a remote cluster then the process will run there.
2. Your data needs to be a `dask.dataframe.DataFrame`.
3. You need to use a model that implements distributed training (either XGBForecast or LGBMForecast).

### Client setup

Here we define a client that connects to a `dask.distributed.LocalCluster`, however it could be any other kind of cluster.

In [None]:
client = Client(n_workers=2, threads_per_worker=1)

### Data setup

The data is given as a `dask.dataframe.DataFrame`. You need to make sure that each time series is contained in a single partition, and it is recommended that you have as many partitions as you have workers. If you have more partitions than workers, make sure to set `num_threads=1` to avoid nested parallelism.

The required input format is the same as for `Forecast`, except that it's a `dask.dataframe.DataFrame` instead of a `pandas.DataFrame`.

In [None]:
series = generate_daily_series(100, n_static_features=2, equal_ends=True, static_as_categorical=False)
partitioned_series = dd.from_pandas(series, npartitions=10)
partitioned_series

Unnamed: 0_level_0,ds,y,static_0,static_1
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id_00,datetime64[ns],float64,int64,int64
id_11,...,...,...,...
...,...,...,...,...
id_90,...,...,...,...
id_99,...,...,...,...


### Models
In order to perform distributed forecasting, we need to use a model that is able to train in a distributed way using `dask`. The current implementations are in `LGBMForecast` and `XGBForecast` which are just wrappers around `lightgbm.dask.DaskLGBMRegressor` and `xgboost.dask.DaskXGBRegressor` that add a `model_` property to get the trained model from them and send it to every worker to perform the predictions step.

In [None]:
models = [XGBForecast(random_state=0), LGBMForecast(random_state=0)]

### Training
Once we have our models we instantiate a `DistributedForecast` object defining our features.

In [None]:
fcst = DistributedForecast(
    models=models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,
)
fcst

DistributedForecast(models=[XGBForecast, LGBMForecast], freq=<Day>, lag_features=['lag-7', 'expanding_mean_lag-1', 'rolling_mean_lag-7_window_size-14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:39927' processes=2 threads=2, memory=15.50 GiB>)

Here we say that:

* Our series have daily frequency.
* We want to use lag 7 as a feature
* We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 14
* We want to use dayofweek and month as date features.
* We want to perform the preprocessing and the forecasting steps using 1 thread, because we have 10 partitions and 2 workers.

From this point we have two options:

1. Compute the features and fit our models.
2. Compute the features and get them back as a dataframe to do some custom splitting or adding additional features, then training the models.

#### 1. Using all the data

In [None]:
show_doc(DistributedForecast.fit)

---

### DistributedForecast.fit

>      DistributedForecast.fit (data:dask.dataframe.core.DataFrame,
>                               id_col:str='index', time_col:str='ds',
>                               target_col:str='y',
>                               static_features:Optional[List[str]]=None,
>                               dropna:bool=True,
>                               keep_last_n:Optional[int]=None)

Perform the preprocessing and fit the model.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | DataFrame |  |  |
| id_col | str | index | column that identifies each serie, it's recommended to have this as the index. |
| time_col | str | ds | column with the timestamps |
| target_col | str | y | column with the series values |
| static_features | typing.Optional[typing.List[str]] | None |  |
| dropna | bool | True |  |
| keep_last_n | typing.Optional[int] | None |  |
| **Returns** | **DistributedForecast** |  |  |

Calling `fit` on our data computes the features independently for each partition and performs distributed training.

In [None]:
fcst.fit(partitioned_series)

[20:14:15] task [xgboost.dask]:tcp://127.0.0.1:41527 got new rank 0
[20:14:15] task [xgboost.dask]:tcp://127.0.0.1:39163 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 37517...
[LightGBM] [Info] Binding port 37517 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 35081...
[LightGBM] [Info] Binding port 35081 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Local rank: 1, total number of machines: 2


DistributedForecast(models=[XGBForecast, LGBMForecast], freq=<Day>, lag_features=['lag-7', 'expanding_mean_lag-1', 'rolling_mean_lag-7_window_size-14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:39927' processes=2 threads=2, memory=15.50 GiB>)

### Forecasting

In [None]:
show_doc(DistributedForecast.predict)

---

### DistributedForecast.predict

>      DistributedForecast.predict (horizon:int,
>                                   dynamic_dfs:Optional[List[pandas.core.frame.
>                                   DataFrame]]=None,
>                                   predict_fn:Optional[Callable]=None,
>                                   **predict_fn_kwargs)

Compute the predictions for the next `horizon` steps.

`predict_fn(model, new_x, dynamic_dfs, features_order, **predict_fn_kwargs)` is called in every timestep, where:
`model` is the trained model.
`new_x` is a dataframe with the same format as the input plus the computed features.
`dynamic_dfs` is a list containing the dynamic dataframes.
`features_order` is the list of column names that were used in the training step.

Once we have our fitted models we can compute the predictions for the next 7 timesteps.

In [None]:
preds = fcst.predict(7)
preds

Unnamed: 0_level_0,ds,XGBRegressor,LGBMRegressor
npartitions=10,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id_00,datetime64[ns],float32,float64
id_11,...,...,...
...,...,...,...
id_90,...,...,...
id_99,...,...,...


In [None]:
#|hide
preds = preds.compute()
preds2 = fcst.predict(7).compute()
pd.testing.assert_frame_equal(preds, preds2)

In [None]:
#|hide
# Check that integer timestamps and non-standard id/time/target column names
# produce the same predictions as the standard format.
non_std_series = partitioned_series.copy()
non_std_series['ds'] = non_std_series.map_partitions(lambda part: part.groupby('unique_id').cumcount())
non_std_series = non_std_series.reset_index().rename(columns={'ds': 'time', 'y': 'value', 'unique_id': 'some_id'})
flow_params = dict(
    models=[XGBForecast(random_state=0)],
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    num_threads=1,
)
fcst = DistributedForecast(freq='D', **flow_params)
preds = fcst.fit(partitioned_series).predict(7).compute()
fcst2 = DistributedForecast(**flow_params)
fcst2.preprocess(non_std_series, id_col='some_id', time_col='time', target_col='value')
fcst2.models_ = fcst.models_  # distributed training can end up with different fits
non_std_preds = fcst2.predict(7).compute()
# align the index name with the standard predictions before comparing
non_std_preds.index.name = 'unique_id'
pd.testing.assert_frame_equal(preds.drop(columns='ds'), non_std_preds.drop(columns='time'))

[20:14:19] task [xgboost.dask]:tcp://127.0.0.1:41527 got new rank 0
[20:14:19] task [xgboost.dask]:tcp://127.0.0.1:39163 got new rank 1


#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `preprocess` with our data.

In [None]:
show_doc(DistributedForecast.preprocess)

---

### DistributedForecast.preprocess

>      DistributedForecast.preprocess (data:dask.dataframe.core.DataFrame,
>                                      id_col:str='index', time_col:str='ds',
>                                      target_col:str='y',
>                                      static_features:Optional[List[str]]=None,
>                                      dropna:bool=True,
>                                      keep_last_n:Optional[int]=None)

Computes the transformations on each partition of `data` and
saves the required information for the forecasting step.
Returns a dask dataframe with the computed features.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | DataFrame |  |  |
| id_col | str | index | column that identifies each serie, it's recommended to have this as the index. |
| time_col | str | ds | column with the timestamps |
| target_col | str | y | column with the series values |
| static_features | typing.Optional[typing.List[str]] | None |  |
| dropna | bool | True |  |
| keep_last_n | typing.Optional[int] | None |  |
| **Returns** | **DataFrame** |  |  |

In [None]:
features_ddf = fcst.preprocess(partitioned_series)
features_ddf.head()

Unnamed: 0_level_0,ds,y,static_0,static_1,lag-7,expanding_mean_lag-1,rolling_mean_lag-7_window_size-14
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id_00,2000-10-25,497.668437,79,45,506.946385,250.013666,263.200596
id_00,2000-10-26,39.183469,79,45,38.8778,261.80675,263.133868
id_00,2000-10-27,94.377779,79,45,115.127739,251.68751,263.980563
id_00,2000-10-28,179.235741,79,45,180.384975,244.847957,264.252723
id_00,2000-10-29,267.546447,79,45,242.228588,242.114114,263.055629


This is useful if we want to inspect the data the model will be trained on, add additional features or perform a custom train-valid split. Here we perform an 80-20 split.

In [None]:
rng = np.random.RandomState(0)

def mask_as_series(df):
    """Draw one uniform number per row of `df` from `rng` and flag the rows whose draw is below 0.8.

    Returns a boolean `pd.Series` that shares the index of `df`."""
    n_rows = len(df)
    draws = rng.rand(n_rows)
    return pd.Series(draws < 0.8, index=df.index)

# one boolean flag per row, computed partition by partition
train_mask = features_ddf.map_partitions(mask_as_series)
# flagged rows go to the training set, the remaining ones to the validation set
train, valid = features_ddf[train_mask], features_ddf[~train_mask]
# the timestamps and the target are not used as features
X_train, y_train = train.drop(columns=['ds', 'y']), train.y
X_valid, y_valid = valid.drop(columns=['ds', 'y']), valid.y

If we do this we must "manually" train our models and assign them to the `models_` attribute.

In [None]:
fitted = models[0].fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='rmse',
    verbose=False,
)
fcst.models_ = [fitted]

[20:14:21] task [xgboost.dask]:tcp://127.0.0.1:39163 got new rank 0
[20:14:21] task [xgboost.dask]:tcp://127.0.0.1:41527 got new rank 1


We can see the RMSE by iteration for each set.

In [None]:
pd.DataFrame({
    k: np.round(fcst.models_[0].evals_result_[k]['rmse'], 2)
    for k in ('validation_0', 'validation_1')
})

Unnamed: 0,validation_0,validation_1
0,159.12,158.98
1,111.81,111.59
2,78.76,78.50
3,55.74,55.44
4,39.80,39.52
...,...,...
95,7.76,9.64
96,7.74,9.64
97,7.73,9.64
98,7.70,9.63


#### Dynamic features

By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features like prices or a calendar with holidays you can pass them as a list to the `dynamic_dfs` argument of `DistributedForecast.predict`, which will call `pd.DataFrame.merge` on each of them in order.

Here's an example:

Suppose that we have a `product_id` column and we have a catalog for prices based on that `product_id` and the date.

In [None]:
dynamic_series = series.rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(dynamic_series)
prices_catalog

Unnamed: 0,ds,product_id,price
0,2000-06-09,1,0.548814
1,2000-06-10,1,0.715189
2,2000-06-11,1,0.602763
3,2000-06-12,1,0.544883
4,2000-06-13,1,0.423655
...,...,...,...
20180,2001-05-17,99,0.223520
20181,2001-05-18,99,0.446104
20182,2001-05-19,99,0.044783
20183,2001-05-20,99,0.483216


And that we have already merged these prices into our series dataframe.

In [None]:
dynamic_series = partitioned_series.rename(columns={'static_1': 'product_id'})
dynamic_series = dynamic_series.reset_index()
series_with_prices = dynamic_series.merge(prices_catalog, how='left')
series_with_prices = series_with_prices.set_index('unique_id', sorted=True)
series_with_prices.head()

Unnamed: 0_level_0,ds,y,static_0,product_id,price
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id_00,2000-10-05,39.811983,79,45,0.570826
id_00,2000-10-06,103.274013,79,45,0.260562
id_00,2000-10-07,176.574744,79,45,0.274048
id_00,2000-10-08,258.9879,79,45,0.433878
id_00,2000-10-09,344.940404,79,45,0.653738


This dataframe will be passed to `DistributedForecast.fit` (or `DistributedForecast.preprocess`), however since the price is dynamic we have to tell that method that only `static_0` and `product_id` are static and we'll have to update `price` in every timestep, which basically involves merging the updated features with the prices catalog.

In [None]:
fcst = DistributedForecast(
    models,
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=1,
)
# price is left out of static_features so that it gets updated in every timestep
fcst.fit(series_with_prices, static_features=['static_0', 'product_id'])

[20:14:22] task [xgboost.dask]:tcp://127.0.0.1:41527 got new rank 0
[20:14:22] task [xgboost.dask]:tcp://127.0.0.1:39163 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 59265...
[LightGBM] [Info] Binding port 59265 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 48485...
[LightGBM] [Info] Binding port 48485 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 1, total number of machines: 2


DistributedForecast(models=[XGBForecast, LGBMForecast], freq=<Day>, lag_features=['lag-7', 'expanding_mean_lag-1', 'rolling_mean_lag-7_window_size-14'], date_features=['dayofweek', 'month'], num_threads=1, client=<Client: 'tcp://127.0.0.1:39927' processes=2 threads=2, memory=15.50 GiB>)

So in order to update the price in each timestep we just call `DistributedForecast.predict` with our forecast horizon and pass the prices catalog as a dynamic dataframe.

In [None]:
preds = fcst.predict(7, dynamic_dfs=[prices_catalog])
preds.compute()

Unnamed: 0_level_0,ds,XGBRegressor,LGBMRegressor
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id_00,2001-05-15,417.990967,431.913959
id_00,2001-05-16,501.354095,502.759544
id_00,2001-05-17,23.075724,19.760072
id_00,2001-05-18,102.658348,101.563043
id_00,2001-05-19,178.322525,187.509875
...,...,...,...
id_99,2001-05-17,442.907318,442.109678
id_99,2001-05-18,18.299212,20.339088
id_99,2001-05-19,90.773727,91.373232
id_99,2001-05-20,154.936508,152.878285


#### Custom predictions

If you want to do something like scaling the predictions you can define a function and pass it to `DistributedForecast.predict` as described in <a href="/mlforecast/forecast.html#Custom-predictions">Custom predictions</a>.

### Cross validation
Refer to `Forecast.cross_validation`.

In [None]:
show_doc(DistributedForecast.cross_validation)

---

### DistributedForecast.cross_validation

>      DistributedForecast.cross_validation (data:dask.dataframe.core.DataFrame,
>                                            n_windows:int, window_size:int,
>                                            id_col:str='index',
>                                            time_col:str='ds',
>                                            target_col:str='y', static_features
>                                            :Optional[List[str]]=None,
>                                            dropna:bool=True,
>                                            keep_last_n:Optional[int]=None, dyn
>                                            amic_dfs:Optional[List[pandas.core.
>                                            frame.DataFrame]]=None,
>                                            predict_fn:Optional[Callable]=None,
>                                            **predict_fn_kwargs)

Creates `n_windows` splits of `window_size` from `data`, trains the model
on the training set, predicts the window and merges the actuals and the predictions
in a dataframe.

Returns a dataframe containing the datestamps, actual values, train ends and predictions.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| data | DataFrame |  | time series |
| n_windows | int |  | number of windows to evaluate |
| window_size | int |  | test size in each window |
| id_col | str | index | column that identifies each serie, can also be the index. |
| time_col | str | ds | column with the timestamps |
| target_col | str | y | column with the series values |
| static_features | typing.Optional[typing.List[str]] | None | column names of the features that don't change in time |
| dropna | bool | True | drop rows with missing values created by lags |
| keep_last_n | typing.Optional[int] | None | keep only this many observations of each serie for computing the updates |
| dynamic_dfs | typing.Optional[typing.List[pandas.core.frame.DataFrame]] | None | future values for dynamic features |
| predict_fn | typing.Optional[typing.Callable] | None | custom function to compute predictions |
| predict_fn_kwargs |  |  |  |

In [None]:
n_windows = 2
window_size = 14

backtest_results = fcst.cross_validation(partitioned_series, n_windows, window_size)
backtest_results

[20:14:24] task [xgboost.dask]:tcp://127.0.0.1:39163 got new rank 0
[20:14:24] task [xgboost.dask]:tcp://127.0.0.1:41527 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 57797...
[LightGBM] [Info] Binding port 57797 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 48891...
[LightGBM] [Info] Binding port 48891 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 1, total number of machines: 2


[20:14:26] task [xgboost.dask]:tcp://127.0.0.1:39163 got new rank 0
[20:14:26] task [xgboost.dask]:tcp://127.0.0.1:41527 got new rank 1


Finding random open ports for workers
[LightGBM] [Info] Trying to bind port 53565...
[LightGBM] [Info] Binding port 53565 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Trying to bind port 36255...
[LightGBM] [Info] Binding port 36255 succeeded
[LightGBM] [Info] Listening...
[LightGBM] [Info] Connected to rank 1
[LightGBM] [Info] Local rank: 0, total number of machines: 2
[LightGBM] [Info] Connected to rank 0
[LightGBM] [Info] Local rank: 1, total number of machines: 2


Unnamed: 0_level_0,ds,y,cutoff,XGBRegressor,LGBMRegressor
npartitions=20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],float64,datetime64[ns],float32,float64
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
agg_results = backtest_results.compute().groupby('ds').mean()
agg_results.head()

Unnamed: 0_level_0,y,XGBRegressor,LGBMRegressor
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-04-17,161.232312,161.527481,162.094722
2001-04-18,152.139197,151.559235,151.311011
2001-04-19,169.856989,171.319366,171.463442
2001-04-20,180.683402,180.790359,180.2096
2001-04-21,182.00609,181.458237,181.380985


We can also compute the error for each model.

In [None]:
def mse_from_dask_dataframe(ddf):
    """Compute the mean squared error of each model column in `ddf` against the `y` column.

    Every column other than `ds`, `y` and `cutoff` is treated as the predictions of a model.
    The values are computed and gathered through the module level `client` and returned
    as a dict keyed by model name."""
    model_names = ddf.columns.drop(['ds', 'y', 'cutoff'])
    lazy_mses = {
        name: (ddf['y'] - ddf[name]).pow(2).mean()
        for name in model_names
    }
    futures = client.compute(lazy_mses)
    return client.gather(futures)

{k: round(v, 2) for k, v in mse_from_dask_dataframe(backtest_results).items()}

{'XGBRegressor': 85.71, 'LGBMRegressor': 91.6}

In [None]:
#|hide
# Check that cross validation with non-standard column names matches a manual
# backtest that reuses the models trained in each window.
fcst = DistributedForecast(XGBForecast(random_state=0), lags=[7, 14])
backtest_results = fcst.cross_validation(
    non_std_series,
    n_windows,
    window_size,
    id_col='some_id',
    time_col='time',
    target_col='value',
    static_features=['static_0', 'static_1'],    
).compute()
# bring both the results and the input back to the standard names
renamer = {'some_id': 'unique_id', 'time': 'ds', 'value': 'y'}
backtest_results = backtest_results.rename(columns=renamer).set_index('unique_id')
renamed = non_std_series.rename(columns=renamer).set_index('unique_id')
cv_models = fcst.cv_models_
manual_results = []
# the time column holds integers here, hence the frequency of 1
for i, (cutoff, train, valid) in enumerate(backtest_splits(renamed, n_windows, window_size, 1)):
    # only the preprocessing is repeated, the models come from the cross validation run
    fcst.preprocess(train)
    fcst.models_ = cv_models[i]
    pred = fcst.predict(window_size).compute()
    res = valid[['ds', 'y']].compute()
    res['cutoff'] = cutoff
    res = res.merge(pred, on=['unique_id', 'ds'], how='left')
    manual_results.append(res)
manual_results = pd.concat(manual_results)
pd.testing.assert_frame_equal(backtest_results, manual_results)

[20:14:29] task [xgboost.dask]:tcp://127.0.0.1:39163 got new rank 0
[20:14:29] task [xgboost.dask]:tcp://127.0.0.1:41527 got new rank 1
[20:14:30] task [xgboost.dask]:tcp://127.0.0.1:39163 got new rank 0
[20:14:30] task [xgboost.dask]:tcp://127.0.0.1:41527 got new rank 1


In [None]:
client.close()