In [None]:
#| hide
%load_ext autoreload
%autoreload 2

# Exogenous features
> Use exogenous regressors for training and predicting

In [None]:
import lightgbm as lgb
import pandas as pd
from mlforecast import MLForecast
from mlforecast.utils import generate_daily_series, generate_prices_for_series
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean

## Data setup

In [None]:
# Generate 100 daily series ending on the same date, each with two static columns.
series = generate_daily_series(100, n_static_features=2, equal_ends=True)
# Give the second static column a more meaningful name for this example.
series = series.rename(columns={'static_1': 'product_id'})
series.head()

Unnamed: 0,unique_id,ds,y,static_0,product_id
0,id_00,2000-10-05,39.811983,79,45
1,id_00,2000-10-06,103.274013,79,45
2,id_00,2000-10-07,176.574744,79,45
3,id_00,2000-10-08,258.9879,79,45
4,id_00,2000-10-09,344.940404,79,45


In mlforecast the required columns are the series identifier, time and target. Any extra columns you have, like `static_0` and `product_id` here, are considered to be static and are replicated when constructing the features for the next timestamp. You can change this by passing `static_features` to `MLForecast.preprocess` or `MLForecast.fit`, in which case only the columns you list there are treated as static. Keep in mind that all features in your input dataframe will be used for training, so you'll have to provide the future values of exogenous features to `MLForecast.predict` through the `X_df` argument.

Consider the following example. Suppose that we have a prices catalog for each id and date.

In [None]:
# Price catalog keyed by series id and date (columns `ds`, `unique_id`, `price`).
# The same catalog is later passed to `predict` as `X_df` to supply future prices.
prices_catalog = generate_prices_for_series(series)
prices_catalog.head()

Unnamed: 0,ds,unique_id,price
0,2000-10-05,id_00,0.548814
1,2000-10-06,id_00,0.715189
2,2000-10-07,id_00,0.602763
3,2000-10-08,id_00,0.544883
4,2000-10-09,id_00,0.423655


Suppose also that you have already merged these prices into your series dataframe.

In [None]:
# Attach the price of each (unique_id, ds) pair to the series.
# The join keys are spelled out so that any other column the two frames might
# share in the future cannot silently become part of the join condition.
series_with_prices = series.merge(
    prices_catalog, on=['unique_id', 'ds'], how='left'
)
series_with_prices.head()

Unnamed: 0,unique_id,ds,y,static_0,product_id,price
0,id_00,2000-10-05,39.811983,79,45,0.548814
1,id_00,2000-10-06,103.274013,79,45,0.715189
2,id_00,2000-10-07,176.574744,79,45,0.602763
3,id_00,2000-10-08,258.9879,79,45,0.544883
4,id_00,2000-10-09,344.940404,79,45,0.423655


This dataframe will be passed to `MLForecast.fit` (or `MLForecast.preprocess`). However, since the price is dynamic, we have to tell that method that only `static_0` and `product_id` are static.

In [None]:
# Forecaster with a single LightGBM model trained on lag, lag-transform and date features.
fcst = MLForecast(
    models=lgb.LGBMRegressor(n_jobs=1, random_state=0, verbosity=-1),
    freq='D',  # daily frequency
    lags=[7],  # yields the `lag7` feature
    lag_transforms={
        1: [expanding_mean],  # yields `expanding_mean_lag1`
        7: [(rolling_mean, 14)]  # yields `rolling_mean_lag7_window_size14`
    },
    date_features=['dayofweek', 'month'],
    num_threads=2,
)
# `price` is deliberately left out of `static_features`: only the listed columns
# are treated as static, so `price` is handled as a dynamic (exogenous) feature.
fcst.fit(series_with_prices, static_features=['static_0', 'product_id'])

MLForecast(models=[LGBMRegressor], freq=<Day>, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=2)

The features used for training are stored in `MLForecast.ts.features_order_`. As you can see, `price` was used for training.

In [None]:
# Features the model was trained on; the exogenous `price` column should be listed.
fcst.ts.features_order_

['static_0',
 'product_id',
 'price',
 'lag7',
 'expanding_mean_lag1',
 'rolling_mean_lag7_window_size14',
 'dayofweek',
 'month']

So, to update the price at each timestep, we just call `MLForecast.predict` with our forecast horizon and pass the prices catalog through `X_df`.

In [None]:
# `X_df` supplies the future values of the exogenous `price` feature
# for the 7 steps of the forecast horizon.
preds = fcst.predict(h=7, X_df=prices_catalog)
preds.head()

Unnamed: 0,unique_id,ds,LGBMRegressor
0,id_00,2001-05-15,418.930093
1,id_00,2001-05-16,499.487368
2,id_00,2001-05-17,20.321885
3,id_00,2001-05-18,102.310778
4,id_00,2001-05-19,185.340281


In [None]:
#| hide
import numpy as np

from mlforecast.callbacks import SaveFeatures

In [None]:
#| hide
# check that the price was passed correctly
first_pred_date = series_with_prices['ds'].max() + pd.offsets.Day()
save_feats = SaveFeatures()
fcst.predict(7, X_df=prices_catalog, before_predict_callback=save_feats)
# guard against a vacuous pass: if the callback recorded nothing, the loop below
# would run zero iterations and the check would succeed without testing anything
assert len(save_feats._inputs) > 0, 'before_predict_callback did not record any inputs'
for h, actual in enumerate(save_feats._inputs):
    # the h-th recorded input must carry the catalog prices of the h-th forecast date
    expected = prices_catalog.loc[prices_catalog['ds'].eq(first_pred_date + h * pd.offsets.Day())]
    np.testing.assert_allclose(
        actual['price'].values,
        expected['price'].values,
    )

In [None]:
#|hide
# repeating the same predict call must give identical forecasts
preds2 = fcst.predict(7, X_df=prices_catalog)
# passing the training frame explicitly as `new_df` must not change them either
preds3 = fcst.predict(7, new_df=series_with_prices, X_df=prices_catalog)

pd.testing.assert_frame_equal(preds, preds2)
pd.testing.assert_frame_equal(preds, preds3)

In [None]:
#| hide
# smoke test: cross validation works with exogenous variables when given only
# the training frame (which already contains `price`), without any extra input
fcst.cross_validation(
    series_with_prices,
    h=7,
    n_windows=2,
    static_features=['static_0', 'product_id'],
);