In [None]:
#default_exp forecast

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Forecast

> Full pipeline encapsulation

In [None]:
#export
from typing import Callable, Generator, List, Optional

import pandas as pd

from mlforecast.core import TimeSeries
from mlforecast.utils import backtest_splits


In [None]:
#hide
import warnings
warnings.simplefilter('ignore', UserWarning)

from nbdev import show_doc

In [None]:
#export
class Forecast:
    """Full pipeline encapsulation.

    Takes a model (scikit-learn compatible regressor) and TimeSeries
    and runs all the forecasting pipeline.

    `model` is trained through its `fit(X, y, **kwargs)` method and handed over
    as-is to `ts.predict` when forecasting. `ts` is in charge of computing the
    features (`fit_transform`) and of producing the predictions (`predict`)."""

    def __init__(self, model, ts: TimeSeries):
        self.model = model
        self.ts = ts

    def __repr__(self):
        return f'Forecast(model={self.model}, ts={self.ts})'

    def preprocess(
        self,
        data: pd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> pd.DataFrame:
        """Computes the features for `data` without training the model.

        `static_features` are the columns of `data` that should be considered static.
        All the arguments are forwarded unchanged to `TimeSeries.fit_transform`
        and its result is returned. `Forecast.fit` expects that result to contain
        the `ds` and `y` columns."""
        return self.ts.fit_transform(data, static_features, dropna, keep_last_n)

    def fit(
        self,
        data: pd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        **fit_kwargs,
    ) -> 'Forecast':
        """Preprocesses `data` and fits `model` to it.

        The features are all the columns returned by `Forecast.preprocess` except
        `ds` and `y`, and the target is the `y` column.
        `fit_kwargs` are passed to `model.fit`.

        Returns the `Forecast` object itself, so that calls can be chained."""
        series_df = self.preprocess(data, static_features, dropna, keep_last_n)
        X, y = series_df.drop(columns=['ds', 'y']), series_df.y.values
        # Drop our reference to the intermediate dataframe before training.
        del series_df
        self.model.fit(X, y, **fit_kwargs)
        return self

    def predict(
        self,
        horizon: int,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        predict_fn: Optional[Callable] = None,
        **predict_fn_kwargs,
    ) -> pd.DataFrame:
        """Compute the predictions for the next `horizon` steps.

        `dynamic_dfs` is an optional list of dataframes with the dynamic features.
        It is forwarded, along with the rest of the arguments, to `TimeSeries.predict`.

        `predict_fn(model, new_x, dynamic_dfs, features_order, **predict_fn_kwargs)` is called in every timestep, where:
        `model` is the trained model.
        `new_x` is a dataframe with the same format as the input plus the computed features.
        `dynamic_dfs` is the list of dataframes with the dynamic features.
        `features_order` is the list of column names that were used in the training step.
        """
        return self.ts.predict(
            self.model, horizon, dynamic_dfs, predict_fn, **predict_fn_kwargs
        )

    def backtest(
        self,
        data: pd.DataFrame,
        n_windows: int,
        window_size: int,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        dynamic_dfs: Optional[List[pd.DataFrame]] = None,
        predict_fn: Optional[Callable] = None,
        **predict_fn_kwargs,
    ) -> Generator[pd.DataFrame, None, None]:
        """Creates `n_windows` splits of `window_size` from `data`, trains the model
        on the training set, predicts the window and merges the actuals and the predictions
        in a dataframe.

        `static_features`, `dropna` and `keep_last_n` are passed to `Forecast.fit` and
        `dynamic_dfs`, `predict_fn` and `predict_fn_kwargs` to `Forecast.predict`.

        Returns a generator to the dataframes containing the datestamps, actual values
        and predictions. Nothing is computed until the generator is iterated, and every
        window refits `model` and `ts` in place, so afterwards this object holds the
        state of the last window that was consumed."""
        for train, valid in backtest_splits(data, n_windows, window_size):
            # Refit on this window's training part (updates `self.model` and `self.ts`).
            self.fit(train, static_features, dropna, keep_last_n)
            y_pred = self.predict(
                window_size, dynamic_dfs, predict_fn, **predict_fn_kwargs
            )
            y_valid = valid[['ds', 'y']]
            # Left join keeps every validation row, even if it has no prediction.
            # `unique_id` is presumably the index of both frames here; `merge`
            # accepts index level names in `on`.
            result = y_valid.merge(y_pred, on=['unique_id', 'ds'], how='left')
            yield result


The `Forecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing the predictions). It tries to mimic the scikit-learn API.

## Example
This shows an example with simulated data; for a real-world example you can check the [M5 example](https://www.kaggle.com/lemuz90/m5-mlforecast).

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean, rolling_std

from mlforecast.utils import generate_daily_series, generate_prices_for_series

In order to forecast some time series you need a dataframe with `unique_id` as the index (which contains the identifier for each time series), a `ds` column with the datestamps and a `y` column with the series values.

### Data setup

In [None]:
series = generate_daily_series(100, equal_ends=True, n_static_features=2)
series

Whatever extra columns you have, like `static_0` and `static_1` here, are considered to be static and are replicated when constructing the features for the next datestamp. You can disable this by passing `static_features` to `Forecast.preprocess` or `Forecast.fit`, which will only keep the columns you define there as static. Keep in mind that the remaining (non-static) columns will still be used for training, so you have to provide their future values when predicting, either through the `dynamic_dfs` argument of `Forecast.predict` (see the dynamic features section below) or by defining a class that inherits from `TimeSeries` and overrides the `predict` method. The latter is shown in the [M5 example](https://www.kaggle.com/lemuz90/m5-mlforecast).

### Model

The model can be any scikit-learn compatible regressor.

In [None]:
model = lgb.LGBMRegressor()

### TimeSeries
The other component needed in `Forecast` is a `TimeSeries` object, which defines the features to be computed. 

In [None]:
ts = TimeSeries(
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=2,    
)
ts

Here we say that:
* Our series have daily frequency.
* We want to use lag 7 as a feature
* We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 14
* We want to use dayofweek and month as date features.
* We want to perform the preprocessing and the forecasting steps using 2 threads.

### Training

Once we have this setup we just instantiate a `Forecast` object with the model and the time series.

In [None]:
fcst = Forecast(model, ts)

From this point we have two options:

1. Preprocess the data and fit our model using all of it.
2. Preprocess the data and get it back as a dataframe to do some custom splitting or add additional features, and then train the model.

#### 1. Using all the data

In [None]:
show_doc(Forecast.fit)

Calling `.fit` on our data performs the preprocessing and uses all the data to train our model.

In [None]:
fcst.fit(series)

In [None]:
fcst.model.fitted_

#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `.preprocess` on our data.

In [None]:
show_doc(Forecast.preprocess)

In [None]:
features_df = fcst.preprocess(series)
features_df.head()

This is useful if we want to inspect the data the model will be trained on, add additional features or perform some custom train-valid split. Here we perform an 80-20 split.

In [None]:
# Fix the seed so that the random split below is reproducible.
np.random.seed(0)

# Each row goes to the training set with probability 0.8.
# Rows are assigned at random, not by date.
train_mask = np.random.rand(features_df.shape[0]) < 0.8
train, valid = features_df[train_mask], features_df[~train_mask]
# `ds` is not used as a feature and `y` is the target, so both are dropped from the inputs.
X_train, y_train = train.drop(columns=['ds', 'y']), train.y
X_valid, y_valid = valid.drop(columns=['ds', 'y']), valid.y

If we do this we must "manually" train our model calling `Forecast.model.fit`.

In [None]:
# Evaluate RMSE on both splits while training so the curves can be plotted afterwards.
# NOTE(review): passing `verbose` to `fit` was deprecated in LightGBM 3.3 and removed in 4.0
# (replaced by the `log_evaluation` callback) — confirm against the pinned LightGBM version.
fcst.model.fit(X_train, y_train, 
               eval_set=[(X_train, y_train), (X_valid, y_valid)],
               eval_metric='rmse',
               verbose=0)

In [None]:
# Draw one RMSE curve per entry recorded in the model's `evals_result_`.
for set_name, history in fcst.model.evals_result_.items():
    plt.plot(history['rmse'], label=set_name)
plt.legend();

### Forecasting

In [None]:
show_doc(Forecast.predict)

Once we have this fitted model, we can compute the forecasts for the next 7 timesteps.

In [None]:
fcst.predict(7)

This uses each prediction as the next value of the target and updates all features accordingly. The static features were propagated and the date features were computed using each new datestamp.

In [None]:
#hide
preds = fcst.predict(7)
preds2 = fcst.predict(7)

np.testing.assert_equal(preds['y_pred'].values, preds2['y_pred'].values)

#### Dynamic features

By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features like prices or a calendar with holidays you can pass them as a list to the `dynamic_dfs` argument of `Forecast.predict`, which will call `pd.DataFrame.merge` on each of them in order.

Here's an example:

Suppose that we have a `product_id` column and we have a catalog for prices based on that `product_id` and the date.

In [None]:
dynamic_series = series.rename(columns={'static_1': 'product_id'})
prices_catalog = generate_prices_for_series(dynamic_series)
prices_catalog

And you have already merged these prices into your series dataframe.

In [None]:
# Join the prices on the columns shared by both frames, then restore
# `unique_id` as the index.
series_with_prices = (
    dynamic_series
    .reset_index()
    .merge(prices_catalog, how='left')
    .set_index('unique_id')
)
series_with_prices

This dataframe will be passed to `Forecast.fit` (or `Forecast.preprocess`), however since the price is dynamic we have to tell that method that only `static_0` and `product_id` are static and we'll have to update `price` in every timestep, which basically involves merging the updated features with the prices catalog.

In [None]:
fcst = Forecast(model, ts)
fcst.fit(series_with_prices, static_features=['static_0', 'product_id'])

The features used for training are stored in `Forecast.ts.features_order_`, as you can see `price` was used for training.

In [None]:
fcst.ts.features_order_

So in order to update the price in each timestep we just call `Forecast.predict` with our forecast horizon and pass the prices catalog as a dynamic dataframe.

In [None]:
preds = fcst.predict(7, dynamic_dfs=[prices_catalog])
preds

#### Custom predictions
As you may have noticed `Forecast.predict` can take a `predict_fn` and `predict_fn_kwargs`. By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features you can pass them as a list to `Forecast.predict` in the `dynamic_dfs` argument. However, if you want to do something else, you can define your own function which will take:

* The trained model.
* The updated features (static + transformations + date features).
* A list of dataframes with the dynamic features.
* The order of the features the model was trained on.
* Additional keyword arguments passed to `Forecast.predict`.

Here's an example:

Suppose that we want to scale our predictions so that our series are updated with these scaled values. We can achieve that with the following.

In [None]:
def scaling_predict_fn(
    model,
    new_x,
    dynamic_dfs,
    features_order,
    scale_factor,
) -> np.ndarray:
    """Example `predict_fn` that multiplies the model's predictions by `scale_factor`.

    The `ds` column is removed from `new_x` before calling `model.predict`.
    `dynamic_dfs` and `features_order` are accepted because they are part of the
    `predict_fn` signature, but they are not used here."""
    features = new_x.drop(columns='ds')
    return scale_factor * model.predict(features)

And now we just pass this function to `Forecast.predict`.

In [None]:
fcst = Forecast(lgb.LGBMRegressor(n_estimators=10), TimeSeries(lags=[7]))
fcst.fit(series)

scale_factor = 1.1
preds = fcst.predict(2, predict_fn=scaling_predict_fn, scale_factor=scale_factor)
preds

In [None]:
#hide
# Replays the forecast steps by hand and checks that the stored predictions
# equal `scale_factor` times the raw model output at every step.
# NOTE(review): relies on private `TimeSeries` helpers whose behavior is not visible here.
fcst.ts._predict_setup()

# `preds` comes from `predict(2, ...)`, so it should hold two rows per series:
# `head(1)` picks the first step and `tail(1)` the second one.
for attr in ('head', 'tail'):
    new_x = fcst.ts._update_features().drop(columns='ds')
    original_preds = fcst.model.predict(new_x)

    expected = scale_factor * original_preds
    actual = getattr(preds.groupby('unique_id')['y_pred'], attr)(1).values
    np.testing.assert_equal(expected, actual)

    # Feed the scaled predictions back before building the next step's features.
    fcst.ts._update_y(actual)

### Backtesting

If we would like to know how good our forecast will be for a specific model and set of features then we can perform backtesting. What backtesting does is take our data and split it into two parts, where the first part is used for training and the second one for validation. Since the data is time dependent we usually take the last *x* observations from our data as the validation set.

This process is implemented in `Forecast.backtest`, which takes our data and performs the process described above `n_windows` times, where each window is of size `window_size`. For example, if we have 100 samples and we want to perform 2 backtests each of size 14, the splits will be as follows:

1. Train: 1 to 72. Validation: 73 to 86.
2. Train: 1 to 86. Validation: 87 to 100.

In [None]:
show_doc(Forecast.backtest)

In [None]:
n_windows = 2
window_size = 14

fcst = Forecast(model, ts)
backtest_results = fcst.backtest(series, n_windows, window_size)

`Forecast.backtest` returns a generator that yields the results of each window one at a time.

In [None]:
window1_result = next(backtest_results)
window1_result

In [None]:
window2_result = next(backtest_results)
results = pd.concat([window1_result, window2_result])

We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
agg_results = results.groupby('ds').sum()
agg_results.plot();

We can include some more context by using the values in the training set.

In [None]:
# Keep the last 50 datestamps of actuals that precede the first backtest window.
history = series[series.ds < agg_results.index.min()]
agg_history = history.groupby('ds')[['y']].sum().tail(50)
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` stacks the two frames in the same way.
pd.concat([agg_history, agg_results]).plot();

Note that since the backtest results are returned as a generator we can also compute a single statistic on them and not keep the whole results in memory.

In [None]:
backtest_results = fcst.backtest(series, n_windows, window_size)

losses = [mean_squared_error(res.y, res.y_pred) for res in backtest_results]
np.round(losses, 2)

In [None]:
#hide
# Checks that `Forecast.backtest` yields the same results as fitting and
# predicting every split by hand.
model = lgb.LGBMRegressor()
ts = TimeSeries(lags=[7, 14])
fcst = Forecast(model, ts)
# `backtest` is a generator, so `list` is what actually runs every window.
backtest_results = list(fcst.backtest(series, n_windows, window_size))
manual_results = []
for train, valid in backtest_splits(series, n_windows, window_size):
    # Fresh model and TimeSeries for every window, so nothing is shared between windows.
    model = lgb.LGBMRegressor()
    ts = TimeSeries(lags=[7, 14])
    fcst = Forecast(model, ts).fit(train)
    pred = fcst.predict(window_size)
    res = valid[['ds', 'y']].copy()
    manual_results.append(res.merge(pred, on=['unique_id', 'ds'], how='left'))
backtest_results = pd.concat(backtest_results)
manual_results = pd.concat(manual_results)
pd.testing.assert_frame_equal(backtest_results, manual_results)