In [None]:
#default_exp forecast

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Forecast

> Full pipeline encapsulation

In [None]:
#export
from typing import Callable, Dict, Generator

import pandas as pd

from mlforecast.core import predictions_flow, preprocessing_flow
from mlforecast.utils import backtest_splits


In [None]:
#hide
import warnings
warnings.simplefilter('ignore', UserWarning)

from nbdev import show_doc

In [None]:
#export
class Forecast:
    """Full pipeline encapsulation.

    Takes a model (scikit-learn compatible regressor) and a flow configuration
    and runs all the forecasting pipeline.

    The object returned by the preprocessing function is stored in the `ts` attribute
    (set by `preprocess`, and therefore by `fit`) and is reused by `predict`."""

    def __init__(self, model, flow_config: Dict):
        self.model = model
        self.flow_config = flow_config

    def __repr__(self):
        # Use the runtime class name so subclasses are represented correctly.
        return f'{type(self).__name__}(model={self.model}, flow_config={self.flow_config})'

    def preprocess(
        self, data: pd.DataFrame, prep_fn: Callable = preprocessing_flow
    ) -> pd.DataFrame:
        """Calls `prep_fn(data, **self.flow_config)`, saves the resulting `TimeSeries` object
        for the forecasting step and returns the `data` with the features added."""
        self.ts, series_df = prep_fn(data, **self.flow_config)
        return series_df

    def fit(
        self, data: pd.DataFrame, prep_fn: Callable = preprocessing_flow, **fit_kwargs
    ) -> 'Forecast':
        """Preprocesses `data` and fits `model` to it.

        The features are every column returned by `preprocess` except `ds` and `y`;
        the target is the `y` column. `fit_kwargs` are forwarded to `model.fit`.
        Returns `self` to allow chaining."""
        series_df = self.preprocess(data, prep_fn)
        X, y = series_df.drop(columns=['ds', 'y']), series_df.y.values
        # Drop our reference to the full frame before training to keep peak memory down.
        del series_df
        self.model.fit(X, y, **fit_kwargs)
        return self

    def predict(
        self, horizon: int, predict_fn: Callable = predictions_flow, **predict_fn_kwargs
    ) -> pd.DataFrame:
        """Compute the predictions for the next `horizon` steps.

        Calls `predict_fn(self.ts, self.model, horizon, **predict_fn_kwargs)`.
        Raises `AttributeError` if neither `preprocess` nor `fit` has been called yet."""
        if not hasattr(self, 'ts'):
            # Same exception type as the bare attribute lookup, with an actionable message.
            raise AttributeError(
                'No preprocessed series found. '
                'Call `preprocess` or `fit` before calling `predict`.'
            )
        return predict_fn(self.ts, self.model, horizon, **predict_fn_kwargs)

    def backtest(
        self,
        data: pd.DataFrame,
        n_windows: int,
        window_size: int,
        predict_fn: Callable = predictions_flow,
        **predict_fn_kwargs,
    ) -> Generator[pd.DataFrame, None, None]:
        """Creates `n_windows` splits of `window_size` from `data`, trains the model
        on the training set, predicts the window and merges the actuals and the predictions
        in a dataframe.

        Returns a generator to the dataframes containing the datestamps, actual values
        and predictions.

        Note that each window calls `self.fit`, so `self.model` and `self.ts` are
        overwritten with the state of the most recently processed window."""
        for train, valid in backtest_splits(data, n_windows, window_size):
            self.fit(train)
            y_pred = self.predict(window_size, predict_fn, **predict_fn_kwargs)
            y_valid = valid[['ds', 'y']]
            # Left join keeps every actual, even when no prediction was produced for it.
            yield y_valid.merge(y_pred, on=['unique_id', 'ds'], how='left')


The `Forecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing the predictions). It tries to mimic the scikit-learn API.

## Example
This shows an example with simulated data. For a real-world example you can check the [M5 example](https://www.kaggle.com/lemuz90/m5-mlforecast).

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean, rolling_std

from mlforecast.utils import generate_daily_series

In order to forecast some time series you need a dataframe with `unique_id` as the index (which contains the identifier for each time series), a `ds` column with the datestamps and a `y` column with the series values.

### Data setup

In [None]:
series = generate_daily_series(100, equal_ends=True, n_static_features=2)
series

Whatever extra columns you have, like `static_0` and `static_1` here, are considered to be static and are replicated when constructing the features for the next datestamp. You can disable this by passing `static_features` to the flow configuration, which will only keep the columns you define there as static. Keep in mind that the remaining columns will still be used for training, so you'll have to write your own function to populate the non-static columns and pass it to the `Forecast.predict` method. This is shown in the [M5 example](https://www.kaggle.com/lemuz90/m5-mlforecast).

### Model

The model can be any scikit-learn compatible regressor.

In [None]:
model = lgb.LGBMRegressor()

### Flow configuration

The next step is defining the flow configuration as a dictionary that will be unpacked in the call to our preprocessing function, which is `preprocessing_flow` by default.

In [None]:
preprocessing_flow?

Here we say that:
1. Our series have daily frequency.
2. We want to use lag 7 and lag 14 as features.
3. We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 7
   * rolling standard deviation of the lag 7 over a window of size 7
4. We want to use dayofweek, month and year as date features.
5. We want to perform the preprocessing and the forecasting steps using 2 threads.

In [None]:
# Keyword arguments that `Forecast` unpacks into the preprocessing function.
flow_config = dict(
    freq='D',  # daily series
    lags=[7, 14],  # raw lags used as features
    lag_transforms={
        # lag -> transformations applied to that lag; a tuple is (function, *args)
        1: [expanding_mean],
        7: [(rolling_mean, 7), (rolling_std, 7)],
    },
    date_features=['dayofweek', 'month', 'year'],
    num_threads=2,
)

### Training

Once we have this configuration we just instantiate a `Forecast` object with the model we want to use and this configuration.

In [None]:
fcst = Forecast(model, flow_config)

From this point we have two options:

1. Preprocess the data and fit our model using all of it.
2. Preprocess the data and get it back as a dataframe to do some custom splitting or add additional features, and then train the model.

#### 1. Using all the data

In [None]:
show_doc(Forecast.fit)

Calling `.fit` on our data performs the preprocessing and uses all the data to train our model.

In [None]:
fcst.fit(series)

In [None]:
fcst.model.fitted_

#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `.preprocess` on our data.

In [None]:
show_doc(Forecast.preprocess)

In [None]:
features_df = fcst.preprocess(series)
features_df.head()

This is useful if we want to inspect the data the model will be trained on, add additional features or perform a custom train-valid split. Here we perform an 80-20 split.

In [None]:
# Fix the seed so the random split is reproducible.
np.random.seed(0)

# Draw one uniform number per row; rows that fall below 0.8 go to the training set.
train_mask = np.random.rand(len(features_df)) < 0.8
train = features_df[train_mask]
valid = features_df[~train_mask]

# Separate the target from the features, leaving out the datestamp column.
X_train, y_train = train.drop(columns=['ds', 'y']), train['y']
X_valid, y_valid = valid.drop(columns=['ds', 'y']), valid['y']

If we do this we must "manually" train our model calling `Forecast.model.fit`.

In [None]:
# Train directly on the custom split, evaluating RMSE on both the training and
# validation sets so the curves can be read back from `fcst.model.evals_result_`.
# NOTE(review): newer LightGBM releases (4.0+) no longer accept `verbose` in `fit`
# and expect `callbacks=[lgb.log_evaluation(0)]` instead — confirm against the pinned version.
fcst.model.fit(X_train, y_train, 
               eval_set=[(X_train, y_train), (X_valid, y_valid)],
               eval_metric='rmse',
               verbose=0)

In [None]:
# Draw one RMSE curve per evaluation set recorded during training.
for lab, scores in fcst.model.evals_result_.items():
    plt.plot(scores['rmse'], label=lab)
plt.legend();

### Forecasting
Once we have this fitted model, we can compute the forecasts for the next 7 timesteps.

In [None]:
show_doc(Forecast.predict)

In [None]:
fcst.predict(7)

This uses each prediction as the next value of the target and updates all features accordingly. The static features were propagated and the date features were computed using each new datestamp.

### Backtesting

If we would like to know how good our forecast will be for a specific model and set of features then we can perform backtesting. What backtesting does is take our data and split it in two parts, where the first part is used for training and the second one for validation. Since the data is time dependent we usually take the last *x* observations from our data as the validation set.

This process is implemented in `Forecast.backtest`, which takes our data and performs the process described above for `n_windows` times where each window is of size `window_size`. For example, if we have 100 samples and we want to perform 2 backtests each of size 14, the splits will be as follows:

1. Train: 1 to 72. Validation: 73 to 86.
2. Train: 1 to 86. Validation: 87 to 100.

In [None]:
show_doc(Forecast.backtest)

In [None]:
# Two validation windows of 14 steps each.
n_windows = 2
window_size = 14

# `backtest` is a generator function: no model is trained here, each window is
# processed only when the generator is advanced.
fcst = Forecast(model, flow_config)
backtest_results = fcst.backtest(series, n_windows, window_size)

`Forecast.backtest` returns a generator that yields the results of each window one at a time.

In [None]:
window1_result = next(backtest_results)
window1_result

In [None]:
window2_result = next(backtest_results)
results = pd.concat([window1_result, window2_result])

We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
# Sum the values of all series for each datestamp and plot the totals.
agg_results = results.groupby('ds').sum()
agg_results.plot();

We can include some more context by using the values in the training set.

In [None]:
# Keep only the observations that come before the first backtest window.
history = series[series.ds < agg_results.index.min()]
# Aggregate the actuals by date and keep the last 50 dates as context.
agg_history = history.groupby('ds')[['y']].sum().tail(50)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the frames the same way.
pd.concat([agg_history, agg_results]).plot();

Note that since the backtest results are returned as a generator we can also compute a single statistic on them and not keep the whole results in memory.

In [None]:
# A generator can only be consumed once, so create a fresh one.
backtest_results = fcst.backtest(series, n_windows, window_size)

# Reduce each window to its MSE as it is produced; only the scalars are kept.
losses = [mean_squared_error(res.y, res.y_pred) for res in backtest_results]
np.round(losses, 2)