In [None]:
#default_exp forecast

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Forecast

> Full pipeline encapsulation

In [None]:
#export
from typing import Callable, Generator, List, Optional

import pandas as pd

from mlforecast.core import TimeSeries, simple_predict
from mlforecast.utils import backtest_splits


In [None]:
#hide
import warnings
warnings.simplefilter('ignore', UserWarning)

from nbdev import show_doc

In [None]:
#export
class Forecast:
    """High-level wrapper around the whole forecasting pipeline.

    Couples a model (scikit-learn compatible regressor) with a `TimeSeries`
    and delegates the preprocessing, training and prediction steps to them."""

    def __init__(self, model, ts: TimeSeries):
        self.ts = ts
        self.model = model

    def __repr__(self):
        return f'Forecast(model={self.model}, ts={self.ts})'

    def preprocess(
        self,
        data: pd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
    ) -> pd.DataFrame:
        """Run `data` through `ts.fit_transform` and return the result.

        `static_features`, `dropna` and `keep_last_n` are forwarded unchanged."""
        transformed = self.ts.fit_transform(data, static_features, dropna, keep_last_n)
        return transformed

    def fit(
        self,
        data: pd.DataFrame,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        **fit_kwargs,
    ) -> 'Forecast':
        """Preprocess `data` and train `model` on the result.

        The preprocessed frame minus its `ds` and `y` columns is used as the feature
        matrix and the values of `y` as the target. `fit_kwargs` are passed to `model.fit`.
        The feature column names are stored in `train_features_`. Returns the instance."""
        prep = self.preprocess(
            data,
            static_features=static_features,
            dropna=dropna,
            keep_last_n=keep_last_n,
        )
        target = prep['y'].values
        features = prep.drop(columns=['ds', 'y'])
        # Release our reference to the full preprocessed frame before training.
        del prep
        self.train_features_ = features.columns
        self.model.fit(features, target, **fit_kwargs)
        return self

    def predict(
        self, horizon: int, predict_fn: Callable = simple_predict, **predict_fn_kwargs
    ) -> pd.DataFrame:
        """Forecast the next `horizon` steps by delegating to `ts.predict`.

        On every timestep `predict_fn(model, new_x, features_order, **predict_fn_kwargs)` is called, where:
        `model` is the trained model.
        `new_x` is a dataframe with the same format as the input plus the computed features.
        `features_order` is the list of column names that were used in the training step.
        """
        forecasts = self.ts.predict(self.model, horizon, predict_fn, **predict_fn_kwargs)
        return forecasts

    def backtest(
        self,
        data: pd.DataFrame,
        n_windows: int,
        window_size: int,
        static_features: Optional[List[str]] = None,
        dropna: bool = True,
        keep_last_n: Optional[int] = None,
        predict_fn: Callable = simple_predict,
        **predict_fn_kwargs,
    ) -> Generator[pd.DataFrame, None, None]:
        """Evaluate the pipeline on `n_windows` splits of `window_size` taken from `data`.

        For each split the model is refit on the training part, the next `window_size`
        steps are predicted and the predictions are left-joined onto the `ds` and `y`
        columns of the validation part using `unique_id` and `ds` as keys.

        This is a generator: it yields one merged dataframe per window, so nothing is
        computed until the results are iterated."""
        splits = backtest_splits(data, n_windows, window_size)
        for train, valid in splits:
            self.fit(
                train,
                static_features=static_features,
                dropna=dropna,
                keep_last_n=keep_last_n,
            )
            preds = self.predict(window_size, predict_fn, **predict_fn_kwargs)
            actuals = valid[['ds', 'y']]
            yield actuals.merge(preds, on=['unique_id', 'ds'], how='left')


The `Forecast` class is a high level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing the predictions). It tries to mimic the scikit-learn API.

## Example
This shows an example with simulated data. For a real-world example you can check the [M5 example](https://www.kaggle.com/lemuz90/m5-mlforecast).

In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean, rolling_std

from mlforecast.utils import generate_daily_series

In order to forecast some time series you need a dataframe with `unique_id` as the index (which contains the identifier for each time series), a `ds` column with the datestamps and a `y` column with the series values.

### Data setup

In [None]:
series = generate_daily_series(100, equal_ends=True, n_static_features=2)
series

Whatever extra columns you have, like `static_0` and `static_1` here, are considered to be static and are replicated when constructing the features for the next datestamp. You can disable this by passing `static_features` to `Forecast.preprocess` or `Forecast.fit`, which will only keep the columns you define there as static. Keep in mind that the remaining (non-static) columns will still be used for training, so you'll have to define a class that inherits from `TimeSeries` and overrides the `predict` method. This is shown in the [M5 example](https://www.kaggle.com/lemuz90/m5-mlforecast).

### Model

The model can be any scikit-learn compatible regressor.

In [None]:
model = lgb.LGBMRegressor()

### TimeSeries
The other component needed in `Forecast` is a `TimeSeries` object, which defines the features to be computed. 

In [None]:
ts = TimeSeries(
    freq='D',
    lags=[7],
    lag_transforms={
        1: [expanding_mean],
        7: [(rolling_mean, 14)]
    },
    date_features=['dayofweek', 'month'],
    num_threads=2,    
)
ts

Here we say that:
* Our series have daily frequency.
* We want to use lag 7 as a feature
* We want the lag transformations to be:
   * expanding mean of the lag 1
   * rolling mean of the lag 7 over a window of size 14
* We want to use dayofweek and month as date features.
* We want to perform the preprocessing and the forecasting steps using 2 threads.

### Training

Once we have this setup we just instantiate a `Forecast` object with the model and the time series.

In [None]:
fcst = Forecast(model, ts)

From this point we have two options:

1. Preprocess the data and fit our model using all of it.
2. Preprocess the data and get it back as a dataframe to do some custom splitting or add additional features, then train the model.

#### 1. Using all the data

In [None]:
show_doc(Forecast.fit)

Calling `.fit` on our data performs the preprocessing and uses all the data to train our model.

In [None]:
fcst.fit(series)

In [None]:
fcst.model.fitted_

#### 2. Preprocess and train

If we only want to perform the preprocessing step we call `.preprocess` on our data.

In [None]:
show_doc(Forecast.preprocess)

In [None]:
features_df = fcst.preprocess(series)
features_df.head()

This is useful if we want to inspect the data the model will be trained on, add additional features or perform some custom train-valid split. Here we perform an 80-20 split.

In [None]:
# Fix the seed so the random row assignment below is reproducible.
np.random.seed(0)

# Each row goes to the training set when its uniform draw falls below 0.8.
train_mask = np.random.rand(len(features_df)) < 0.8
train = features_df.loc[train_mask]
valid = features_df.loc[~train_mask]

# `ds` and `y` are excluded from the features; `y` is the target.
non_feature_cols = ['ds', 'y']
X_train = train.drop(columns=non_feature_cols)
y_train = train['y']
X_valid = valid.drop(columns=non_feature_cols)
y_valid = valid['y']

If we do this we must "manually" train our model calling `Forecast.model.fit`.

In [None]:
# Train the model directly (bypassing `Forecast.fit`), evaluating RMSE on both the
# training and the validation rows so the curves can be read from `evals_result_`.
# NOTE(review): newer LightGBM releases reportedly no longer accept `verbose` in `fit`
# (log suppression moved to callbacks) — confirm against the installed version.
fcst.model.fit(X_train, y_train, 
               eval_set=[(X_train, y_train), (X_valid, y_valid)],
               eval_metric='rmse',
               verbose=0)

In [None]:
# One curve per evaluation set recorded during training.
for lab, recorded_metrics in fcst.model.evals_result_.items():
    plt.plot(recorded_metrics['rmse'], label=lab)
plt.legend();

### Forecasting

In [None]:
show_doc(Forecast.predict)

Once we have this fitted model, we can compute the forecasts for the next 7 timesteps.

In [None]:
fcst.predict(7)

This uses each prediction as the next value of the target and updates all features accordingly. The static features were propagated and the date features were computed using each new datestamp.

#### Custom predictions
As you may have noticed `Forecast.predict` can take a `predict_fn` and `predict_fn_kwargs`. By default the predict method repeats the static features and updates the transformations and the date features. If you have dynamic features like prices or a calendar with holidays you can write a function that takes the trained model, the updated features and the features order and do custom stuff there, then pass it to `Forecast.predict`.

Here's an example:

Suppose that we have a `product_id` column and we have a catalog for prices based on that `product_id` and the date.

In [None]:
#hide
# Pretend `static_1` identifies a product so prices can be looked up per product and date.
dynamic_series = series.rename(columns={'static_1': 'product_id'})
# `pd.offsets` is the documented public location of the offset classes.
day_offset = pd.offsets.Day()
# Aggregation names are passed as strings: handing the `min`/`max` builtins to `agg`
# triggers a FutureWarning in recent pandas versions.
starts_ends = dynamic_series.groupby('product_id')['ds'].agg(['min', 'max'])
dfs = []
for idx, (start, end) in starts_ends.iterrows():
    # Cover the observed range of the product plus 7 extra days for the forecast horizon.
    price_dates = pd.date_range(start, end + 7 * day_offset, name='ds')
    product_df = pd.DataFrame(
        {
            'product_id': idx,
            # One random price per date; sized from the index so both always agree.
            'price': np.random.rand(len(price_dates)),
        },
        index=price_dates,
    )
    dfs.append(product_df)
prices_catalog = pd.concat(dfs)
# Attach the prices to the series and restore `unique_id` as the index.
series_with_prices = dynamic_series.reset_index().merge(prices_catalog, on=['ds', 'product_id'], how='left')
series_with_prices = series_with_prices.set_index('unique_id')

In [None]:
prices_catalog

And you have already merged these prices into your series dataframe.

In [None]:
series_with_prices

This dataframe will be passed to `Forecast.fit` (or `Forecast.preprocess`), however since the price is dynamic we have to tell that method that only `static_0` and `product_id` are static and we'll have to update `price` in every timestep, which basically involves merging the updated features with the prices catalog.

In [None]:
fcst = Forecast(model, ts)
fcst.fit(series_with_prices, static_features=['static_0', 'product_id'])
fcst.ts.features_order_

The features used for training are stored in `Forecast.ts.features_order_`, as you can see `price` was used for training.

In order to compute the predictions we'll need to update `price` in every timestep. `Forecast.predict` calls `predict_fn(model, new_x, features_order, **kwargs)` on every timestep, so we can pass a custom function there to do our join. `new_x` will have the same format as our input data plus the computed features, except it won't have `y` and the dynamic columns (`price` in this case).

So what we have to do is take `new_x`, join with the prices catalog, sort by `unique_id` (in case the join modifies the row order) and then take only the columns that were used for training (this drops the `ds` and `unique_id` columns and arranges them in the correct order before passing the dataframe to the model). This can be achieved with the following function:

In [None]:
def my_predict_fn(model, new_x, features_order):
    """Predict one timestep after attaching the prices from `prices_catalog`.

    `new_x` gets its `unique_id` index level moved to a column, is joined with the
    catalog on `ds` and `product_id`, sorted by `unique_id` and finally restricted
    to `features_order` before being handed to `model.predict`."""
    with_prices = (
        new_x.reset_index('unique_id')  # needed as a column to sort after the join
        .merge(prices_catalog, on=['ds', 'product_id'])
        .sort_values('unique_id')
    )
    model_input = with_prices[features_order]  # features used for training, in order
    return model.predict(model_input)

In [None]:
#hide
class PredictPrice:
    """Dummy model used to check the inputs that reach `predict`.

    It learns nothing: `fit` only records the training column names and
    `predict` returns the `price` column of whatever it receives."""

    def fit(self, X, y=None):
        """Store the column names of `X`; `y` is ignored.

        Returns `self`, as scikit-learn compatible estimators do, so calls can be chained."""
        self.features_ = X.columns
        return self

    def predict(self, X):
        """Return `X['price']` after checking the columns match the training ones.

        Raises `AssertionError` if the columns of `X` differ (in names or order)
        from the ones seen in `fit`."""
        assert self.features_.equals(X.columns)
        return X['price']

# Run the pipeline with the dummy model so the "prediction" is the price that
# `my_predict_fn` joined in for the first forecasted date.
dummy_model = PredictPrice()
dummy_fcst = Forecast(dummy_model, ts)
dummy_fcst.fit(series_with_prices, static_features=['static_0', 'product_id'])
dummy_preds = dummy_fcst.predict(1, my_predict_fn)

# Build the expected result by hand: one row per series, dated one period after the
# last training date, with the price looked up in the catalog.
expected_prices = series_with_prices.reset_index()[['unique_id', 'product_id']].drop_duplicates()
# Use the frequency of the forecaster under test rather than an unrelated global.
expected_prices['ds'] = series_with_prices['ds'].max() + dummy_fcst.ts.freq
# No `reset_index()` is needed before merging; it only added a stray `index` column.
expected_prices = expected_prices.merge(prices_catalog, on=['product_id', 'ds'], how='left')
expected_prices = expected_prices.set_index('unique_id')[['ds', 'price']]

assert dummy_preds.rename(columns={'y_pred': 'price'}).equals(expected_prices)

And now we just pass this function to `Forecast.predict`.

In [None]:
preds = fcst.predict(7, my_predict_fn)
preds

### Backtesting

If we would like to know how good our forecast will be for a specific model and set of features then we can perform backtesting. What backtesting does is take our data and split it into two parts, where the first part is used for training and the second one for validation. Since the data is time dependent we usually take the last *x* observations from our data as the validation set.

This process is implemented in `Forecast.backtest`, which takes our data and performs the process described above for `n_windows` times where each window is of size `window_size`. For example, if we have 100 samples and we want to perform 2 backtests each of size 14, the splits will be as follows:

1. Train: 1 to 72. Validation: 73 to 86.
2. Train: 1 to 86. Validation: 87 to 100.

In [None]:
show_doc(Forecast.backtest)

In [None]:
n_windows = 2
window_size = 14

fcst = Forecast(model, ts)
backtest_results = fcst.backtest(series, n_windows, window_size)

`Forecast.backtest` returns a generator that yields the results of each window one at a time.

In [None]:
window1_result = next(backtest_results)
window1_result

In [None]:
window2_result = next(backtest_results)
results = pd.concat([window1_result, window2_result])

We can aggregate these by date to get a rough estimate of how our model is doing.

In [None]:
agg_results = results.groupby('ds').sum()
agg_results.plot();

We can include some more context by using the values in the training set.

In [None]:
# Keep only the observations that come before the first backtest date.
history = series[series.ds < agg_results.index.min()]
# Aggregate across series and keep the last 50 dates as context.
agg_history = history.groupby('ds')[['y']].sum().tail(50)
# `DataFrame.append` is deprecated and removed in pandas 2.0; `pd.concat` is the
# equivalent that works across versions.
pd.concat([agg_history, agg_results]).plot();

Note that since the backtest results are returned as a generator we can also compute a single statistic on them and not keep the whole results in memory.

In [None]:
backtest_results = fcst.backtest(series, n_windows, window_size)

losses = [mean_squared_error(res.y, res.y_pred) for res in backtest_results]
np.round(losses, 2)