In [None]:
#default_exp forecast

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Forecast

> Full pipeline encapsulation

In [None]:
#export
from typing import Callable, Dict

import pandas as pd

from mlforecast.core import predictions_flow, preprocessing_flow

In [None]:
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
from window_ops.rolling import *
from window_ops.expanding import *
from window_ops.ewm import ewm_mean

from mlforecast.utils import generate_daily_series

In [None]:
#export
class Forecast:
    """Full pipeline encapsulation.

    Takes a model (scikit-learn compatible regressor) and a flow configuration.

    The flow configuration is a dict of keyword arguments that is forwarded
    as-is to the preprocessing function (`preprocessing_flow` by default).
    """

    def __init__(self, model, flow_config: Dict):
        self.model = model
        self.flow_config = flow_config

    def preprocess(self, data: pd.DataFrame, prep_fn: Callable = preprocessing_flow) -> pd.DataFrame:
        """Apply the transformations defined in the flow configuration.

        `prep_fn` is called as `prep_fn(data, **self.flow_config)` and must return
        a pair. The first element is stored in `self.ts` (it is what `predict`
        later hands to the prediction function); the second one is returned.
        """
        self.ts, series_df = prep_fn(data, **self.flow_config)
        return series_df

    def fit(self, data: pd.DataFrame, prep_fn: Callable = preprocessing_flow, **kwargs) -> 'Forecast':
        """Perform the preprocessing and fit the model.

        Every column of the preprocessed frame except `ds` and `y` is used as a
        feature and the values of `y` are the target. Extra keyword arguments
        are forwarded to `self.model.fit`. Returns `self`.
        """
        series_df = self.preprocess(data, prep_fn)
        X, y = series_df.drop(columns=['ds', 'y']), series_df.y.values
        # Drop the local reference to the full frame before training.
        del series_df
        self.model.fit(X, y, **kwargs)
        return self

    def predict(self, horizon: int, predict_fn: Callable = predictions_flow) -> pd.DataFrame:
        """Compute the predictions for the next `horizon` steps.

        `predict_fn` is called as `predict_fn(self.ts, self.model, horizon)`.

        Raises `AttributeError` with an explanatory message when called before
        `fit` or `preprocess`, which are the methods that set `self.ts`.
        """
        if not hasattr(self, 'ts'):
            # Same exception type as the implicit failure, but with an actionable message.
            raise AttributeError(
                'This Forecast instance has no `ts` attribute yet. '
                'Call `fit` (or `preprocess`) before calling `predict`.'
            )
        return predict_fn(self.ts, self.model, horizon)

    def __repr__(self):
        return f'Forecast(model={self.model}, flow_config={self.flow_config})'

The `Forecast` class is a high-level abstraction that encapsulates all the steps in the pipeline (preprocessing, fitting the model and computing the predictions). It tries to mimic the scikit-learn API.

In order to perform forecasting for some time series you need a dataframe with `unique_id` as the index (which contains the identifier for each time series), a `ds` column with the datestamps and a `y` column with the series values.

In [None]:
# Synthetic example data. 100 is the first positional argument (presumably the number
# of series -- confirm against `generate_daily_series`); `n_static_features=2` asks for
# two extra static feature columns.
series = generate_daily_series(100, n_static_features=2)
series

Whatever extra columns you have, like `static_0` and `static_1` here, are considered to be static and are replicated when constructing the features for the next datestamp. You can disable this by passing `static_features` to the flow configuration, which will only keep the columns you define there as static. Keep in mind that they will still be used for training, so you'll have to write your own function to populate the non-static columns and pass it to the `Forecast.predict` method. This is shown in the M5 example.

The next step is defining the flow configuration, where we say that:
1. Our series have daily frequency.
2. We want to use lag 7 and lag 14 as features.
3. We want to use the expanding mean of the lag 1 as well as the rolling mean and the rolling standard deviation of the lag 7 (over a window of size 7) as features.
4. We want to use dayofweek, month and year as date features.
5. We want to perform the preprocessing and the forecasting using 2 threads.

In [None]:
flow_config = {
    # Daily frequency.
    'freq': 'D',
    # Plain lags used as features.
    'lags': [7, 14],
    # Transformations keyed by the lag they are applied to; a tuple is
    # (function, window size).
    'lag_transforms': {
        1: [expanding_mean],
        7: [(rolling_mean, 7), (rolling_std, 7)],
    },
    'date_features': ['dayofweek', 'month', 'year'],
    # Threads used for the preprocessing and the forecasting.
    'num_threads': 2,
}

Once we have this configuration we just instantiate a `Forecast` object with the model we want to use and this configuration.

In [None]:
# Pair the regressor with the flow configuration.
fcst = Forecast(model=lgb.LGBMRegressor(), flow_config=flow_config)

And we fit it to our series.

In [None]:
# Preprocess the series and train the model on the resulting features.
fcst.fit(data=series)

Once we have this fitted model, we can compute the forecasts for the next 7 timesteps.

In [None]:
# Forecast the next 7 steps.
fcst.predict(horizon=7)

This uses each prediction as the next value of the target and updates all features accordingly. The static features were propagated and the date features were computed using each new datestamp.

## Validation

If we want to validate how our model performs using this rolling predictions scheme, we can first split our data by removing the last observations from each series and keeping them as a validation set, then compute the forecasts and compare them against these held-out values.

In [None]:
def get_last_n_mask(serie, n):
    """Return a boolean array shaped like `serie` that is True only for its last `n` entries.

    An `n` larger than the length of `serie` marks every entry and `n == 0`
    marks none. A negative `n` raises `ValueError`.
    """
    if n < 0:
        raise ValueError(f'n must be non-negative, got {n}')
    mask = np.full_like(serie, False, dtype=bool)
    # `mask[-0:]` is `mask[0:]`, i.e. the whole array, so n == 0 must skip the assignment.
    if n > 0:
        mask[-n:] = True
    return mask

# Number of trailing observations of each series held out for validation.
test_size = 14
# Boolean Series aligned with `series`, True on the last `test_size` rows of every `unique_id`.
valid_mask = (
    series
    .groupby('unique_id')['y']
    .transform(get_last_n_mask, test_size)
)
train = series.loc[~valid_mask]
# Held-out target values, indexed by (unique_id, ds).
y_valid = (
    series
    .loc[valid_mask]
    .set_index('ds', append=True)
    [['y']]
)

In [None]:
def eval_preds(train, model, cats2int=False):
    """Fit `model` on `train`, forecast `test_size` steps and compare against `y_valid`.

    Prints the MSE (squared errors averaged within each series, then across
    series) and plots the per-datestamp sums of the actual and predicted values.
    Relies on the notebook-level `flow_config`, `test_size` and `y_valid`.

    When `cats2int` is true the categorical columns are replaced by their
    integer codes on a copy of `train`, so the caller's frame is not modified.
    """
    if cats2int:
        train = train.copy()
        categorical_cols = train.select_dtypes(include='category').columns
        for name in categorical_cols:
            train[name] = train[name].cat.codes

    forecaster = Forecast(model, flow_config).fit(train)
    preds = forecaster.predict(test_size)

    # Align the predictions with the held-out values on (unique_id, ds).
    evals = y_valid.join(preds.set_index('ds', append=True))
    evals = evals.assign(sq_err=(evals['y'] - evals['y_pred']) ** 2)
    per_serie_mse = evals.groupby('unique_id')['sq_err'].mean()
    mse = per_serie_mse.mean()
    print(f'MSE: {mse:.1f}')

    actual_total = y_valid.groupby('ds').sum()
    predicted_total = preds.groupby('ds')['y_pred'].sum()
    actual_total.join(predicted_total).plot(marker='.', figsize=(16, 6))

In [None]:
# Evaluate a LightGBM regressor with its default parameters.
eval_preds(train, model=lgb.LGBMRegressor())

In [None]:
# Evaluate an XGBoost regressor; `cats2int=True` makes `eval_preds` replace the
# categorical columns by their integer codes before fitting.
eval_preds(train, model=xgb.XGBRegressor(), cats2int=True)