In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp feature_engineering

# Feature engineering
> Generate features for downstream models

In [None]:
#| export
from typing import Tuple

from utilsforecast.compat import DataFrame, pl, pl_DataFrame
from utilsforecast.processing import (
    drop_index_if_pandas,
    horizontal_concat,
    maybe_compute_sort_indices,
    take_rows,
    vertical_concat,
)

from statsforecast import StatsForecast
from statsforecast.models import MSTL, _predict_mstl_components

In [None]:
#| export
def mstl_decomposition(
    df: DataFrame,
    model: MSTL,
    freq: str,
    h: int,
) -> Tuple[DataFrame, DataFrame]:
    """Extract trend and seasonal components of every series with an MSTL model.

    Parameters
    ----------
    df : pandas or polars DataFrame
        DataFrame with columns [`unique_id`, `ds`, `y`].
    model : statsforecast MSTL
        MSTL instance that is fitted to perform the decomposition.
    freq : str
        Frequency of the data (pandas alias). Forwarded to `StatsForecast`.
    h : int
        Forecast horizon, i.e. number of future rows produced per series.

    Returns
    -------
    train_df : pandas or polars DataFrame
        Input dataframe with the 'trend' column and every column whose name
        starts with 'seasonal' appended.
    X_df : pandas or polars DataFrame
        Future dataframe holding the same component columns for the next `h`
        periods; meant to be provided to the predict method through `X_df`.

    Raises
    ------
    ValueError
        If `model` is not an `MSTL` instance.
    """
    if not isinstance(model, MSTL):
        raise ValueError(f'`model` must be an MSTL instance, got {type(model)}')

    # Reorder the rows only when the helper returns indices (None means no reordering).
    order = maybe_compute_sort_indices(df, 'unique_id', 'ds')
    if order is not None:
        df = take_rows(df, order)
    df = drop_index_if_pandas(df)

    forecaster = StatsForecast(models=[model], freq=freq)
    forecaster.fit(df=df)
    future_frame = forecaster._make_future_df(h=h)

    # Future features are built with the same dataframe class as the input.
    frame_cls = type(df)
    # The seasonal column names are read from the first fitted model and reused for all series.
    first_decomposition = forecaster.fitted_[0, 0].model_
    seasonal_names = [
        name for name in first_decomposition.columns if name.startswith('seasonal')
    ]
    component_names = ['trend'] + seasonal_names

    history_parts = []
    horizon_parts = []
    for fitted in forecaster.fitted_[:, 0]:
        history_parts.append(fitted.model_[component_names])
        seasonal_forecast = _predict_mstl_components(
            fitted.model_, h, model.season_length
        )
        components = {'trend': fitted.trend_forecaster.predict(h)['mean']}
        # Transposing lets each row of the result pair up with one seasonal column name.
        components.update(zip(seasonal_names, seasonal_forecast.T))
        horizon_parts.append(frame_cls(components))

    history_features = vertical_concat(history_parts, match_categories=False)
    if isinstance(df, pl_DataFrame):
        # The fitted decompositions are converted from pandas when the input is polars.
        history_features = pl.from_pandas(history_features)
    train_df = horizontal_concat([df, history_features])

    horizon_features = vertical_concat(horizon_parts, match_categories=False)
    X_df = horizontal_concat([future_frame, horizon_features])
    return train_df, X_df

In [None]:
import pandas as pd
from fastcore.test import test_fail
from utilsforecast.losses import smape

from statsforecast.models import Naive
from statsforecast.utils import generate_series

In [None]:
# Synthetic daily panel with 10 series; the ids are cast to int64.
series = generate_series(10, freq='D')
series = series.assign(unique_id=series['unique_id'].astype('int64'))

In [None]:
# Passing a model that is not an MSTL instance should raise the validation error.
def _decompose_with_naive():
    return mstl_decomposition(series, Naive(), 'D', 14)

test_fail(_decompose_with_naive, contains='must be an MSTL instance')

In [None]:
horizon = 14
model = MSTL(season_length=7)
# Shuffle the rows so the function receives unsorted input. The shuffle is
# seeded so a fresh kernel run always produces the same frame.
series = series.sample(frac=1.0, random_state=0)
train_df, X_df = mstl_decomposition(series, model, 'D', horizon)

In [None]:
# Same fixture built with the polars engine, with the ids cast to Int64.
series_pl = generate_series(10, freq='D', engine='polars').with_columns(
    pl.col('unique_id').cast(pl.Int64)
)
train_df_pl, X_df_pl = mstl_decomposition(series_pl, model, '1d', horizon)

In [None]:
# The future frame must start one day after the last training timestamp of each series.
last_train_ds = train_df.groupby('unique_id')['ds'].max()
first_future_ds = X_df.groupby('unique_id')['ds'].min()
pd.testing.assert_series_equal(last_train_ds + pd.offsets.Day(), first_future_ds)

# One block of `horizon` rows per series.
n_ids = train_df['unique_id'].nunique()
assert len(X_df) == n_ids * horizon

# The pandas and polars code paths must produce the same frames.
pd.testing.assert_frame_equal(train_df, train_df_pl.to_pandas())
pd.testing.assert_frame_equal(X_df, X_df_pl.to_pandas())

# trend + seasonal should approximate the target (mean SMAPE below 0.1).
with_estimate = train_df_pl.with_columns(
    estimate=pl.col('trend') + pl.col('seasonal')
)
estimate_smape = smape(with_estimate, models=['estimate'])['estimate'].mean()
assert estimate_smape < 0.1

In [None]:
# With several season lengths there is one seasonal column per length in both outputs.
model = MSTL(season_length=[7, 28])
train_df, X_df = mstl_decomposition(series, model, 'D', horizon)
shared_cols = train_df.columns.intersection(X_df.columns).tolist()
expected_cols = ['unique_id', 'ds', 'trend', 'seasonal7', 'seasonal28']
assert shared_cols == expected_cols