In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp feature_engineering

# Feature engineering
> Create exogenous regressors for your models

In [None]:
#| export
from functools import partial
from typing import Callable, List, Tuple

import numpy as np

import utilsforecast.processing as ufp
from utilsforecast.compat import DataFrame
from utilsforecast.validation import validate_format, validate_freq

In [None]:
#| exporti
_Features = Tuple[List[str], np.ndarray, np.ndarray]

def _add_features(
    df: DataFrame,
    freq: str,
    h: int,
    id_col: str,
    time_col: str,
    f: Callable[[np.ndarray, int], _Features],
) -> Tuple[DataFrame, DataFrame]:
    """Validate the input, compute features with `f` and attach them to `df`.

    `f` is called as `f(sizes=..., h=...)` with the number of rows of each
    series and the horizon. It must return the feature names, the values for
    the existing rows (laid out in id/time sorted order) and the values for
    the future rows.

    Returns a tuple with the input plus the feature columns and a dataframe
    with the next `h` timestamps of every series plus the same feature
    columns. When `h` is zero the second element is an empty dataframe.

    Raises `ValueError` if `h` is not a non-negative integer.
    """
    # validations
    if not (isinstance(h, int) and h >= 0):
        raise ValueError('`h` must be a non-negative integer')
    validate_format(df, id_col, time_col, None)
    validate_freq(df[time_col], freq)

    # one entry per series: its id and how many rows it has
    counts_per_id = ufp.counts_by_id(df, id_col)
    series_ids = counts_per_id[id_col]
    series_sizes = counts_per_id['counts'].to_numpy()

    # compute values
    feat_names, feat_vals, feat_future_vals = f(sizes=series_sizes, h=h)  # type: ignore

    # line the computed values up with the row order of the input
    order = ufp.maybe_compute_sort_indices(df, id_col, time_col)
    if order is None:
        # no reordering needed, the times can be used as they are
        sorted_times = df[time_col]
    else:
        # invert the sorting permutation so that the values (computed in
        # sorted order) land on the rows they belong to in the original order
        inverse_order = np.empty_like(order)
        inverse_order[order] = np.arange(order.size)
        feat_vals = feat_vals[inverse_order]
        sorted_times = ufp.take_rows(df[time_col], order)
    # in sorted order the last row of each series sits at cumsum(sizes) - 1
    final_positions = series_sizes.cumsum() - 1
    last_times = ufp.take_rows(sorted_times, final_positions)

    df = ufp.copy_if_pandas(df, deep=False)
    transformed = ufp.assign_columns(df, feat_names, feat_vals)
    if h == 0:
        return transformed, type(df)({})

    # future vals
    future_df = ufp.make_future_dataframe(
        uids=series_ids,
        last_times=last_times,
        freq=freq,
        h=h,
        id_col=id_col,
        time_col=time_col,
    )
    return transformed, ufp.assign_columns(future_df, feat_names, feat_future_vals)

def _assign_slices(
    sizes: np.ndarray,
    feats: np.ndarray,
    h: int,
) -> Tuple[np.ndarray, np.ndarray]:
    n_feats = feats.shape[1]
    vals = np.empty((sizes.sum(), n_feats), dtype=np.float32)
    future_vals = np.empty((h * sizes.size, n_feats))
    start = 0
    for i, size in enumerate(sizes):
        vals[start : start + size, :] = feats[-(size + h): -h]
        future_vals[i * h: (i + 1) * h] = feats[-h:]
        start += size
    return vals, future_vals

def _fourier(
    sizes: np.ndarray,
    h: int,
    season_length: int,
    k: int,
) -> _Features:
    """Build sine and cosine terms of orders 1..k for every series.

    The terms are evaluated on the steps 1..(sizes.max() + h) in float32 and
    then sliced per series by `_assign_slices`. The returned column names are
    the `k` sine columns followed by the `k` cosine columns, matching the
    column order of the stacked terms.
    """
    # taken from: https://github.com/tblume1992/TSUtilities/blob/main/TSUtilities/TSFeatures/fourier_seasonality.py
    n_steps = sizes.max() + 1 + h
    steps = np.arange(1, n_steps, dtype=np.float32)
    # angular frequency of each order
    freqs = (2 * np.pi * np.arange(1, k + 1) / season_length).astype(np.float32)
    # one row per step, one column per order
    angles = freqs * steps[:, None]
    terms = np.hstack([np.sin(angles), np.cos(angles)])
    cols = []
    for op in ('sin', 'cos'):
        for i in range(k):
            cols.append(f'{op}{i+1}_{season_length}')
    vals, future_vals = _assign_slices(sizes=sizes, feats=terms, h=h)
    return cols, vals, future_vals

def _trend(sizes: np.ndarray, h: int) -> _Features:
    """Build a single `trend` feature that counts the steps 1..(sizes.max() + h).

    The counter is a float32 column that `_assign_slices` slices per series.
    """
    n_steps = sizes.max() + 1 + h
    # column vector: one row per step, a single feature
    steps = np.arange(1, n_steps, dtype=np.float32)[:, None]
    vals, future_vals = _assign_slices(sizes=sizes, feats=steps, h=h)
    return ['trend'], vals, future_vals

In [None]:
#| export
def fourier(
    df: DataFrame,
    freq: str,
    season_length: int,
    k: int,
    h: int = 0,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
) -> Tuple[DataFrame, DataFrame]:
    """Add fourier seasonal terms to the data and build them for the forecast horizon

    For every order `i` in 1..k the columns `sin{i}_{season_length}` and
    `cos{i}_{season_length}` are added.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Dataframe with ids, times and values for the exogenous regressors.
    freq : str or int
        Frequency of the data. Must be a valid pandas or polars offset alias, or an integer.
    season_length : int
        Number of observations in one seasonal cycle, e.g. 7 for daily data with a weekly pattern.
    k : int
        Maximum order of the fourier terms.
    h : int (default=0)
        Forecast horizon.
    id_col : str (default='unique_id')
        Column that identifies each series.
    time_col : str (default='ds')
        Column that identifies each timestep, its values can be timestamps or integers.

    Returns
    -------
    transformed_df : pandas or polars DataFrame
        Original DataFrame with the computed features.
    future_df : pandas or polars DataFrame
        DataFrame with the features for the next `h` timesteps of each series.
    """
    def _fourier_terms(sizes, h):
        # bind the seasonal settings, leaving only the arguments that
        # `_add_features` supplies
        return _fourier(sizes=sizes, h=h, season_length=season_length, k=k)

    return _add_features(df, freq, h, id_col, time_col, _fourier_terms)

In [None]:
import pandas as pd

from utilsforecast.data import generate_series

In [None]:
# synthetic panel with five series; equal_ends=True is requested so that they share their last date
series = generate_series(5, equal_ends=True)
# two pairs of sin/cos terms with a period of seven steps, plus the values for one future step
transformed_df, future_df = fourier(series, freq='D', season_length=7, k=2, h=1)
transformed_df

Unnamed: 0,unique_id,ds,y,sin1_7,sin2_7,cos1_7,cos2_7
0,0,2000-10-05,0.428973,-0.974927,0.433894,-0.222526,-0.900964
1,0,2000-10-06,1.423626,-0.781835,-0.974926,0.623486,-0.222531
2,0,2000-10-07,2.311782,-0.000005,-0.000009,1.000000,1.000000
3,0,2000-10-08,3.192191,0.781829,0.974930,0.623493,-0.222512
4,0,2000-10-09,4.148767,0.974929,-0.433877,-0.222517,-0.900972
...,...,...,...,...,...,...,...
1096,4,2001-05-10,4.058910,-0.974927,0.433888,-0.222523,-0.900967
1097,4,2001-05-11,5.178157,-0.781823,-0.974934,0.623500,-0.222495
1098,4,2001-05-12,6.133142,-0.000002,-0.000003,1.000000,1.000000
1099,4,2001-05-13,0.403709,0.781840,0.974922,0.623479,-0.222548


In [None]:
future_df

Unnamed: 0,unique_id,ds,sin1_7,sin2_7,cos1_7,cos2_7
0,0,2001-05-15,0.433871,-0.781813,-0.900975,0.623513
1,1,2001-05-15,0.433871,-0.781813,-0.900975,0.623513
2,2,2001-05-15,0.433871,-0.781813,-0.900975,0.623513
3,3,2001-05-15,0.433871,-0.781813,-0.900975,0.623513
4,4,2001-05-15,0.433871,-0.781813,-0.900975,0.623513


In [None]:
#| hide
# the features must not depend on the row order of the input: shuffle the rows
# (seeded, so that a failure can be reproduced), recompute and compare after
# sorting the result back
shuffled_series = series.sample(frac=1.0, random_state=0)
transformed_df2, future_df2 = fourier(shuffled_series, freq='D', season_length=7, k=2, h=1)
pd.testing.assert_frame_equal(
    transformed_df,
    transformed_df2.sort_values(['unique_id', 'ds']).reset_index(drop=True)
)
pd.testing.assert_frame_equal(future_df, future_df2)

In [None]:
#| hide
#| polars
import polars as pl
import polars.testing

In [None]:
#| hide
#| polars
series_pl = generate_series(5, equal_ends=True, engine='polars')
transformed_pl, future_pl = fourier(series_pl, freq='1d', season_length=7, k=2, h=1)
# shuffle=True is needed to really exercise the unsorted path: with the default
# shuffle=False polars gives no guarantee that the rows get reordered
shuffled_pl = series_pl.sample(fraction=1.0, shuffle=True, seed=0)
transformed_pl2, future_pl2 = fourier(shuffled_pl, freq='1d', season_length=7, k=2, h=1)
# the transformed frame keeps the (shuffled) input order, sort it back before comparing
pl.testing.assert_frame_equal(transformed_pl, transformed_pl2.sort(['unique_id', 'ds']))
pl.testing.assert_frame_equal(future_pl, future_pl2)
# pandas and polars must agree; the id column is left out of the comparison
pd.testing.assert_frame_equal(
    transformed_df.drop(columns=['unique_id']),
    transformed_pl.drop(columns=['unique_id']).to_pandas()
)
pd.testing.assert_frame_equal(
    future_df.drop(columns=['unique_id']),
    future_pl.drop(columns=['unique_id']).to_pandas()
)

In [None]:
#| export
def trend(
    df: DataFrame,
    freq: str,
    h: int = 0,
    id_col: str = 'unique_id',
    time_col: str = 'ds',
) -> Tuple[DataFrame, DataFrame]:
    """Add a trend feature for training and forecasting

    The feature is a `trend` column that increases by one on every timestep.
    Series are aligned at their end: the last observation of every series
    gets the length of the longest series and the future rows continue
    counting from there.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Dataframe with ids, times and values for the exogenous regressors.
    freq : str or int
        Frequency of the data. Must be a valid pandas or polars offset alias, or an integer.
    h : int (default=0)
        Forecast horizon.
    id_col : str (default='unique_id')
        Column that identifies each series.
    time_col : str (default='ds')
        Column that identifies each timestep, its values can be timestamps or integers.

    Returns
    -------
    transformed_df : pandas or polars DataFrame
        Original DataFrame with the computed features.
    future_df : pandas or polars DataFrame
        DataFrame with the trend for the next `h` timesteps of each series.
    """
    return _add_features(
        df=df,
        freq=freq,
        h=h,
        id_col=id_col,
        time_col=time_col,
        f=_trend,
    )

In [None]:
# same synthetic panel as in the fourier example
series = generate_series(5, equal_ends=True)
# adds a `trend` column to the data and builds it for one future step per series
transformed_df, future_df = trend(series, freq='D', h=1)
transformed_df

Unnamed: 0,unique_id,ds,y,trend
0,0,2000-10-05,0.428973,152.0
1,0,2000-10-06,1.423626,153.0
2,0,2000-10-07,2.311782,154.0
3,0,2000-10-08,3.192191,155.0
4,0,2000-10-09,4.148767,156.0
...,...,...,...,...
1096,4,2001-05-10,4.058910,369.0
1097,4,2001-05-11,5.178157,370.0
1098,4,2001-05-12,6.133142,371.0
1099,4,2001-05-13,0.403709,372.0


In [None]:
future_df

Unnamed: 0,unique_id,ds,trend
0,0,2001-05-15,374.0
1,1,2001-05-15,374.0
2,2,2001-05-15,374.0
3,3,2001-05-15,374.0
4,4,2001-05-15,374.0
