In [None]:
#| default_exp target_transforms

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

# Target transforms
Transformations that can be applied to the target before computing the features and restored after computing the predictions.

In [None]:
#| export
import abc
import reprlib
from typing import Iterable, List, Optional

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, clone
from numba import njit

from mlforecast.grouped_array import GroupedArray, _apply_difference
from mlforecast.utils import _ensure_shallow_copy

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer

from mlforecast import MLForecast
from mlforecast.utils import generate_daily_series

In [None]:
#| export
class BaseTargetTransform(abc.ABC):
    """Abstract interface for transformations applied to the target column.

    Concrete transforms implement `fit_transform` and `inverse_transform`;
    the column names they operate on are provided through `set_column_names`.
    """
    # Optional selection of series positions. Subclasses in this module use it
    # in `inverse_transform` to pick the stored state of a subset of the series.
    idxs: Optional[np.ndarray] = None

    def set_column_names(self, id_col: str, time_col: str, target_col: str):
        """Store the names of the id, time and target columns on the instance."""
        self.id_col, self.time_col, self.target_col = id_col, time_col, target_col

    @abc.abstractmethod
    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit the transformation on `df` and return the transformed frame."""
        raise NotImplementedError

    @abc.abstractmethod
    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Undo the transformation on the value columns of `df`."""
        raise NotImplementedError

In [None]:
#| export
class Differences(BaseTargetTransform):
    """Subtracts previous values of the series. Can be used to remove trend or seasonalities."""
    # When True, `fit_transform` keeps a copy of the target as it was before
    # each difference so that `inverse_transform` can restore fitted values.
    store_fitted = False
    
    def __init__(self, differences: Iterable[int]):
        """
        Parameters
        ----------
        differences : iterable of int
            Lags to subtract; they are applied one after another, in order.
        """
        self.differences = list(differences)

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply every difference to the target column and store what is needed to undo it.

        Sets `fitted_` (copies of the target taken before each difference, only
        when `store_fitted` is True) and `original_values_` (one GroupedArray
        per difference, holding `d` values per series).

        Raises
        ------
        ValueError
            If a series has fewer observations than the sum of the differences.
        """
        self.fitted_: List[GroupedArray] = []
        ga = GroupedArray.from_sorted_df(df, self.id_col, self.target_col)
        uids = df[self.id_col].unique()  
        original_sizes = np.diff(ga.indptr)
        total_diffs = sum(self.differences)
        small_series = uids[original_sizes < total_diffs]
        if small_series.size:
            msg = reprlib.repr(small_series.tolist())
            raise ValueError(f'The following series are too short for the differences: {msg}')
        self.original_values_ = []
        n_series = len(ga.indptr) - 1
        for d in self.differences:
            if self.store_fitted:
                # these are saved in order to be able to perform a correct
                # inverse transform when trying to retrieve the fitted values.
                self.fitted_.append(GroupedArray(ga.data.copy(), ga.indptr.copy()))
            # room for d values per series, laid out contiguously
            new_data = np.empty_like(ga.data, shape=n_series * d)
            new_indptr = d * np.arange(n_series + 1, dtype=np.int32)
            # NOTE(review): `_apply_difference` is expected to difference `ga.data`
            # in place and fill `new_data` with the values required to undo it
            # (`ga.data` is what gets written to the frame below) -- confirm in grouped_array.
            _apply_difference(ga.data, ga.indptr, new_data, new_indptr, d)
            self.original_values_.append(GroupedArray(new_data, new_indptr))
        df = df.copy(deep=False)
        df = _ensure_shallow_copy(df)
        df[self.target_col] = ga.data
        return df

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Undo the differences on every column other than the id and time columns.

        If fitted values were stored during `fit_transform`, they are used to
        restore the columns group by group; otherwise the stored
        `original_values_` are used (restricted to `idxs` when it is set).
        The differences are undone in reverse order of application.
        """
        model_cols = df.columns.drop([self.id_col, self.time_col])
        df = df.copy(deep=False)
        df = _ensure_shallow_copy(df)
        restore_fitted = bool(self.fitted_)
        if restore_fitted:
            # the group boundaries depend only on the id column, so they are
            # computed once instead of once per model column
            sizes = df[self.id_col].value_counts().sort_index().values
            indptr = np.append(0, sizes.cumsum())
        for model in model_cols:
            # work on a copy: the restore methods write into the array they receive
            model_preds = df[model].values.copy()
            if restore_fitted:
                for d, ga in zip(reversed(self.differences), reversed(self.fitted_)):
                    ga.restore_fitted_difference(model_preds, indptr, d)
            else:
                for d, ga in zip(reversed(self.differences), reversed(self.original_values_)):
                    if self.idxs is not None:
                        ga = ga.take(self.idxs)
                    ga.restore_difference(model_preds, d)
            df[model] = model_preds
        return df

In [None]:
# Synthetic data shared by the checks below. min_length == max_length gives every
# series the same number of rows, which the LocalStandardScaler round-trip relies on
# (its inverse kernel splits the rows evenly across the series).
series = generate_daily_series(10, min_length=50, max_length=50)

In [None]:
diffs = Differences([1, 2, 5])
diffs.set_column_names('unique_id', 'ds', 'y')

# differences are applied correctly: with the default store_fitted=False nothing
# is kept for the fitted values, and the output matches applying the same lags
# one after another with pandas group-wise shifts
transformed = diffs.fit_transform(series)
assert diffs.fitted_ == []
expected = series.copy()
for d in diffs.differences:
    expected['y'] -= expected.groupby('unique_id')['y'].shift(d)
pd.testing.assert_frame_equal(transformed, expected)

# fitted differences are restored correctly, both when the full transformed frame
# is passed and when the rows whose differenced target is null were dropped first
diffs.store_fitted = True
transformed = diffs.fit_transform(series)
keep_mask = transformed['y'].notnull()
restored = diffs.inverse_transform(transformed)
pd.testing.assert_frame_equal(series[keep_mask], restored[keep_mask])
restored_subs = diffs.inverse_transform(transformed[keep_mask])
pd.testing.assert_frame_equal(series[keep_mask], restored_subs)

In [None]:
#| exporti
@njit
def _standard_scaler_transform(data, indptr, stats, out):
    """Standardize each group of `data` and record its statistics.

    Group `i` spans `data[indptr[i]:indptr[i + 1]]`. Its NaN-ignoring mean and
    standard deviation are written to `stats[i]` and the standardized values
    are written to the same positions of `out`. `data` is not modified.
    """
    for i in range(len(indptr) - 1):
        start = indptr[i]
        end = indptr[i + 1]
        serie = data[start:end]
        mu = np.nanmean(serie)
        sigma = np.nanstd(serie)
        stats[i, 0] = mu
        stats[i, 1] = sigma
        out[start:end] = (serie - mu) / sigma

@njit
def _standard_scaler_inverse_transform(preds, stats):
    """Undo the standardization of `preds` in place.

    `preds` is treated as `stats.shape[0]` consecutive chunks of equal length
    (`preds.size // stats.shape[0]`); chunk `i` is multiplied by `stats[i, 1]`
    and shifted by `stats[i, 0]`. Any trailing remainder is left untouched.
    """
    n_series = stats.shape[0]
    horizon = preds.size // n_series
    for i in range(n_series):
        mu = stats[i, 0]
        sigma = stats[i, 1]
        offset = i * horizon
        for j in range(offset, offset + horizon):
            preds[j] = preds[j] * sigma + mu

In [None]:
#| export
class LocalStandardScaler(BaseTargetTransform):
    """Standardizes each serie by subtracting its mean and dividing by its standard deviation."""
    
    def fit_transform(self, df: 'pd.DataFrame') -> 'pd.DataFrame':
        ga = GroupedArray.from_sorted_df(df, self.id_col, self.target_col)
        self.stats_ = np.empty((len(ga.indptr) - 1, 2))        
        out = np.empty_like(ga.data)
        _standard_scaler_transform(ga.data, ga.indptr, self.stats_, out)
        df = df.copy(deep=False)
        df[self.target_col] = out
        return df

    def inverse_transform(self, df: 'pd.DataFrame') -> 'pd.DataFrame':        
        df = df.copy(deep=False)
        model_cols = df.columns.drop([self.id_col, self.time_col])
        stats = self.stats_ if self.idxs is None else self.stats_[self.idxs]
        for model in model_cols:
            model_preds = df[model].values
            _standard_scaler_inverse_transform(model_preds, stats)
            df[model] = model_preds
        return df

In [None]:
sc = LocalStandardScaler()
sc.set_column_names('unique_id', 'ds', 'y')
pd.testing.assert_frame_equal(
    sc.inverse_transform(sc.fit_transform(series)),
    series,
)
subset = series[series['unique_id'].isin(['id_0', 'id_7'])]
sc.idxs = [0, 7]
pd.testing.assert_frame_equal(
    sc.inverse_transform(subset),
    subset
)

In [None]:
#| export
class GlobalSklearnTransformer(BaseTargetTransform):
    """Applies the same scikit-learn transformer to all series."""
    def __init__(self, transformer: TransformerMixin):
        """
        Parameters
        ----------
        transformer : TransformerMixin
            Template transformer; it is cloned on every fit and never fitted itself.
        """
        self.transformer = transformer

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit a clone of the transformer on the target column and return the transformed frame.

        The fitted clone is stored in `transformer_`.
        """
        result = df.copy(deep=False)
        self.transformer_ = clone(self.transformer)
        # scikit-learn expects a 2D input, hence the single-column frame
        target_2d = result[[self.target_col]].values
        result[self.target_col] = self.transformer_.fit_transform(target_2d)
        return result

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the fitted transformer's inverse to every column other than the id and time columns."""
        result = df.copy(deep=False)
        for col in result.columns.drop([self.id_col, self.time_col]):
            values_2d = result[[col]].values
            result[col] = self.transformer_.inverse_transform(values_2d)
        return result

In [None]:
# Integration check: a global Box-Cox transform followed by a first difference,
# applied through MLForecast, must match computing the same thing by hand.
sk_boxcox = PowerTransformer(method='box-cox', standardize=False)
boxcox_global = GlobalSklearnTransformer(sk_boxcox)
single_difference = Differences([1])
# NOTE: rebinds `series`; from here on it is no longer the fixed-length data used above
series = generate_daily_series(10)
fcst = MLForecast(
    models=[LinearRegression()],
    freq='D',
    lags=[1, 2],
    target_transforms=[boxcox_global, single_difference]
)
# presumably dropna=False keeps the rows with null targets/lags so that `prep`
# lines up row by row with `series` -- the comparison below relies on equal lengths
prep = fcst.preprocess(series, dropna=False)
# GlobalSklearnTransformer.fit_transform works on a clone of `sk_boxcox`, so the
# original object is fitted here independently to build the expected values
expected = (
    pd.Series(
        sk_boxcox.fit_transform(series[['y']])[:, 0], index=series['unique_id']
    ).groupby('unique_id')
    .diff()
    .values
)
np.testing.assert_allclose(prep['y'].values, expected)