In [None]:
#| default_exp target_transforms

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

# Target transforms
Transformations that can be applied to the target before computing the features and restored after computing the predictions.

In [None]:
#| export
import abc
import copy
from typing import Iterable, List, Optional

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, clone
from utilsforecast.compat import DataFrame
from utilsforecast.target_transforms import (
    LocalBoxCox as BoxCox,
    LocalMinMaxScaler as MinMaxScaler,
    LocalRobustScaler as RobustScaler,
    LocalStandardScaler as StandardScaler,
    _common_scaler_inverse_transform,
    _transform,
)

from mlforecast.grouped_array import GroupedArray, _apply_difference
from mlforecast.utils import _ShortSeriesException

In [None]:
import pandas as pd
from fastcore.test import test_fail
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer
from utilsforecast.processing import counts_by_id

from mlforecast import MLForecast
from mlforecast.utils import generate_daily_series

In [None]:
#| export
class BaseTargetTransform(abc.ABC):
    """Interface for target transformations that operate on whole dataframes.

    Subclasses must implement `fit_transform` and `inverse_transform`.
    `update` is optional: the default implementation raises `NotImplementedError`.
    """

    def set_column_names(self, id_col: str, time_col: str, target_col: str):
        """Store the names of the id, time and target columns on the instance."""
        self.id_col, self.time_col, self.target_col = id_col, time_col, target_col

    def update(self, df: DataFrame) -> DataFrame:
        """Transform new data; not supported unless a subclass overrides it."""
        raise NotImplementedError

    @abc.abstractmethod
    def fit_transform(self, df: DataFrame) -> DataFrame:
        """Fit the transformation on `df` and return the transformed dataframe."""

    @abc.abstractmethod
    def inverse_transform(self, df: DataFrame) -> DataFrame:
        """Undo the transformation on `df` and return the restored dataframe."""

In [None]:
#| export
class BaseGroupedArrayTargetTransform(abc.ABC):
    """Interface for target transformations that operate on grouped arrays.

    Subclasses must implement `update`, `fit_transform` and `inverse_transform`.
    """

    # Optional selection of groups. The concrete transforms in this module use it
    # to subset their stored per-group state before inverting.
    idxs: Optional[np.ndarray] = None

    @abc.abstractmethod
    def update(self, ga: GroupedArray) -> GroupedArray:
        """Transform new observations using the state computed during fitting."""

    @abc.abstractmethod
    def fit_transform(self, ga: GroupedArray) -> GroupedArray:
        """Fit the transformation on `ga` and return the transformed grouped array."""

    @abc.abstractmethod
    def inverse_transform(self, ga: GroupedArray) -> GroupedArray:
        """Undo the transformation on `ga`."""

    def inverse_transform_fitted(self, ga: GroupedArray) -> GroupedArray:
        """Undo the transformation on fitted values.

        Delegates to `inverse_transform`; subclasses override it when the two differ.
        """
        restored = self.inverse_transform(ga)
        return restored

In [None]:
#| export
class Differences(BaseGroupedArrayTargetTransform):
    """Applies one or more lag differences to each serie, e.g. to remove trend or seasonality.

    Parameters
    ----------
    differences : iterable of int
        Lags to subtract. They are applied sequentially, in the given order.
    """
    # When True, `fit_transform` keeps a copy of the data as it was before each
    # difference so that `inverse_transform_fitted` can restore fitted values.
    store_fitted = False

    def __init__(self, differences: Iterable[int]):
        self.differences = list(differences)

    def fit_transform(self, ga: GroupedArray) -> GroupedArray:
        """Apply every difference to a copy of `ga` and keep what is needed to undo them.

        Raises
        ------
        _ShortSeriesException
            If any serie has fewer observations than the sum of the differences.
        """
        ga = copy.copy(ga)
        self.fitted_: List[GroupedArray] = []
        sizes = np.diff(ga.indptr)
        too_short = sizes < sum(self.differences)
        if too_short.any():
            raise _ShortSeriesException(np.arange(ga.n_groups)[too_short])
        self.original_values_ = []
        n_series = len(ga.indptr) - 1
        group_positions = np.arange(n_series + 1, dtype=np.int32)
        for lag in self.differences:
            if self.store_fitted:
                # snapshot of the data before this difference; required to
                # invert the transformation on fitted values later on.
                self.fitted_.append(copy.copy(ga))
            # one block of `lag` values per serie
            kept_values = np.empty_like(ga.data, shape=n_series * lag)
            kept_indptr = lag * group_positions
            # NOTE(review): presumably differences `ga.data` in place and fills
            # `kept_values` with the trailing values of each serie -- confirm
            # against `_apply_difference`.
            _apply_difference(ga.data, ga.indptr, kept_values, kept_indptr, lag)
            self.original_values_.append(GroupedArray(kept_values, kept_indptr))
        return ga

    def update(self, ga: GroupedArray) -> GroupedArray:
        """Difference new observations, passing a copy of `ga` through each stored array in order."""
        differenced = copy.copy(ga)
        for lag, kept in zip(self.differences, self.original_values_):
            kept.update_difference(lag, differenced)
        return differenced

    def inverse_transform(self, ga: GroupedArray) -> GroupedArray:
        """Undo the differences on a copy of `ga`, from the last one applied to the first."""
        ga = copy.copy(ga)
        lags = reversed(self.differences)
        kept_arrays = reversed(self.original_values_)
        for lag, kept in zip(lags, kept_arrays):
            if self.idxs is not None:
                # only a subset of the series is being restored
                kept = kept.take(self.idxs)
            kept.restore_difference(ga.data, lag)
        return ga

    def inverse_transform_fitted(self, ga: GroupedArray) -> GroupedArray:
        """Undo the differences on fitted values using the snapshots saved in `fitted_`.

        `fitted_` is only populated when `store_fitted` was True during `fit_transform`.
        """
        ga = copy.copy(ga)
        lags = reversed(self.differences)
        snapshots = reversed(self.fitted_)
        for lag, snapshot in zip(lags, snapshots):
            if self.idxs is not None:
                snapshot = snapshot.take(self.idxs)
            snapshot.restore_fitted_difference(ga.data, ga.indptr, lag)
        return ga

In [None]:
# Synthetic daily series shared by the test cells below (lengths bounded by min_length/max_length).
series = generate_daily_series(10, min_length=50, max_length=100)

In [None]:
# Build a GroupedArray from the long-format frame: indptr holds the cumulative
# number of rows per unique_id, with a leading 0.
diffs = Differences([1, 2, 5])
id_counts = counts_by_id(series, 'unique_id')
indptr = np.append(0, id_counts['counts'].cumsum())
ga = GroupedArray(series['y'].values, indptr)

# differences are applied correctly
transformed = diffs.fit_transform(ga)
# store_fitted is False by default, so no snapshots should have been saved
assert diffs.fitted_ == []
# reference implementation: sequential per-series shifts with pandas
expected = series.copy()
for d in diffs.differences:
    expected['y'] -= expected.groupby('unique_id')['y'].shift(d)
np.testing.assert_allclose(transformed.data, expected['y'].values)

# fitted differences are restored correctly
diffs.store_fitted = True
transformed = diffs.fit_transform(ga)
# rows that are NaN after differencing cannot be restored, compare only the rest
keep_mask = ~np.isnan(transformed.data)
restored = diffs.inverse_transform_fitted(transformed)
np.testing.assert_allclose(ga.data[keep_mask], restored.data[keep_mask])
# 8 == sum(diffs.differences): restoring also works after dropping the first 8 rows of each group
restored_subs = diffs.inverse_transform_fitted(transformed.take_from_groups(slice(8, None)))
np.testing.assert_allclose(ga.data[keep_mask], restored_subs.data)

# test update: one new observation per series (10 groups of size 1)
new_ga = GroupedArray(np.random.rand(10), np.arange(11))
# first stored value of each series' block, for every difference (blocks have d values each)
prev_orig = [diffs.original_values_[i].data[::d].copy() for i, d in enumerate(diffs.differences)]
expected = new_ga.data - np.add.reduce(prev_orig)
updates = diffs.update(new_ga)
np.testing.assert_allclose(expected, updates.data)
# the stored values must have been rolled forward to include the new observation
np.testing.assert_allclose(diffs.original_values_[0].data, new_ga.data)
np.testing.assert_allclose(diffs.original_values_[1].data[1::2], new_ga.data - prev_orig[0])
np.testing.assert_allclose(diffs.original_values_[2].data[4::5], new_ga.data - np.add.reduce(prev_orig[:2]))
# variable sizes: the update has 1 new value for the first series and 3 for the second
diff1 = Differences([1])
ga = GroupedArray(np.arange(10), np.array([0, 3, 10]))
diff1.fit_transform(ga)
new_ga = GroupedArray(np.arange(4), np.array([0, 1, 4]))
updates = diff1.update(new_ga)
np.testing.assert_allclose(updates.data, np.array([0 - 2, 1 - 9, 2 - 1, 3 - 2]))
np.testing.assert_allclose(diff1.original_values_[0].data, np.array([0, 3]))

# short series: the first group has 2 rows, fewer than sum([1, 2, 5]), so fitting must fail and report index 0
ga = GroupedArray(np.arange(20), np.array([0, 2, 20]))
test_fail(lambda: diffs.fit_transform(ga), contains="[0]")

In [None]:
#| exporti
class BaseLocalScaler(BaseGroupedArrayTargetTransform):
    """Shared logic for scalers that keep separate statistics for each serie.

    Subclasses provide `scaler_factory`, a zero-argument callable that builds the
    underlying scaler; the built scaler is stored in `scaler_` by `fit_transform`.
    """
    scaler_factory: type

    def update(self, ga: GroupedArray) -> GroupedArray:
        """Scale new observations with the already fitted scaler."""
        scaled = self.scaler_.transform(ga)
        return GroupedArray(scaled, ga.indptr)

    def fit_transform(self, ga: GroupedArray) -> GroupedArray:
        """Build a fresh scaler, fit it on `ga` and return the scaled values."""
        self.scaler_ = self.scaler_factory()
        scaled = self.scaler_.fit_transform(ga)
        return GroupedArray(scaled, ga.indptr)

    def inverse_transform(self, ga: GroupedArray) -> GroupedArray:
        """Restore the original scale of `ga`.

        Raises
        ------
        ValueError
            If the number of groups in `ga` differs from the number of rows in the
            (optionally `idxs`-subsetted) fitted statistics.
        """
        fitted_stats = self.scaler_.stats_
        if self.idxs is not None:
            # keep only the statistics of the requested series
            fitted_stats = fitted_stats[self.idxs]
        if ga.n_groups != fitted_stats.shape[0]:
            raise ValueError('Found different number of groups in scaler.')
        restored = _transform(
            ga.data, ga.indptr, fitted_stats, _common_scaler_inverse_transform
        )
        return GroupedArray(restored, ga.indptr)

    def inverse_transform_fitted(self, ga: GroupedArray) -> GroupedArray:
        """Fitted values are restored exactly like predictions."""
        return self.inverse_transform(ga)

In [None]:
def test_scaler(sc, series):
    """Round-trip checks for a local scaler.

    Verifies that `inverse_transform` undoes `fit_transform`, that `update` on the
    training data reproduces the fitted output, and that inverting a subset of the
    groups works once `sc.idxs` is set. Note that `sc.idxs` is left set on return.
    """
    group_sizes = counts_by_id(series, 'unique_id')['counts']
    indptr = np.append(0, group_sizes.cumsum())
    ga = GroupedArray(series['y'].values, indptr)

    # fit -> inverse must give back the input
    scaled = sc.fit_transform(ga)
    round_trip = sc.inverse_transform(scaled)
    np.testing.assert_allclose(round_trip.data, ga.data)

    # updating with the training data must match the fitted output
    rescaled = sc.update(ga)
    np.testing.assert_allclose(scaled.data, rescaled.data)

    # inverting only some of the groups
    idxs = [0, 7]
    expected_subset = ga.take(idxs)
    scaled_subset = scaled.take(idxs)
    sc.idxs = idxs
    restored_subset = sc.inverse_transform(scaled_subset)
    np.testing.assert_allclose(restored_subset.data, expected_subset.data)

In [None]:
#| export
class LocalStandardScaler(BaseLocalScaler):
    """Standardizes each serie by subtracting its mean and dividing by its standard deviation."""
    # called without arguments by `BaseLocalScaler.fit_transform` to build the scaler
    scaler_factory = StandardScaler

In [None]:
# round-trip checks for the standard scaler on the shared synthetic series
test_scaler(LocalStandardScaler(), series)

In [None]:
#| export
class LocalMinMaxScaler(BaseLocalScaler):
    """Scales each serie to be in the [0, 1] interval."""
    # called without arguments by `BaseLocalScaler.fit_transform` to build the scaler
    scaler_factory = MinMaxScaler

In [None]:
# round-trip checks for the min-max scaler on the shared synthetic series
test_scaler(LocalMinMaxScaler(), series)

In [None]:
#| export
class LocalRobustScaler(BaseLocalScaler):
    """Scaler robust to outliers.

    Parameters
    ----------
    scale : str (default='iqr')
        Statistic to use for scaling. Can be either 'iqr' (Inter Quartile Range) or 'mad' (Median Absolute Deviation)
    """

    def __init__(self, scale: str = 'iqr'):
        # Imported locally so this block does not depend on the module-level imports.
        from functools import partial

        # A partial (unlike a lambda defined here) can be pickled with the standard
        # library, so instances of this transform remain serializable.
        # It is still a zero-argument factory, as `BaseLocalScaler.fit_transform` expects.
        self.scaler_factory = partial(RobustScaler, scale)  # type: ignore

In [None]:
# round-trip checks for the robust scaler using the inter quartile range
test_scaler(LocalRobustScaler(scale='iqr'), series)

In [None]:
# round-trip checks for the robust scaler using the median absolute deviation
test_scaler(LocalRobustScaler(scale='mad'), series)

In [None]:
#| export
class LocalBoxCox(BaseLocalScaler):
    """Finds the optimum lambda for each serie and applies the Box-Cox transformation"""
    def __init__(self):
        # Unlike the other local scalers, the underlying scaler is created once here
        # and refitted on every call to `fit_transform`.
        self.scaler_ = BoxCox()

    def fit_transform(self, ga: GroupedArray) -> GroupedArray:
        """Fit one lambda per serie on `ga` and return the transformed values."""
        return GroupedArray(self.scaler_.fit_transform(ga), ga.indptr)

    def inverse_transform(self, ga: GroupedArray) -> GroupedArray:
        """Undo the Box-Cox transformation using the fitted lambdas.

        Raises
        ------
        ValueError
            If the number of groups in `ga` differs from the number of
            (optionally `idxs`-subsetted) fitted lambdas.
        """
        from scipy.special import inv_boxcox1p

        lmbdas = self.scaler_.lmbdas_
        if self.idxs is not None:
            # keep only the lambdas of the requested series
            lmbdas = lmbdas[self.idxs]
        # Same guard as `BaseLocalScaler.inverse_transform`. Without it, a `ga` with a
        # single group would make `np.repeat` broadcast the repeats over every lambda
        # instead of failing.
        if len(lmbdas) != ga.n_groups:
            raise ValueError('Found different number of groups in scaler.')
        sizes = np.diff(ga.indptr)
        # expand to one lambda per observation
        lmbdas = np.repeat(lmbdas, sizes, axis=0)
        return GroupedArray(inv_boxcox1p(ga.data, lmbdas), ga.indptr)

In [None]:
# round-trip checks for the Box-Cox transformation on the shared synthetic series
test_scaler(LocalBoxCox(), series)

In [None]:
#| export
class GlobalSklearnTransformer(BaseTargetTransform):
    """Applies the same scikit-learn transformer to all series.

    Parameters
    ----------
    transformer : sklearn.base.TransformerMixin
        Unfitted transformer. A clone of it is fitted in `fit_transform` and stored
        in `transformer_`, so the provided instance is left untouched.
    """

    def __init__(self, transformer: TransformerMixin):
        self.transformer = transformer

    def update(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform the target column of new data with the fitted transformer."""
        target = self.target_col
        out = df.copy(deep=False)
        # scikit-learn expects 2d input, hence the single-column frame
        target_values = out[[target]].values
        out[target] = self.transformer_.transform(target_values)
        return out

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fit a clone of the transformer on the target column and transform it."""
        target = self.target_col
        out = df.copy(deep=False)
        self.transformer_ = clone(self.transformer)
        target_values = out[[target]].values
        out[target] = self.transformer_.fit_transform(target_values)
        return out

    def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the inverse transformation to every column except the id and time columns."""
        out = df.copy(deep=False)
        key_cols = [self.id_col, self.time_col]
        for col in out.columns.drop(key_cols):
            out[col] = self.transformer_.inverse_transform(out[[col]].values)
        return out

In [None]:
# need this import in order for isinstance to work
from mlforecast.target_transforms import Differences as ExportedDifferences

In [None]:
# Integration check: a global Box-Cox followed by a first difference, applied through MLForecast.
sk_boxcox = PowerTransformer(method='box-cox', standardize=False)
boxcox_global = GlobalSklearnTransformer(sk_boxcox)
single_difference = ExportedDifferences([1])
series = generate_daily_series(10)
fcst = MLForecast(
    models=[LinearRegression()],
    freq='D',
    lags=[1, 2],
    target_transforms=[boxcox_global, single_difference]
)
# dropna=False keeps the rows that become NaN after differencing, so the lengths match `expected`
prep = fcst.preprocess(series, dropna=False)
# reference: Box-Cox fitted on all the series at once, then a per-series first difference
expected = (
    pd.Series(
        sk_boxcox.fit_transform(series[['y']])[:, 0], index=series['unique_id']
    ).groupby('unique_id')
    .diff()
    .values
)
np.testing.assert_allclose(prep['y'].values, expected)