In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp processing

# Processing
> Internal DataFrame processing

In [None]:
#| export
import numpy as np
import pandas as pd

from utilsforecast.compat import DataFrame, pl_Series
from utilsforecast.grouped_array import GroupedArray

In [None]:
#| exporti
def _polars_serie_to_double(serie: pl_Series) -> pl_Series:
    """Cast a polars serie to Float64.

    Categorical series are first cast to Utf8 and the resulting strings are
    then cast to Float64; every other dtype is cast to Float64 directly.
    """
    import polars as pl

    is_categorical = serie.dtype == pl.Categorical
    # categoricals go through their string representation before the float cast
    castable = serie.cast(pl.Utf8) if is_categorical else serie
    return castable.cast(pl.Float64)

def _counts_by_id(df: DataFrame, id_col: str) -> DataFrame:
    id_counts = df[id_col].value_counts()
    if isinstance(id_counts, pd.Series):
        id_counts = id_counts.rename('counts').sort_index().reset_index()
    else:
        id_counts = id_counts.sort(id_col)
    return id_counts

def _value_cols_to_numpy(df: DataFrame, id_col: str, time_col: str, target_col: str) -> np.ndarray:
    value_cols = [col for col in df.columns if col not in (id_col, time_col, target_col)]
    # ensure target is the first column
    value_cols = [target_col] + value_cols
    if isinstance(df, pd.DataFrame):
        data = df[value_cols].to_numpy()
    else:
        import polars as pl

        data = df[value_cols].select(pl.all().map(_polars_serie_to_double)).to_numpy()
    return data

def _compute_sort_idxs(df: DataFrame, idx: pd.MultiIndex) -> np.ndarray:
    if isinstance(df, pd.DataFrame):
        sort_idxs = idx.argsort()
    else:
        import polars as pl

        sort_idxs = df.select(
            pl.arg_sort_by(idx.names).alias('idx')
        )['idx'].to_numpy()
    return sort_idxs

In [None]:
#| export
class DataFrameProcessing:
    """Turn a long-format dataframe into a GroupedArray plus per-serie metadata.

    After calling `process` the instance exposes:

    * `uids`: the id column of the per-id counts, ordered by id.
    * `ga`: a `GroupedArray` built from the value columns (target first) and
      the group boundaries.
    * `times`: the time value found at the last row of each serie.
    """

    def __init__(
        self,
        id_col: str = 'unique_id',
        time_col: str = 'ds',
        target_col: str = 'y',
    ):
        """Store the names of the id, time and target columns."""
        self.id_col, self.time_col, self.target_col = id_col, time_col, target_col

    def process(self, df: DataFrame) -> None:
        """Process `df` and store the results in `uids`, `ga` and `times`.

        Rows are reordered by (id, time) when they are not already sorted.
        """
        id_values = df[self.id_col].to_numpy()
        time_values = df[self.time_col].to_numpy()

        # number of rows per id, ordered by id
        counts_by_id = _counts_by_id(df, self.id_col)
        self.uids = counts_by_id[self.id_col]

        # boundaries of each serie in the sorted data
        group_sizes = counts_by_id['counts'].to_numpy()
        indptr = np.append(
            np.int64(0),
            group_sizes.cumsum().astype(np.int64),
        )
        last_rows = indptr[1:] - 1

        # values, with the target in the first column
        values = _value_cols_to_numpy(df, self.id_col, self.time_col, self.target_col)
        # anything that isn't already float32/float64 becomes float32
        if values.dtype not in (np.float32, np.float64):
            values = values.astype(np.float32)
        # always work with a 2d array
        if values.ndim == 1:
            values = values.reshape(-1, 1)

        # reorder only when the rows aren't already sorted by (id, time)
        row_index = pd.MultiIndex.from_arrays(
            [id_values, time_values],
            names=[self.id_col, self.time_col],
        )
        if not row_index.is_monotonic_increasing:
            order = _compute_sort_idxs(df, row_index)
            values = values[order]
            # map the sorted positions back to positions in the original frame
            last_rows = order[last_rows]
        self.ga = GroupedArray(values, indptr)
        self.times = time_values[last_rows]

In [None]:
import polars as pl
from fastcore.test import test_eq

from utilsforecast.data import generate_series

In [None]:
# pandas: processing a shuffled frame must match the results from the sorted one
series_pd = generate_series(10_000, n_static_features=2, equal_ends=False, engine='pandas')
scrambled_series_pd = series_pd.sample(frac=1.0)
# the constructor only takes the id, time and target column names
dfp = DataFrameProcessing('unique_id', 'ds', 'y')
dfp.process(scrambled_series_pd)
test_eq(dfp.times, series_pd.groupby('unique_id')['ds'].max().values)
test_eq(dfp.uids, np.sort(series_pd['unique_id'].unique()))
test_eq(dfp.ga.data, series_pd[['y', 'static_0', 'static_1']].to_numpy())
test_eq(np.diff(dfp.ga.indptr), series_pd.groupby('unique_id').size().values)

In [None]:
# polars: processing a shuffled frame must match the results from the sorted one
series_pl = generate_series(10_000, n_static_features=2, equal_ends=False, engine='polars')
# shuffle=True is required to actually scramble the rows, otherwise the sort path may not be exercised
scrambled_series_pl = series_pl.sample(fraction=1.0, shuffle=True)
# the constructor only takes the id, time and target column names
dfp = DataFrameProcessing('unique_id', 'ds', 'y')
dfp.process(scrambled_series_pl)
# groupby doesn't guarantee the order of the groups, so sort by id before comparing
test_eq(
    dfp.times,
    series_pl.groupby('unique_id').agg(pl.col('ds').max()).sort('unique_id')['ds'].to_numpy(),
)
test_eq(dfp.uids, series_pl['unique_id'].unique().sort())
# NOTE(review): compares against the pandas frame from the previous cell; presumably
# both engines generate the same values -- confirm against generate_series
test_eq(dfp.ga.data, series_pd[['y', 'static_0', 'static_1']].to_numpy())
test_eq(
    np.diff(dfp.ga.indptr),
    series_pl.groupby('unique_id').count().sort('unique_id')['count'].to_numpy(),
)