In [None]:
#| default_exp grouped_array

In [None]:
#| export
from typing import Sequence, Tuple, Union

import numpy as np
import pandas as pd

from utilsforecast.compat import DataFrame
from utilsforecast.processing import DataFrameProcessor

In [None]:
#| exporti
def _append_one(
    data: np.ndarray,
    indptr: np.ndarray,
    new: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """Append each value of new to each group in data formed by indptr."""
    n_groups = len(indptr) - 1
    rows = data.shape[0] + new.shape[0]
    new_data = np.empty((rows, data.shape[1]), dtype=data.dtype)
    new_indptr = indptr.copy()
    new_indptr[1:] += np.arange(1, n_groups + 1)
    for i in range(n_groups):
        prev_slice = slice(indptr[i], indptr[i + 1])
        new_slice = slice(new_indptr[i], new_indptr[i + 1] - 1)
        new_data[new_slice] = data[prev_slice]
        new_data[new_indptr[i + 1] - 1] = new[i]
    return new_data, new_indptr

In [None]:
# _append_one: the groups [0, 1] and [2, 3, 4] each get one extra trailing value
data = np.arange(5).reshape(-1, 1)
indptr = np.array([0, 2, 5])
new = np.array([7, 8])
new_data, new_indptr = _append_one(data, indptr, new)
expected_data = np.array([[0], [1], [7], [2], [3], [4], [8]])
expected_indptr = np.array([0, 3, 7])
np.testing.assert_equal(new_data, expected_data)
np.testing.assert_equal(new_indptr, expected_indptr)

In [None]:
#| exporti
def _append_several(
    data: np.ndarray,
    indptr: np.ndarray,
    new_sizes: np.ndarray,
    new_values: np.ndarray,
    new_groups: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    rows = data.shape[0] + new_values.shape[0]
    new_data = np.empty((rows, data.shape[1]), dtype=data.dtype)
    new_indptr = np.empty(new_sizes.size + 1, dtype=indptr.dtype)
    new_indptr[0] = 0
    old_indptr_idx = 0
    new_vals_idx = 0
    for i, is_new in enumerate(new_groups):
        new_size = new_sizes[i]
        if is_new:
            old_size = 0
        else:
            prev_slice = slice(indptr[old_indptr_idx], indptr[old_indptr_idx + 1])
            old_indptr_idx += 1
            old_size = prev_slice.stop - prev_slice.start
            new_size += old_size
            new_data[new_indptr[i] : new_indptr[i] + old_size] = data[prev_slice]
        new_indptr[i + 1] = new_indptr[i] + new_size
        new_data[new_indptr[i] + old_size : new_indptr[i + 1]] = new_values[
            new_vals_idx : new_vals_idx + new_sizes[i]
        ]
        new_vals_idx += new_sizes[i]
    return new_data, new_indptr

In [None]:
# _append_several: the first group gets nothing, a new group [6, 7] is inserted second
# and the last group gets one extra value
data = np.arange(5).reshape(-1, 1)
indptr = np.array([0, 2, 5])
new_sizes = np.array([0, 2, 1])
new_values = np.array([[6], [7], [5]])
new_groups = np.array([False, True, False])
new_data, new_indptr = _append_several(data, indptr, new_sizes, new_values, new_groups)
expected_data = np.array([[0], [1], [6], [7], [2], [3], [4], [5]])
expected_indptr = np.array([0, 2, 4, 8])
np.testing.assert_equal(new_data, expected_data)
np.testing.assert_equal(new_indptr, expected_indptr)

In [None]:
#| export
class GroupedArray:
    """Rows of several groups stored back to back in a single array.

    `data` holds the rows of all the groups and `indptr` their boundaries:
    the rows of group `i` are `data[indptr[i] : indptr[i + 1]]`.
    """

    def __init__(self, data: np.ndarray, indptr: np.ndarray):
        self.data = data
        self.indptr = indptr
        self.n_groups = len(indptr) - 1

    def __len__(self):
        """Number of groups."""
        return self.n_groups

    def __getitem__(self, idx: int) -> np.ndarray:
        """Return the rows of group `idx`, negative values count from the last group.

        Raises
        ------
        IndexError
            If `idx` is outside of `[-n_groups, n_groups)`. This is also what
            ends the iteration when the object is traversed with `iter`.
        """
        pos = self.n_groups + idx if idx < 0 else idx
        # without this check a too negative index would wrap around indptr
        # and silently return an empty or unrelated slice
        if not 0 <= pos < self.n_groups:
            raise IndexError(
                f"group index {idx} is out of range for {self.n_groups} groups."
            )
        return self.data[self.indptr[pos] : self.indptr[pos + 1]]

    @classmethod
    def from_sorted_df(
        cls, df: DataFrame, id_col: str, time_col: str, target_col: str
    ) -> 'GroupedArray':
        """Build the structure from a dataframe whose rows are already ordered by group.

        The group sizes are the number of rows per value of `id_col` and the
        data is whatever `DataFrameProcessor._value_cols_to_numpy` extracts
        from `df`, cast to float32 unless it is already float32 or float64.
        """
        if isinstance(df, pd.DataFrame):
            sizes = df.groupby(id_col, observed=True).size().values
        else:
            # any other frame is expected to expose a polars-like API;
            # fall back to the `groupby` spelling when `group_by` is not available
            try:
                group_sizes = df.group_by(id_col, maintain_order=True).count()
            except AttributeError:
                group_sizes = df.groupby(id_col, maintain_order=True).count()
            sizes = group_sizes['count'].to_numpy()

        indptr = np.append(0, sizes.cumsum())
        proc = DataFrameProcessor(id_col, time_col, target_col)
        data = proc._value_cols_to_numpy(df)
        if data.dtype not in (np.float32, np.float64):
            data = data.astype(np.float32)
        return cls(data, indptr)

    def _take_from_ranges(self, ranges: Sequence) -> 'GroupedArray':
        """Stack the rows selected by each entry of `ranges` into a new GroupedArray.

        Every entry becomes one group of the result. An entry can be a range
        of row positions or a single integer position.
        """
        items = []
        sizes = []
        for r in ranges:
            item = self.data[r]
            if item.ndim < self.data.ndim:
                # an integer position drops the row axis, restore it
                item = item[np.newaxis]
            items.append(item)
            # count rows, not elements, so that data with several columns
            # produces the right boundaries
            sizes.append(item.shape[0])
        data = np.vstack(items)
        indptr = np.append(0, np.array(sizes).cumsum())
        return GroupedArray(data, indptr)

    def take(self, idxs: Sequence[int]) -> 'GroupedArray':
        """Subset specific groups by their indices."""
        ranges = [range(self.indptr[i], self.indptr[i + 1]) for i in idxs]
        return self._take_from_ranges(ranges)

    def take_from_groups(self, idx: Union[int, slice]) -> 'GroupedArray':
        """Select a subset from each group.

        `idx` is applied to the row positions of every group, so a slice keeps
        at most as many rows as the group has and an integer keeps one row.
        """
        ranges = [
            range(self.indptr[i], self.indptr[i + 1])[idx]
            for i in range(self.n_groups)
        ]
        return self._take_from_ranges(ranges)

    def append(self, new: np.ndarray) -> 'GroupedArray':
        """Appends each element of `new` to each existing group. Returns a copy.

        Raises
        ------
        ValueError
            If `new` doesn't have exactly one row per group.
        """
        if new.shape[0] != self.n_groups:
            raise ValueError(f"new must have {self.n_groups} rows.")
        new_data, new_indptr = _append_one(self.data, self.indptr, new)
        return GroupedArray(new_data, new_indptr)

    def append_several(
        self, new_sizes: np.ndarray, new_values: np.ndarray, new_groups: np.ndarray
    ) -> "GroupedArray":
        """Appends `new_sizes[i]` rows of `new_values` to the i-th group. Returns a copy.

        `new_groups` flags the groups of the result that don't exist in this object.
        """
        new_data, new_indptr = _append_several(
            self.data, self.indptr, new_sizes, new_values, new_groups
        )
        return GroupedArray(new_data, new_indptr)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(n_rows={self.data.shape[0]:,}, n_groups={self.n_groups:,})"
        )

In [None]:
from fastcore.test import test_eq, test_fail

In [None]:
# `GroupedArray` is the internal container that stores the series values and on which the transformations operate.
data = np.arange(10, dtype=np.float32)[:, np.newaxis]
# boundaries of two groups: rows [0, 1] and rows [2..9]
indptr = np.array([0, 2, 10])
ga = GroupedArray(data, indptr)
n_groups = len(ga)
test_eq(n_groups, 2)

In [None]:
# Iterating yields the rows of each group, in order
ga_iter = iter(ga)
first_group = next(ga_iter)
np.testing.assert_equal(first_group, np.array([[0], [1]]))
second_group = next(ga_iter)
np.testing.assert_equal(second_group, np.arange(2, 10)[:, np.newaxis])

In [None]:
# A slice is applied to every group: keep the last two observations of each one
last_2 = ga.take_from_groups(slice(-2, None))
expected_last_2 = np.array([[0], [1], [8], [9]])
np.testing.assert_equal(last_2.data, expected_last_2)
np.testing.assert_equal(last_2.indptr, np.array([0, 2, 4]))

# An integer position keeps a single observation per group, here the second one
second = ga.take_from_groups(1)
expected_second = np.array([[1], [3]])
np.testing.assert_equal(second.data, expected_second)
np.testing.assert_equal(second.indptr, np.array([0, 1, 2]))

In [None]:
# Ask for the last four observations of every group. Group 1 has only two elements, so it contributes just those two.
last_4 = ga.take_from_groups(slice(-4, None))
expected_last_4 = np.array([[0], [1], [6], [7], [8], [9]])
np.testing.assert_equal(last_4.data, expected_last_4)
np.testing.assert_equal(last_4.indptr, np.array([0, 2, 6]))

In [None]:
# Select a specific subset of groups
indptr = np.array([0, 2, 4, 7, 10])
ga2 = GroupedArray(data, indptr)
subset = ga2.take([0, 2])
# indexing already returns the group's array, so compare the arrays themselves
# instead of their raw `.data` buffers
np.testing.assert_allclose(subset[0], ga2[0])
np.testing.assert_allclose(subset[1], ga2[2])
# the selected groups have 2 and 3 rows
np.testing.assert_equal(subset.indptr, np.array([0, 2, 5]))

In [None]:
# `append` needs exactly one value per group, so three values for two groups must be rejected
too_many_values = np.array([1., 2., 3.])
test_fail(lambda: ga.append(too_many_values), contains='new must have 2 rows')

In [None]:
from utilsforecast.data import generate_series

In [None]:
# build from df
series_pd = generate_series(10, static_as_categorical=False, engine='pandas')
series_pl = generate_series(10, static_as_categorical=False, engine='polars')
# from_sorted_df takes the id, time and target column names, in that order
ga_pd = GroupedArray.from_sorted_df(series_pd, 'unique_id', 'ds', 'y')
ga_pl = GroupedArray.from_sorted_df(series_pl, 'unique_id', 'ds', 'y')
# both engines must produce the same values and group boundaries
np.testing.assert_allclose(ga_pd.data, ga_pl.data)
np.testing.assert_equal(ga_pd.indptr, ga_pl.indptr)