In [None]:
#| default_exp grouped_array

In [None]:
#| export
from typing import Sequence, Tuple, Union

import numpy as np

from utilsforecast.compat import DataFrame
from utilsforecast.processing import counts_by_id, value_cols_to_numpy

In [None]:
#| exporti
def _append_one(
    data: np.ndarray,
    indptr: np.ndarray,
    new: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """Append each value of new to each group in data formed by indptr."""
    n_groups = len(indptr) - 1
    n_rows = data.shape[0] + new.shape[0]
    if data.ndim == 2:
        new_data = np.empty_like(data, shape=(n_rows, data.shape[1]))
    else:
        new_data = np.empty_like(data, shape=n_rows)
    new_indptr = indptr.copy()
    new_indptr[1:] += np.arange(1, n_groups + 1)
    for i in range(n_groups):
        prev_slice = slice(indptr[i], indptr[i + 1])
        new_slice = slice(new_indptr[i], new_indptr[i + 1] - 1)
        new_data[new_slice] = data[prev_slice]
        new_data[new_indptr[i + 1] - 1] = new[i]
    return new_data, new_indptr

In [None]:
# test _append_one
# two groups ([0, 1] and [2, 3, 4]) each receive one extra value
indptr = np.array([0, 2, 5])
new = np.array([7, 8])
expected_data = np.array([0, 1, 7, 2, 3, 4, 8])
expected_indptr = np.array([0, 3, 7])

# 1d
data = np.arange(5)
new_data, new_indptr = _append_one(data, indptr, new)
np.testing.assert_equal(new_data, expected_data)
np.testing.assert_equal(new_indptr, expected_indptr)

# 2d
data = np.arange(5).reshape(-1, 1)
new_data, new_indptr = _append_one(data, indptr, new)
np.testing.assert_equal(new_data, expected_data.reshape(-1, 1))
np.testing.assert_equal(new_indptr, expected_indptr)

In [None]:
#| exporti
def _append_several(
    data: np.ndarray,
    indptr: np.ndarray,
    new_sizes: np.ndarray,
    new_values: np.ndarray,
    new_groups: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
    n_rows = data.shape[0] + new_values.shape[0]
    if data.ndim == 2:
        new_data = np.empty_like(data, shape=(n_rows, data.shape[1]))
    else:
        new_data = np.empty_like(data, shape=n_rows)
    new_indptr = np.empty_like(indptr, shape=new_sizes.size + 1)
    new_indptr[0] = 0
    old_indptr_idx = 0
    new_vals_idx = 0
    for i, is_new in enumerate(new_groups):
        new_size = new_sizes[i]
        if is_new:
            old_size = 0
        else:
            prev_slice = slice(indptr[old_indptr_idx], indptr[old_indptr_idx + 1])
            old_indptr_idx += 1
            old_size = prev_slice.stop - prev_slice.start
            new_size += old_size
            new_data[new_indptr[i] : new_indptr[i] + old_size] = data[prev_slice]
        new_indptr[i + 1] = new_indptr[i] + new_size
        new_data[new_indptr[i] + old_size : new_indptr[i + 1]] = new_values[
            new_vals_idx : new_vals_idx + new_sizes[i]
        ]
        new_vals_idx += new_sizes[i]
    return new_data, new_indptr

In [None]:
# test append several
# the first and last groups already exist, the middle one is new;
# the first group receives nothing, the new group two values and the last one a single value
indptr = np.array([0, 2, 5])
new_sizes = np.array([0, 2, 1])
new_groups = np.array([False, True, False])
expected_data = np.array([0, 1, 6, 7, 2, 3, 4, 5])
expected_indptr = np.array([0, 2, 4, 8])

# 1d
data = np.arange(5)
new_values = np.array([6, 7, 5])
new_data, new_indptr = _append_several(data, indptr, new_sizes, new_values, new_groups)
np.testing.assert_equal(new_data, expected_data)
np.testing.assert_equal(new_indptr, expected_indptr)

# 2d
data = np.arange(5).reshape(-1, 1)
new_values = np.array([6, 7, 5]).reshape(-1, 1)
new_data, new_indptr = _append_several(data, indptr, new_sizes, new_values, new_groups)
np.testing.assert_equal(new_data, expected_data.reshape(-1, 1))
np.testing.assert_equal(new_indptr, expected_indptr)

In [None]:
#| export
class GroupedArray:
    """Rows of several groups stored back to back in a single array.

    Group `i` is `data[indptr[i] : indptr[i + 1]]`, so `indptr` has one more
    entry than there are groups.
    """

    def __init__(self, data: np.ndarray, indptr: np.ndarray):
        self.data = data
        self.indptr = indptr
        self.n_groups = len(indptr) - 1

    def __len__(self):
        """Number of groups."""
        return self.n_groups

    def _group_bounds(self, idx: int) -> Tuple[int, int]:
        """Return the `(start, stop)` row positions of group `idx`.

        Negative indices count from the last group. Raises `IndexError` when
        `idx` doesn't refer to an existing group.
        """
        n_groups = self.n_groups
        pos = idx + n_groups if idx < 0 else idx
        # `indptr` has n_groups + 1 entries, so indexing it with an unchecked
        # negative group index silently lands on the wrong boundaries
        if not 0 <= pos < n_groups:
            raise IndexError(
                f"group index {idx} is out of range for {n_groups} groups"
            )
        return self.indptr[pos], self.indptr[pos + 1]

    def __getitem__(self, idx: int) -> np.ndarray:
        """Return the rows of group `idx` as a slice of `data`.

        Negative indices count from the last group; an index outside of the
        existing groups raises `IndexError`.
        """
        start, stop = self._group_bounds(idx)
        return self.data[start:stop]

    @classmethod
    def from_sorted_df(
        cls, df: DataFrame, id_col: str, time_col: str, target_col: str
    ) -> 'GroupedArray':
        """Build a `GroupedArray` from a dataframe.

        Group sizes are taken from `counts_by_id(df, id_col)` and the values
        from `value_cols_to_numpy(df, id_col, time_col, target_col)`. Values
        whose dtype is neither float32 nor float64 are cast to float32.

        NOTE(review): as the name suggests, `df` is presumably expected to be
        sorted by id and time already; nothing here sorts or checks it.
        """
        id_counts = counts_by_id(df, id_col)
        sizes = id_counts['counts'].to_numpy()
        indptr = np.append(0, sizes.cumsum())
        data = value_cols_to_numpy(df, id_col, time_col, target_col)
        if data.dtype not in (np.float32, np.float64):
            data = data.astype(np.float32)
        return cls(data, indptr)

    def _take_from_ranges(self, ranges: Sequence) -> Tuple[np.ndarray, np.ndarray]:
        """Gather the rows selected by each element of `ranges`.

        Every element of `ranges` becomes one group of the returned
        `(data, indptr)` pair, in the same order.
        """
        items = [self.data[r] for r in ranges]
        if not items:
            # np.vstack / np.hstack refuse an empty list; selecting nothing is
            # a valid request whose result has zero rows and zero groups
            return self.data[:0].copy(), np.array([0])
        sizes = np.array([item.shape[0] for item in items])
        if self.data.ndim == 2:
            data = np.vstack(items)
        else:
            data = np.hstack(items)
        indptr = np.append(0, sizes.cumsum())
        return data, indptr

    def take(self, idxs: Sequence[int]) -> Tuple[np.ndarray, np.ndarray]:
        """Subset specific groups by their indices.

        Negative indices count from the last group. Returns the
        `(data, indptr)` pair of the selected groups, in the order given.
        Raises `IndexError` for an index that is not an existing group.
        """
        ranges = [range(*self._group_bounds(i)) for i in idxs]
        return self._take_from_ranges(ranges)

    def take_from_groups(self, idx: Union[int, slice]) -> Tuple[np.ndarray, np.ndarray]:
        """Select a subset from each group.

        `idx` is applied to the positions of every group. An integer selects
        that single position (negative values count from the end of each
        group); groups that don't have that position contribute no rows.
        Returns the resulting `(data, indptr)` pair.
        """
        if isinstance(idx, (int, np.integer)):
            idx = int(idx)
            # a one element slice preserves the 2d structure of data when
            # indexing with the range; for idx == -1 the stop would be 0,
            # which selects nothing, so it has to be left open instead
            idx = slice(idx, idx + 1 or None)
        ranges = [
            range(self.indptr[i], self.indptr[i + 1])[idx]
            for i in range(self.n_groups)
        ]
        return self._take_from_ranges(ranges)

    def append(self, new: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Appends each element of `new` to each existing group. Returns a copy.

        Raises `ValueError` if `new` doesn't have exactly one row per group.
        """
        if new.shape[0] != self.n_groups:
            raise ValueError(f"new must have {self.n_groups} rows.")
        return _append_one(self.data, self.indptr, new)

    def append_several(
        self, new_sizes: np.ndarray, new_values: np.ndarray, new_groups: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Append several values per group, optionally introducing new groups.

        Delegates to `_append_several`: the result has one group per entry of
        `new_groups`; a true entry marks a group without previous rows, a
        false one continues the next existing group. `new_sizes[i]` rows of
        `new_values` are added at the end of group `i`. Returns a copy.
        """
        return _append_several(
            self.data, self.indptr, new_sizes, new_values, new_groups
        )

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(n_rows={self.data.shape[0]:,}, n_groups={self.n_groups:,})"
        )

In [None]:
from fastcore.test import test_eq, test_fail

from utilsforecast.data import generate_series

In [None]:
# The `GroupedArray` is used internally for storing the series values and performing transformations.
# `data` has 10 rows and 2 columns; `indptr` holds the row positions where each group starts and ends.
data = np.arange(20, dtype=np.float32).reshape(-1, 2)
indptr = np.array([0, 2, 10])  # group 1: rows [0, 1], group 2: rows [2..9]
ga = GroupedArray(data, indptr)
# the number of groups is len(indptr) - 1
test_eq(len(ga), 2)

In [None]:
# Iterate through the groups
# each item is the block of rows of `data` that belongs to one group
ga_iter = iter(ga)
# first group: rows 0 and 1
np.testing.assert_equal(next(ga_iter), np.arange(4).reshape(-1, 2))
# second group: rows 2 to 9
np.testing.assert_equal(next(ga_iter), np.arange(4, 20).reshape(-1, 2))

In [None]:
# Take the last two observations from each group
last2_data, last2_indptr = ga.take_from_groups(slice(-2, None))
# group 1 consists of rows 0-1 only; the last two rows of group 2 are 8-9
np.testing.assert_equal(last2_data, np.arange(20).reshape(-1, 2)[[0, 1, 8, 9]])
np.testing.assert_equal(last2_indptr, np.array([0, 2, 4]))

# 1d
ga1d = GroupedArray(np.arange(10), indptr)
last2_data1d, last2_indptr1d = ga1d.take_from_groups(slice(-2, None))
np.testing.assert_equal(last2_data1d, np.array([0, 1, 8, 9]))
np.testing.assert_equal(last2_indptr1d, np.array([0, 2, 4]))

In [None]:
# Take the second observation from each group
second_data, second_indptr = ga.take_from_groups(1)
# the second row of group 1 is row 1, the second row of group 2 is row 3
np.testing.assert_equal(second_data, np.arange(20).reshape(-1, 2)[[1, 3]])
np.testing.assert_equal(second_indptr, np.array([0, 1, 2]))

# 1d
second_data1d, second_indptr1d = ga1d.take_from_groups(1)
np.testing.assert_equal(second_data1d, np.array([1, 3]))
np.testing.assert_equal(second_indptr1d, np.array([0, 1, 2]))

In [None]:
# Take the last four observations from every group. Group 1 only has two
# elements, so only those two are returned for it.
last4_data, last4_indptr = ga.take_from_groups(slice(-4, None))
# rows 0-1 are all of group 1, rows 6-9 are the last four rows of group 2
np.testing.assert_equal(last4_data, np.arange(20).reshape(-1, 2)[[0, 1, 6, 7, 8, 9]])
np.testing.assert_equal(last4_indptr, np.array([0, 2, 6]))

# 1d
last4_data1d, last4_indptr1d = ga1d.take_from_groups(slice(-4, None))
np.testing.assert_equal(last4_data1d, np.array([0, 1, 6, 7, 8, 9]))
np.testing.assert_equal(last4_indptr1d, np.array([0, 2, 6]))

In [None]:
# Select a specific subset of groups
indptr = np.array([0, 2, 4, 7, 10])
ga2 = GroupedArray(data, indptr)
subset = GroupedArray(*ga2.take([0, 2]))
# indexing a GroupedArray returns the group's ndarray, so the groups are
# compared directly rather than through the arrays' raw `.data` buffers
np.testing.assert_allclose(subset[0], ga2[0])
np.testing.assert_allclose(subset[1], ga2[2])
# groups 0 and 2 have two and three rows respectively
np.testing.assert_equal(subset.indptr, np.array([0, 2, 5]))

# 1d
ga2_1d = GroupedArray(np.arange(10), indptr)
subset1d = GroupedArray(*ga2_1d.take([0, 2]))
np.testing.assert_allclose(subset1d[0], ga2_1d[0])
np.testing.assert_allclose(subset1d[1], ga2_1d[2])
np.testing.assert_equal(subset1d.indptr, np.array([0, 2, 5]))

In [None]:
# try to append new values that don't match the number of groups
# `ga` has 2 groups, so passing 3 values has to raise
test_fail(lambda: ga.append(np.array([1., 2., 3.])), contains='new must have 2 rows')

In [None]:
#| polars
# build from df
# series generated with the pandas and the polars engines are expected to
# produce the same values and the same group boundaries
series_pd = generate_series(10, static_as_categorical=False, engine='pandas')
ga_pd = GroupedArray.from_sorted_df(series_pd, 'unique_id', 'ds', 'y')
series_pl = generate_series(10, static_as_categorical=False, engine='polars')
ga_pl = GroupedArray.from_sorted_df(series_pl, 'unique_id', 'ds', 'y')
np.testing.assert_allclose(ga_pd.data, ga_pl.data)
np.testing.assert_equal(ga_pd.indptr, ga_pl.indptr)