In [1]:
import timeit
from typing import Any, Callable, Optional, Union

import numpy as np
import pandas as pd

from cyclops.utils.common import array_series_conversion

In [64]:
def random_mask(
    n: int, percent_true: float = 0.5, seed: Optional[int] = None
) -> np.ndarray:
    """Create a randomly shuffled 1D boolean mask.

    Parameters
    ----------
    n : int
        Length of the mask.
    percent_true : float, default=0.5
        Fraction of entries set to True, between 0 and 1 inclusive. The number
        of True entries is ``int(n * percent_true)``, i.e., rounded down.
    seed : int, optional
        If given, the shuffle uses a dedicated generator seeded with this
        value, making the mask reproducible. If None, NumPy's global random
        state is used.

    Returns
    -------
    np.ndarray
        Boolean array of shape ``(n,)``.

    Raises
    ------
    ValueError
        If ``percent_true`` is not within [0, 1].

    """
    # A negative fraction would turn the slice below into ``mask[:-k]`` and
    # silently set the wrong number of entries; NaN also fails this check.
    if not 0 <= percent_true <= 1:
        raise ValueError(
            f"percent_true must be between 0 and 1, got {percent_true!r}."
        )

    mask = np.zeros(n, dtype=bool)
    mask[: int(n * percent_true)] = True

    if seed is None:
        # Global RNG keeps the original behaviour (honours np.random.seed).
        np.random.shuffle(mask)
    else:
        np.random.default_rng(seed).shuffle(mask)
    return mask

In [139]:
# Benchmark fixture: a float ramp 0..n-1 with a random half of its entries
# replaced by NaN (random_mask defaults to 50% True).
n = 1_000_000
nan_mask = random_mask(n)
arr = np.where(nan_mask, np.nan, np.arange(n, dtype=float))
arr[:10]

array([ 0.,  1.,  2.,  3.,  4., nan,  6.,  7.,  8.,  9.])

In [None]:
# class Imputer1D

In [141]:
def np_ffill(arr: np.ndarray) -> np.ndarray:
    """Forward-fill NaN values of a 1D array using NumPy only.

    Each NaN is replaced by the closest non-NaN value before it. Leading NaNs
    have nothing to copy from and stay NaN. The input is not modified.

    Parameters
    ----------
    arr : np.ndarray
        1D numeric array, possibly containing NaN.

    Returns
    -------
    np.ndarray
        New array of the same length with NaNs forward-filled.

    """
    valid = ~np.isnan(arr)
    positions = np.arange(valid.shape[0])
    # Valid slots keep their own position; NaN slots get 0 so that the running
    # maximum below carries the most recent valid position forward.
    last_valid = np.where(valid, positions, 0)
    np.maximum.accumulate(last_valid, axis=0, out=last_valid)
    return arr[last_valid]

In [142]:
def np_bfill(arr: np.ndarray) -> np.ndarray:
    """Backward-fill NaN values of a 1D array using NumPy only.

    Each NaN is replaced by the closest non-NaN value after it. Trailing NaNs
    have nothing to copy from and stay NaN. The input is not modified.

    Parameters
    ----------
    arr : np.ndarray
        1D numeric array, possibly containing NaN.

    Returns
    -------
    np.ndarray
        New array of the same length with NaNs backward-filled.

    """
    valid = ~np.isnan(arr)
    length = valid.shape[0]
    # Valid slots keep their own position; NaN slots get the last position so
    # that the reversed running minimum picks up the next valid position.
    next_valid = np.where(valid, np.arange(length), length - 1)
    reversed_running_min = np.minimum.accumulate(next_valid[::-1], axis=0)
    next_valid = reversed_running_min[::-1]
    return arr[next_valid]

In [143]:
def ffill(data: Union[np.ndarray, pd.Series]) -> Union[np.ndarray, pd.Series]:
    """Forward-fill NaN values, returning the same container type as the input.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        1D data, possibly containing NaN.

    Returns
    -------
    np.ndarray or pd.Series
        Forward-filled data; a Series for Series input, an array for array
        input.

    Raises
    ------
    TypeError
        If ``data`` is neither a ``pd.Series`` nor an ``np.ndarray``.

    """
    if isinstance(data, pd.Series):
        return data.ffill()

    if isinstance(data, np.ndarray):
        # Up to 100000 elements use the NumPy implementation, larger arrays go
        # through pandas. The cutoff is inclusive, matching bfill.
        if data.shape[0] <= 100000:
            return np_ffill(data)
        return pd.Series(data).ffill().to_numpy()

    # Fail loudly instead of implicitly returning None for unsupported input.
    raise TypeError(
        f"data must be a numpy.ndarray or pandas.Series, got {type(data).__name__}."
    )

In [144]:
def bfill(data: Union[np.ndarray, pd.Series]) -> Union[np.ndarray, pd.Series]:
    """Backward-fill NaN values, returning the same container type as the input.

    Parameters
    ----------
    data : np.ndarray or pd.Series
        1D data, possibly containing NaN.

    Returns
    -------
    np.ndarray or pd.Series
        Backward-filled data; a Series for Series input, an array for array
        input.

    Raises
    ------
    TypeError
        If ``data`` is neither a ``pd.Series`` nor an ``np.ndarray``.

    """
    if isinstance(data, pd.Series):
        return data.bfill()

    if isinstance(data, np.ndarray):
        # Up to 100000 elements (inclusive) use the NumPy implementation,
        # larger arrays go through pandas.
        if data.shape[0] <= 100000:
            return np_bfill(data)
        return pd.Series(data).bfill().to_numpy()

    # Fail loudly instead of implicitly returning None for unsupported input.
    raise TypeError(
        f"data must be a numpy.ndarray or pandas.Series, got {type(data).__name__}."
    )

In [None]:
# TODO: Add to normalization, e.g., to get rid of ".values" and the handling of pandas Series.
# Can also be used for imputation.

In [243]:
# Forward-fill the NaN-masked array through the type-dispatching wrapper.
# NOTE(review): the saved output (two printed type names) does not match the
# `ffill` definition visible above, which prints nothing; it looks stale, so
# re-run this cell to refresh it.
ffill(arr)

<class 'numpy.ndarray'>
<class 'NoneType'>


In [146]:
# Backward-fill the same array. Trailing NaNs have no later value to copy
# from, so they remain NaN in the result.
bfill(arr)

array([ 0.,  1.,  2., ..., nan, nan, nan])

In [117]:
# Benchmark the NumPy-only forward fill.
# NOTE(review): this timing was recorded at execution count 117, before the
# cell that builds `arr` (count 139), so it may not reflect the current array;
# re-run after a kernel restart to confirm.
%timeit np_ffill(arr)

1.17 ms ± 16.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [118]:
# Benchmark the pandas forward fill, including the cost of wrapping the array
# in a Series on every run.
%timeit pd.Series(arr).ffill()

1.18 ms ± 20.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [119]:
# Wrap the array in a Series once so it can be timed without the
# construction cost.
series = pd.Series(arr)

In [120]:
# Benchmark the pandas forward fill on the pre-built Series.
%timeit series.ffill()

1.13 ms ± 11.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [121]:
# Benchmark the NumPy forward fill applied to the Series' underlying array,
# including re-wrapping the result in a new Series.
%timeit pd.Series(np_ffill(series.values))

1.23 ms ± 46.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
