In [None]:
#default_exp core

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#export
import logging

import numpy as np
import pandas as pd
from numba import njit

from statsforecast.utils import generate_daily_series

In [None]:
#exporti
# Configure the root logger at import time: timestamped records, INFO and above.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(name)s %(levelname)s: %(message)s',
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

In [None]:
from statsforecast.models import adida, ses, historic_average, croston_classic

In [None]:
# Demo data set used by the cells below: 50,000 generated series, requested
# with equal_ends=True. The bare name on the last line displays the frame.
series = generate_daily_series(50_000, equal_ends=True)
series

In [None]:
#export
class GroupedArray:
    """Contiguous storage for many variable-length groups of values.

    ``data`` holds the values of all groups back to back and ``indptr`` holds
    the group boundaries: group ``i`` is ``data[indptr[i]:indptr[i + 1]]``, so
    ``indptr`` has ``n_groups + 1`` entries.
    """

    def __init__(self, data, indptr):
        self.data = data
        self.indptr = indptr
        self.n_groups = self.indptr.size - 1

    def __getitem__(self, idx):
        """Return one group (integer index) or a sub-``GroupedArray`` (slice).

        Parameters
        ----------
        idx : int, numpy integer or slice
            Integers select a single group and may be negative (counted from
            the end). Slices follow the usual Python rules for omitted,
            negative and out-of-range bounds but must have a step of 1.

        Returns
        -------
        A view of ``data`` for an integer index, or a new ``GroupedArray``
        holding copies of the selected data and boundaries for a slice.

        Raises
        ------
        IndexError
            If an integer index is out of range.
        ValueError
            If ``idx`` is neither an integer nor a slice, or the slice step is
            not 1.
        """
        if isinstance(idx, (int, np.integer)):
            if idx < 0:
                idx += self.n_groups
            if not 0 <= idx < self.n_groups:
                raise IndexError(
                    f'group index out of range for GroupedArray with {self.n_groups} groups'
                )
            return self.data[self.indptr[idx] : self.indptr[idx + 1]]
        elif isinstance(idx, slice):
            # slice.indices resolves None / negative / out-of-range bounds.
            start, stop, step = idx.indices(self.n_groups)
            if step != 1:
                raise ValueError(f'GroupedArray slices must have a step of 1, got {step}')
            stop = max(start, stop)  # an inverted range selects no groups
            # k groups need k + 1 boundaries, hence the extra entry.
            new_indptr = self.indptr[start : stop + 1].copy()
            new_data = self.data[new_indptr[0] : new_indptr[-1]].copy()
            new_indptr -= new_indptr[0]  # rebase boundaries to the copied data
            return GroupedArray(new_data, new_indptr)
        raise ValueError(f'idx must be either int or slice, got {type(idx)}')

    def __iter__(self):
        """Yield the groups in order, one slice of ``data`` per group."""
        for i in range(self.n_groups):
            yield self.data[self.indptr[i] : self.indptr[i + 1]]

    def __len__(self):
        return self.n_groups

    def __repr__(self):
        return f'GroupedArray(n_data={self.data.size:,}, n_groups={self.n_groups:,})'

    def compute_forecasts(self, h, func, *args):
        """Apply ``func(group, h, *args)`` to every group.

        Each call must produce ``h`` values (or something that broadcasts to
        them). The results are stored back to back in a float32 array of
        length ``h * n_groups``, in group order.
        """
        out = np.full(h * self.n_groups, np.nan, dtype=np.float32)
        for i, grp in enumerate(self):
            out[h * i : h * (i + 1)] = func(grp, h, *args)
        return out

    def split(self, n_chunks):
        """Split into at most ``n_chunks`` ``GroupedArray``s of consecutive groups.

        Chunk sizes follow ``np.array_split``. When ``n_chunks`` exceeds the
        number of groups the empty chunks are dropped, so fewer than
        ``n_chunks`` pieces are returned.
        """
        chunks = np.array_split(np.arange(self.n_groups), n_chunks)
        return [self[int(x[0]) : int(x[-1]) + 1] for x in chunks if x.size]

In [None]:
#export
def grouped_array_from_df(df):
    """Build a `GroupedArray` from a frame with `unique_id`, `ds` and `y`.

    The rows are ordered by `unique_id` and then `ds`. The `y` values become
    the float32 data array, and the cumulative per-`unique_id` row counts
    (with a leading 0) become the int32 group boundaries.
    """
    ordered = df.sort_values(['unique_id', 'ds'])
    group_sizes = ordered.groupby('unique_id').size()
    boundaries = np.append(0, group_sizes.cumsum()).astype(np.int32)
    values = ordered['y'].values.astype(np.float32)
    return GroupedArray(values, boundaries)

In [None]:
# Pack the demo series into a GroupedArray; its repr reports the number of
# values and the number of groups.
ga = grouped_array_from_df(series)
ga

In [None]:
# Time a single run: horizon 14, model `ses` called with the extra argument 0.1.
%time yy = ga.compute_forecasts(14, ses, 0.1)

In [None]:
# Repeat the same call under %timeit for an averaged timing.
%timeit ga.compute_forecasts(14, ses, 0.1)

In [None]:
def plot_fcst(ga, h, func, *args):
    """Plot the summed history of `series` together with the summed forecasts.

    Parameters
    ----------
    ga : GroupedArray
        Groups to forecast; expected to have been built from the global
        `series` frame, which this function reads directly.
    h : int
        Forecast horizon, in days.
    func : callable
        Model passed to `ga.compute_forecasts`.
    *args
        Extra positional arguments forwarded to `func`.

    Notes
    -----
    Every group is given the same `h` future dates, starting the day after
    the latest `ds` in `series`. This presumably relies on all series ending
    on the same date (the demo data is generated with `equal_ends=True`);
    verify before using it on other data.
    """
    last_date = series['ds'].max()
    # Forecast step 1 falls on the day after the last observation, so build
    # h + 1 daily stamps from the last date and drop the first one.
    future_dates = pd.date_range(last_date, periods=h + 1, freq='D')[1:]
    # compute_forecasts returns h values per group, in group order, so the
    # date block is repeated once per group.
    idx = pd.Index(np.tile(future_dates.values, ga.n_groups), name='ds')
    fcst = pd.Series(ga.compute_forecasts(h, func, *args), index=idx)
    ax = series.groupby('ds')['y'].sum().tail(50).plot(marker='.', figsize=(16, 6))
    fcst.groupby('ds').sum().plot(ax=ax, marker='.')

In [None]:
# Timed plot of the summed 14-step forecasts from `ses` (extra argument 0.1).
%time plot_fcst(ga, 14, ses, 0.1)

In [None]:
# Timed plot of the summed 14-step forecasts from `historic_average`.
%time plot_fcst(ga, 14, historic_average)

In [None]:
# Timed plot of the summed 14-step forecasts from `croston_classic`.
%time plot_fcst(ga, 14, croston_classic)

In [None]:
# Timed plot of the summed 14-step forecasts from `adida`.
%time plot_fcst(ga, 14, adida)

In [None]:
# Split the groups into 5 chunks; each chunk is itself a GroupedArray.
ga.split(5)

In [None]:
#exporti
import inspect

def _build_forecast_name(model, *args) -> str:
    model_name = f'{model.__name__}'
    func_params = inspect.signature(model).parameters
    func_args = list(func_params.items())[2:]  # remove input array and horizon
    changed_params = [
        f'{name}-{value}'
        for value, (name, arg) in zip(args, func_args)
        if arg.default != value
    ]
    if changed_params:
        model_name += '_' + '_'.join(changed_params)
    return model_name

def _as_tuple(x):
    if isinstance(x, tuple):
        return x
    return (x, )

In [None]:
def compute_forecast(ga, h, model, *args):
    """Call `ga.compute_forecasts(h, model, *args)` and return its result.

    A plain module-level wrapper around the method (presumably so the call
    can be handed to worker processes; confirm against the callers).
    """
    forecasts = ga.compute_forecasts(h, model, *args)
    return forecasts

In [None]:
from concurrent.futures import ProcessPoolExecutor
from functools import partial

class StatsForecast:
    """Run a collection of forecasting models over every series in a frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data, converted once with `grouped_array_from_df`.
    models : iterable
        Model specifications. Each entry is either a model function or a
        tuple `(model, arg1, arg2, ...)` whose extra values are passed to the
        model after the series and the horizon.
    n_jobs : int, default 1
        Number of chunks and worker processes used by
        `data_parallel_forecast`.
    """

    def __init__(self, df, models, n_jobs=1):
        self.ga = grouped_array_from_df(df)
        self.models = models
        self.n_jobs = n_jobs

    def forecast(self, h):
        """Compute `h`-step forecasts for every model in the current process.

        Returns a DataFrame with one column per model specification, named by
        `_build_forecast_name`. Each column holds `h` values per group, in
        group order.
        """
        fcsts = {}
        logger.info('Computing forecasts')
        for model_args in self.models:
            model, *args = _as_tuple(model_args)
            model_name = _build_forecast_name(model, *args)
            fcsts[model_name] = self.ga.compute_forecasts(h, model, *args)
            logger.info(f'Computed forecasts for {model_name}.')
        return pd.DataFrame(fcsts)

    def data_parallel_forecast(self, h):
        """Compute the same forecasts as `forecast`, using worker processes.

        The groups are split into `n_jobs` chunks. For each model, one task
        per chunk is submitted to a `ProcessPoolExecutor`, and the chunk
        results are concatenated in chunk order so the layout matches
        `forecast`.
        """
        fcsts = {}
        logger.info('Computing forecasts')
        gas = self.ga.split(self.n_jobs)
        with ProcessPoolExecutor(self.n_jobs) as executor:
            for model_args in self.models:
                model, *args = _as_tuple(model_args)
                model_name = _build_forecast_name(model, *args)
                futures = [
                    executor.submit(ga.compute_forecasts, h, model, *args)
                    for ga in gas
                ]
                # Collect in submission order to keep the groups aligned.
                fcsts[model_name] = np.hstack([f.result() for f in futures])
                logger.info(f'Computed forecasts for {model_name}.')
        return pd.DataFrame(fcsts)

In [None]:
# Six model specifications (three of them `ses` with different extra
# arguments); n_jobs=2 sets the chunk/worker count of the parallel path.
fcst = StatsForecast(series, [adida, (ses, 0.1), (ses, 0.2), (ses, 0.3), historic_average, croston_classic], n_jobs=2)

In [None]:
# Timed run in the current process; the result has one column per model
# specification.
%time res1 = fcst.forecast(14)
res1

In [None]:
# Timed run of the same forecasts over chunks of the series in worker
# processes.
%time res2 = fcst.data_parallel_forecast(14)
res2

In [None]:
# The parallel path must reproduce the results of the in-process path.
pd.testing.assert_frame_equal(res1, res2)