In [13]:
#default_exp core

In [14]:
#hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Core

In [15]:
#export
import inspect
import logging
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from itertools import chain

import numpy as np
import pandas as pd
from numba import njit

from statsforecast.utils import generate_series

In [16]:
#exporti
# Configure logging so that progress messages are printed with a timestamp,
# the logger name and the level; INFO is the minimum level shown.
# NOTE(review): `basicConfig` acts on the root logger, so importing this module
# also affects the logging setup of the host application -- confirm this is intended.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
# Module-level logger; StatsForecast uses it to report forecasting progress.
logger = logging.getLogger(__name__)

In [17]:
from statsforecast.models import adida, ses, historic_average, croston_classic

In [18]:
series = generate_series(10_000, equal_ends=True)
series

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-10-05,0.123838
0,2000-10-06,1.460113
0,2000-10-07,2.418577
0,2000-10-08,3.372232
0,2000-10-09,4.241948
...,...,...
9999,2001-05-10,2.338823
9999,2001-05-11,3.402687
9999,2001-05-12,4.235569
9999,2001-05-13,5.210396


In [19]:
#export
class GroupedArray:
    """Variable-length groups stored back to back in a single flat array.

    Group ``i`` occupies ``data[indptr[i] : indptr[i + 1]]``, therefore
    ``indptr`` holds one more entry than there are groups.

    Parameters
    ----------
    data : array
        Values of all the groups, concatenated.
    indptr : array of int
        Boundaries of each group inside ``data``.
    """

    def __init__(self, data, indptr):
        self.data = data
        self.indptr = indptr
        self.n_groups = self.indptr.size - 1

    def __getitem__(self, idx):
        """Return one group (integer index) or a sub-collection (slice).

        Parameters
        ----------
        idx : int, numpy integer or slice
            Integers may be negative (counted from the end). Slices may be
            open-ended or use negative bounds, but must have a unit step.

        Returns
        -------
        array or GroupedArray
            A view of ``data`` for an integer index; a new ``GroupedArray``
            holding copies of the selected groups for a slice.

        Raises
        ------
        IndexError
            If an integer index is out of range.
        ValueError
            If ``idx`` is neither an integer nor a slice, or if the slice has
            a step other than 1.
        """
        if isinstance(idx, (int, np.integer)):
            pos = idx + self.n_groups if idx < 0 else idx
            if not 0 <= pos < self.n_groups:
                # IndexError is also what terminates `for grp in self`.
                raise IndexError(
                    f'group index {idx} is out of range for {self.n_groups} groups'
                )
            return self.data[self.indptr[pos] : self.indptr[pos + 1]]
        elif isinstance(idx, slice):
            # Resolve None / negative / out-of-range bounds against the number
            # of groups (not against the length of indptr).
            start, stop, step = idx.indices(self.n_groups)
            if step != 1:
                # Striding over indptr would merge neighbouring groups.
                raise ValueError(f'slices must have a step of 1, got {idx.step}')
            stop = max(start, stop)  # an empty selection keeps a single boundary
            # Selecting groups [start, stop) requires stop - start + 1 boundaries.
            new_indptr = self.indptr[start : stop + 1].copy()
            new_data = self.data[new_indptr[0] : new_indptr[-1]].copy()
            new_indptr -= new_indptr[0]  # make the boundaries relative to new_data
            return GroupedArray(new_data, new_indptr)
        raise ValueError(f'idx must be either int or slice, got {type(idx)}')

    def __len__(self):
        """Number of groups."""
        return self.n_groups

    def __repr__(self):
        return f'GroupedArray(n_data={self.data.size:,}, n_groups={self.n_groups:,})'

    def compute_forecasts(self, h, func, *args):
        """Apply ``func`` to every group and concatenate the results.

        Parameters
        ----------
        h : int
            Number of values reserved in the output for each group.
        func : callable
            Called as ``func(group, h, *args)``; its result is written into
            the ``h`` output slots of the group.
        *args
            Extra positional arguments forwarded to ``func``.

        Returns
        -------
        numpy.ndarray
            float32 array of size ``h * n_groups``; the values of group ``i``
            are at ``out[h * i : h * (i + 1)]``.
        """
        out = np.full(h * self.n_groups, np.nan, dtype=np.float32)
        for i, grp in enumerate(self):
            out[h * i : h * (i + 1)] = func(grp, h, *args)
        return out

    def split(self, n_chunks):
        """Partition the groups into at most ``n_chunks`` contiguous parts.

        When ``n_chunks`` exceeds the number of groups, ``np.array_split``
        yields empty chunks; those are dropped, so fewer than ``n_chunks``
        parts may be returned.

        Returns
        -------
        list of GroupedArray
        """
        return [
            self[x[0] : x[-1] + 1]
            for x in np.array_split(range(self.n_groups), n_chunks)
            if x.size
        ]

In [20]:
#exporti
def _grouped_array_from_df(df):
    """Turn a long-format frame into a GroupedArray plus its bookkeeping.

    The frame is expected to be indexed by series identifier and to have a
    `ds` column and a `y` column. Rows are ordered by (identifier, ds) before
    the values are extracted.

    Returns a tuple with:
    - the GroupedArray holding the `y` values as float32,
    - the index of series identifiers, one entry per group,
    - a DatetimeIndex with the last `ds` of each group.
    """
    indexed = df.set_index('ds', append=True)
    if not indexed.index.is_monotonic_increasing:
        indexed = indexed.sort_index()
    values = indexed['y'].values.astype(np.float32)
    by_series = indexed.reset_index('ds')
    # rows per series, kept unsorted by frequency
    counts = by_series.index.value_counts(sort=False)
    ends = counts.values.cumsum()
    # the row just before each boundary is the last observation of a series
    last_dates = pd.DatetimeIndex(by_series['ds'].values[ends - 1])
    indptr = np.append(0, ends).astype(np.int32)
    return GroupedArray(values, indptr), counts.index, last_dates

In [21]:
%time ga, *_ = _grouped_array_from_df(series)
ga

CPU times: user 409 ms, sys: 80.1 ms, total: 489 ms
Wall time: 488 ms


GroupedArray(n_data=2,769,354, n_groups=10,000)

In [22]:
%time yy = ga.compute_forecasts(14, ses, 0.1)

CPU times: user 429 ms, sys: 3.66 ms, total: 433 ms
Wall time: 443 ms


In [23]:
%timeit ga.compute_forecasts(14, ses, 0.1)

36.7 ms ± 2.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
#exporti
def _build_forecast_name(model, *args) -> str:
    model_name = f'{model.__name__}'
    func_params = inspect.signature(model).parameters
    func_args = list(func_params.items())[2:]  # remove input array and horizon
    changed_params = [
        f'{name}-{value}'
        for value, (name, arg) in zip(args, func_args)
        if arg.default != value
    ]
    if changed_params:
        model_name += '_' + '_'.join(changed_params)
    return model_name

def _as_tuple(x):
    if isinstance(x, tuple):
        return x
    return (x, )

In [25]:
# Scratch check: np.repeat repeats each element consecutively, which is how
# StatsForecast.forecast expands the series ids to one entry per forecast step.
np.repeat(np.array([1, 2]), 2)

array([1, 1, 2, 2])

In [26]:
# Scratch check: np.hstack joins several date ranges into one flat datetime64
# array, the same pattern StatsForecast.forecast uses to build the `ds` column.
np.hstack([pd.date_range('2021-01-01', periods=2, freq='D'), pd.date_range('2021-01-01', periods=2, freq='D')])

array(['2021-01-01T00:00:00.000000000', '2021-01-02T00:00:00.000000000',
       '2021-01-01T00:00:00.000000000', '2021-01-02T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [27]:
#export
class StatsForecast:
    """Run a collection of forecasting functions over many series.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data, converted with `_grouped_array_from_df`.
    models : list
        Each entry is either a function or a tuple `(function, *args)`.
        Functions are called as `function(series, h, *args)`.
    freq : str or offset
        Frequency of the data, parsed with `pd.tseries.frequencies.to_offset`.
    n_jobs : int, default 1
        With 1 the forecasts are computed in the current process; any other
        value splits the series across a pool of that many processes.
    """

    def __init__(self, df, models, freq, n_jobs=1):
        grouped, uids, last_dates = _grouped_array_from_df(df)
        self.ga = grouped
        self.uids = uids
        self.last_dates = last_dates
        self.models = models
        self.freq = pd.tseries.frequencies.to_offset(freq)
        self.n_jobs = n_jobs

    def forecast(self, h):
        """Forecast `h` steps ahead for every series with every model.

        Returns a DataFrame indexed by `unique_id` with a `ds` column followed
        by one column per model.
        """
        run = self._sequential_forecast if self.n_jobs == 1 else self._data_parallel_forecast
        fcsts = run(h)
        # one date range per series, starting one period after its last date
        future_dates = [
            pd.date_range(last_date + self.freq, periods=h, freq=self.freq)
            for last_date in self.last_dates
        ]
        columns = {'ds': np.hstack(future_dates)}
        columns.update(fcsts)
        uid_index = pd.Index(np.repeat(self.uids, h), name='unique_id')
        return pd.DataFrame(columns, index=uid_index)

    def _sequential_forecast(self, h):
        """Compute every model in the current process; returns {name: values}."""
        results = {}
        logger.info('Computing forecasts')
        for spec in self.models:
            model, *args = _as_tuple(spec)
            model_name = _build_forecast_name(model, *args)
            results[model_name] = self.ga.compute_forecasts(h, model, *args)
            logger.info(f'Computed forecasts for {model_name}.')
        return results

    def _data_parallel_forecast(self, h):
        """Compute every model over chunks of series in a process pool.

        Returns {name: values}, with the chunk results concatenated in the
        order the chunks were submitted.
        """
        results = {}
        logger.info('Computing forecasts')
        chunks = self.ga.split(self.n_jobs)
        with ProcessPoolExecutor(self.n_jobs) as pool:
            for spec in self.models:
                model, *args = _as_tuple(spec)
                model_name = _build_forecast_name(model, *args)
                pending = [
                    pool.submit(chunk.compute_forecasts, h, model, *args)
                    for chunk in chunks
                ]
                results[model_name] = np.hstack([job.result() for job in pending])
                logger.info(f'Computed forecasts for {model_name}.')
        return results

In [28]:
fcst = StatsForecast(
    series,
    [adida, (ses, 0.1), (ses, 0.2), (ses, 0.3), historic_average, croston_classic],
    freq='D',
)
%time res1 = fcst.forecast(14)
res1

2021-12-06 16:36:01 __main__ INFO: Computing forecasts
2021-12-06 16:36:08 __main__ INFO: Computed forecasts for adida.
2021-12-06 16:36:08 __main__ INFO: Computed forecasts for ses_alpha-0.1.
2021-12-06 16:36:08 __main__ INFO: Computed forecasts for ses_alpha-0.2.
2021-12-06 16:36:08 __main__ INFO: Computed forecasts for ses_alpha-0.3.
2021-12-06 16:36:09 __main__ INFO: Computed forecasts for historic_average.
2021-12-06 16:36:10 __main__ INFO: Computed forecasts for croston_classic.


CPU times: user 9.76 s, sys: 21.5 ms, total: 9.78 s
Wall time: 9.85 s


Unnamed: 0_level_0,ds,adida,ses_alpha-0.1,ses_alpha-0.2,ses_alpha-0.3,historic_average,croston_classic
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2001-05-15,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
0,2001-05-16,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
0,2001-05-17,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
0,2001-05-18,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
0,2001-05-19,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
...,...,...,...,...,...,...,...
9999,2001-05-24,3.651947,3.651947,4.103254,4.544015,3.261435,3.651947
9999,2001-05-25,3.651947,3.651947,4.103254,4.544015,3.261435,3.651947
9999,2001-05-26,3.651947,3.651947,4.103254,4.544015,3.261435,3.651947
9999,2001-05-27,3.651947,3.651947,4.103254,4.544015,3.261435,3.651947


In [29]:
fcst = StatsForecast(
    series,
    [adida, (ses, 0.1), (ses, 0.2), (ses, 0.3), historic_average, croston_classic],
    freq='D',
    n_jobs=2,
)
%time res2 = fcst.forecast(14)
res2

2021-12-06 16:36:11 __main__ INFO: Computing forecasts
2021-12-06 16:36:15 __main__ INFO: Computed forecasts for adida.
2021-12-06 16:36:15 __main__ INFO: Computed forecasts for ses_alpha-0.1.
2021-12-06 16:36:15 __main__ INFO: Computed forecasts for ses_alpha-0.2.
2021-12-06 16:36:15 __main__ INFO: Computed forecasts for ses_alpha-0.3.
2021-12-06 16:36:15 __main__ INFO: Computed forecasts for historic_average.
2021-12-06 16:36:17 __main__ INFO: Computed forecasts for croston_classic.


CPU times: user 938 ms, sys: 68.4 ms, total: 1.01 s
Wall time: 6.28 s


Unnamed: 0_level_0,ds,adida,ses_alpha-0.1,ses_alpha-0.2,ses_alpha-0.3,historic_average,croston_classic
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2001-05-15,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
0,2001-05-16,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
0,2001-05-17,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
0,2001-05-18,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
0,2001-05-19,3.151185,3.151185,3.126454,3.162569,3.220805,3.151185
...,...,...,...,...,...,...,...
9999,2001-05-24,3.651947,3.651947,4.103254,4.544015,3.261435,3.651947
9999,2001-05-25,3.651947,3.651947,4.103254,4.544015,3.261435,3.651947
9999,2001-05-26,3.651947,3.651947,4.103254,4.544015,3.261435,3.651947
9999,2001-05-27,3.651947,3.651947,4.103254,4.544015,3.261435,3.651947


In [30]:
# The parallel run (n_jobs=2) must produce exactly the same frame as the sequential run.
pd.testing.assert_frame_equal(res1, res2)

In [41]:
# test monthly data
series = generate_series(10_000, freq='M', min_length=10, max_length=20, equal_ends=True)
series

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-06-30,0.317078
0,2000-07-31,1.183993
0,2000-08-31,2.458650
0,2000-09-30,3.396637
0,2000-10-31,4.160418
...,...,...
9999,2001-04-30,7.087452
9999,2001-05-31,8.106541
9999,2001-06-30,9.162334
9999,2001-07-31,10.052648


In [42]:
fcst = StatsForecast(
    series,
    [adida, (ses, 0.1), (ses, 0.2), (ses, 0.3), historic_average, croston_classic],
    freq='M',
)
%time res1 = fcst.forecast(4)
res1

2021-12-06 16:42:20 __main__ INFO: Computing forecasts
2021-12-06 16:42:27 __main__ INFO: Computed forecasts for adida.
2021-12-06 16:42:27 __main__ INFO: Computed forecasts for ses_alpha-0.1.
2021-12-06 16:42:27 __main__ INFO: Computed forecasts for ses_alpha-0.2.
2021-12-06 16:42:28 __main__ INFO: Computed forecasts for ses_alpha-0.3.
2021-12-06 16:42:28 __main__ INFO: Computed forecasts for historic_average.
2021-12-06 16:42:28 __main__ INFO: Computed forecasts for croston_classic.


CPU times: user 8.39 s, sys: 4.43 ms, total: 8.39 s
Wall time: 8.54 s


Unnamed: 0_level_0,ds,adida,ses_alpha-0.1,ses_alpha-0.2,ses_alpha-0.3,historic_average,croston_classic
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2001-09-30,4.069533,4.101110,4.601393,4.069533,4.895469,4.101110
0,2001-10-31,4.069533,4.101110,4.601393,4.069533,4.895469,4.101110
0,2001-11-30,4.069533,4.101110,4.601393,4.069533,4.895469,4.101110
0,2001-12-31,4.069533,4.101110,4.601393,4.069533,4.895469,4.101110
1,2001-09-30,6.407427,5.631709,6.417505,6.407427,6.588876,5.631709
...,...,...,...,...,...,...,...
9998,2001-12-31,4.352780,5.783700,4.686199,4.352780,5.083481,5.783700
9999,2001-09-30,7.166113,7.166113,8.153630,9.056795,6.789791,7.166113
9999,2001-10-31,7.166113,7.166113,8.153630,9.056795,6.789791,7.166113
9999,2001-11-30,7.166113,7.166113,8.153630,9.056795,6.789791,7.166113


In [43]:
fcst = StatsForecast(
    series,
    [adida, (ses, 0.1), (ses, 0.2), (ses, 0.3), historic_average, croston_classic],
    freq='M',
    n_jobs=2,
)
%time res2 = fcst.forecast(4)
res2

2021-12-06 16:42:29 __main__ INFO: Computing forecasts
2021-12-06 16:42:33 __main__ INFO: Computed forecasts for adida.
2021-12-06 16:42:33 __main__ INFO: Computed forecasts for ses_alpha-0.1.
2021-12-06 16:42:33 __main__ INFO: Computed forecasts for ses_alpha-0.2.
2021-12-06 16:42:33 __main__ INFO: Computed forecasts for ses_alpha-0.3.
2021-12-06 16:42:33 __main__ INFO: Computed forecasts for historic_average.
2021-12-06 16:42:33 __main__ INFO: Computed forecasts for croston_classic.


CPU times: user 1.31 s, sys: 44.8 ms, total: 1.36 s
Wall time: 5.04 s


Unnamed: 0_level_0,ds,adida,ses_alpha-0.1,ses_alpha-0.2,ses_alpha-0.3,historic_average,croston_classic
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2001-09-30,4.069533,4.101110,4.601393,4.069533,4.895469,4.101110
0,2001-10-31,4.069533,4.101110,4.601393,4.069533,4.895469,4.101110
0,2001-11-30,4.069533,4.101110,4.601393,4.069533,4.895469,4.101110
0,2001-12-31,4.069533,4.101110,4.601393,4.069533,4.895469,4.101110
1,2001-09-30,6.407427,5.631709,6.417505,6.407427,6.588876,5.631709
...,...,...,...,...,...,...,...
9998,2001-12-31,4.352780,5.783700,4.686199,4.352780,5.083481,5.783700
9999,2001-09-30,7.166113,7.166113,8.153630,9.056795,6.789791,7.166113
9999,2001-10-31,7.166113,7.166113,8.153630,9.056795,6.789791,7.166113
9999,2001-11-30,7.166113,7.166113,8.153630,9.056795,6.789791,7.166113


In [44]:
# Sequential and parallel (n_jobs=2) forecasts of the monthly data must match exactly.
pd.testing.assert_frame_equal(res1, res2)