In [None]:
#default_exp core

In [None]:
#hide
%load_ext autoreload
%autoreload 2

# Core

In [None]:
#export
import inspect
import logging
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import pandas as pd

In [None]:
#exporti
# Configure the root logger: timestamped messages at INFO level and above.
# NOTE(review): `basicConfig` acts on the root logger, so if this cell is
# exported it would run on import of the generated module and affect the
# importing application's logging — confirm that is intended.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)
# Logger used below to report forecasting progress.
logger = logging.getLogger(__name__)

In [None]:
from fastcore.test import test_eq

from statsforecast.models import (
    adida,
    croston_classic,
    historic_average,
    naive,
    seasonal_naive,
    seasonal_window_average,
    ses,
)
from statsforecast.utils import generate_series

In [None]:
# Synthetic panel of 10,000 series used by the tests and the daily example below.
# `equal_ends=False` presumably lets each series end on a different date — TODO confirm against `generate_series`.
series = generate_series(10_000, equal_ends=False)
series

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-01-01,0.123838
0,2000-01-02,1.460113
0,2000-01-03,2.418577
0,2000-01-04,3.372232
0,2000-01-05,4.241948
...,...,...
9999,2000-06-13,2.338823
9999,2000-06-14,3.402687
9999,2000-06-15,4.235569
9999,2000-06-16,5.210396


In [None]:
#exporti
class GroupedArray:
    """Flat array of values partitioned into consecutive groups.

    `data` stores the values of every group back to back and `indptr` stores
    the group boundaries: group `i` is `data[indptr[i] : indptr[i + 1]]`, so
    there are `indptr.size - 1` groups.
    """

    def __init__(self, data, indptr):
        self.data = data
        self.indptr = indptr
        self.n_groups = self.indptr.size - 1

    def __getitem__(self, idx):
        """Return one group (integer index) or a new `GroupedArray` (slice).

        Integer indices may be Python ints or NumPy integers and may be
        negative (counted from the end). Slices follow the usual Python
        semantics over groups (open ends and negative bounds allowed) but
        must have step 1.

        Raises
        ------
        IndexError
            If an integer index is out of range.
        ValueError
            If `idx` is neither an integer nor a slice, or the slice step is not 1.
        """
        if isinstance(idx, (int, np.integer)):
            if idx < 0:
                idx += self.n_groups
            # Explicit bounds check: without it a negative index would wrap
            # around inside `indptr` and silently return an empty array.
            if not 0 <= idx < self.n_groups:
                raise IndexError(f'group index out of range for {self.n_groups} groups')
            return self.data[self.indptr[idx] : self.indptr[idx + 1]]
        elif isinstance(idx, slice):
            # Normalise None / negative bounds against the number of groups.
            start, stop, step = idx.indices(self.n_groups)
            if step != 1:
                raise ValueError(f'only slices with step 1 are supported, got step={step}')
            stop = max(start, stop)  # an empty selection keeps a single boundary
            # k groups are delimited by k + 1 boundaries, hence `stop + 1`.
            new_indptr = self.indptr[start : stop + 1].copy()
            new_data = self.data[new_indptr[0] : new_indptr[-1]].copy()
            new_indptr -= new_indptr[0]
            return GroupedArray(new_data, new_indptr)
        raise ValueError(f'idx must be either int or slice, got {type(idx)}')

    def __iter__(self):
        """Yield every group in order."""
        for i in range(self.n_groups):
            yield self[i]

    def __len__(self):
        return self.n_groups

    def __repr__(self):
        return f'GroupedArray(n_data={self.data.size:,}, n_groups={self.n_groups:,})'

    def __eq__(self, other):
        """Compare by values (`np.allclose` on data) and exact group boundaries."""
        if not hasattr(other, 'data') or not hasattr(other, 'indptr'):
            return False
        # `np.allclose` broadcasts (and raises on incompatible shapes), so
        # differently shaped data must be rejected up front.
        if np.shape(self.data) != np.shape(other.data):
            return False
        return bool(
            np.allclose(self.data, other.data)
            and np.array_equal(self.indptr, other.indptr)
        )

    def compute_forecasts(self, h, func, *args):
        """Apply `func(group, h, *args)` to every group.

        Returns a flat float32 array of size `h * n_groups`; the result for
        group `i` is written to positions `h * i : h * (i + 1)`.
        """
        out = np.full(h * self.n_groups, np.nan, dtype=np.float32)
        for i, grp in enumerate(self):
            out[h * i : h * (i + 1)] = func(grp, h, *args)
        return out

    def split(self, n_chunks):
        """Split into at most `n_chunks` `GroupedArray`s of contiguous groups.

        When `n_chunks` exceeds the number of groups, `np.array_split` yields
        empty chunks; those are dropped, so fewer parts are returned.
        """
        chunks = np.array_split(range(self.n_groups), n_chunks)
        return [self[int(x[0]) : int(x[-1]) + 1] for x in chunks if x.size]

In [None]:
#hide
# Three groups of four values each: [0..3], [4..7], [8..11].
data = np.arange(12)
indptr = np.array([0, 4, 8, 12])
ga = GroupedArray(data, indptr)

test_eq(len(ga), 3)
# The expected forecast is the last value of each group repeated over the horizon (h=2).
np.testing.assert_equal(
    ga.compute_forecasts(2, naive),
    np.hstack([2 * [data[i]] for i in indptr[1:] - 1]),
)
# Splitting 3 groups in 2 chunks gives the first two groups and then the last one,
# with the second chunk's boundaries re-based to start at 0.
splits = ga.split(2)
test_eq(splits[0], GroupedArray(data[:8], indptr[:3]))
test_eq(splits[1], GroupedArray(data[8:], np.array([0, 4])))

In [None]:
#exporti
def _grouped_array_from_df(df):
    """Turn a long-format frame into a `GroupedArray` plus its ids and last dates.

    `df` must carry a `unique_id` index level and the columns `ds` and `y`.
    Rows are sorted by (`unique_id`, `ds`) when they are not already ordered.

    Returns
    -------
    tuple
        `(grouped_array, ids, last_dates)`: the float32 values grouped by
        series, the series identifiers and the last `ds` of every series.
    """
    indexed = df.set_index('ds', append=True)
    if not indexed.index.is_monotonic_increasing:
        indexed = indexed.sort_index()
    values = indexed['y'].values.astype(np.float32)
    # Number of rows per series, in the order in which the ids are stored.
    counts = indexed.index.get_level_values('unique_id').value_counts(sort=False)
    ends = counts.values.cumsum()
    # The row just before each boundary is the last observation of that series.
    last_dates = indexed.index.get_level_values('ds')[ends - 1]
    boundaries = np.append(0, ends).astype(np.int32)
    return GroupedArray(values, boundaries), counts.index, last_dates

In [None]:
#hide
# Shuffle the rows to check that `_grouped_array_from_df` restores the (unique_id, ds) order.
sorted_series = series.sort_values(['unique_id', 'ds'])
unsorted_series = sorted_series.sample(frac=1.0)
ga, indices, dates = _grouped_array_from_df(unsorted_series)

# Values, ids and last dates must match the ones of the sorted frame.
np.testing.assert_allclose(ga.data, sorted_series['y'].values)
test_eq(indices, sorted_series.index.unique(level='unique_id'))
test_eq(dates, series.groupby('unique_id')['ds'].max().values)

In [None]:
#exporti
def _build_forecast_name(model, *args) -> str:
    model_name = f'{model.__name__}'
    func_params = inspect.signature(model).parameters
    func_args = list(func_params.items())[2:]  # remove input array and horizon
    changed_params = [
        f'{name}-{value}'
        for value, (name, arg) in zip(args, func_args)
        if arg.default != value
    ]
    if changed_params:
        model_name += '_' + '_'.join(changed_params)
    return model_name

In [None]:
#hide
# Arguments that differ from the signature defaults are appended as `<name>-<value>`.
test_eq(_build_forecast_name(ses, 0.1), 'ses_alpha-0.1')
test_eq(_build_forecast_name(seasonal_window_average, 7, 4), 'seasonal_window_average_season_length-7_window_size-4')

In [None]:
#exporti
def _as_tuple(x):
    if isinstance(x, tuple):
        return x
    return (x,)

In [None]:
#hide
# Tuples pass through untouched; any other value is wrapped in a 1-tuple.
test_eq(_as_tuple((1,)), (1,))
test_eq(_as_tuple(1), (1,))

In [None]:
#export
class StatsForecast:
    """Compute forecasts for many series with a list of model functions.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with a `unique_id` index and the columns `ds`
        (datestamp, datetime-like or integer) and `y` (target).
    models : list
        Each element is either a model function or a tuple
        `(model, *args)`. Models are called as `model(y, h, *args)` on the
        values of every series.
    freq : str or pandas offset
        Frequency of the data, used to build the future datestamps when `ds`
        is not of integer type.
    n_jobs : int, default 1
        Number of processes. With 1 the forecasts are computed sequentially
        in the current process.
    """

    def __init__(self, df, models, freq, n_jobs=1):
        self.ga, self.uids, self.last_dates = _grouped_array_from_df(df)
        self.models = models
        self.freq = pd.tseries.frequencies.to_offset(freq)
        self.n_jobs = n_jobs

    def forecast(self, h):
        """Forecast the next `h` periods of every series with every model.

        Returns a `pandas.DataFrame` indexed by `unique_id` with a `ds`
        column holding the future datestamps and one column per model.
        """
        if self.n_jobs == 1:
            fcsts = self._sequential_forecast(h)
        else:
            fcsts = self._data_parallel_forecast(h)
        if issubclass(self.last_dates.dtype.type, np.integer):
            # Integer datestamps simply continue counting; `freq` is not used.
            dates = np.hstack([
                np.arange(last_date + 1, last_date + 1 + h, dtype=self.last_dates.dtype)
                for last_date in self.last_dates
            ])
        else:
            dates = np.hstack([
                pd.date_range(last_date + self.freq, periods=h, freq=self.freq)
                for last_date in self.last_dates
            ])
        idx = pd.Index(np.repeat(self.uids, h), name='unique_id')
        return pd.DataFrame({'ds': dates, **fcsts}, index=idx)

    def _sequential_forecast(self, h):
        """Compute the forecasts of every model in the current process."""
        fcsts = {}
        logger.info('Computing forecasts')
        for model_args in self.models:
            model, *args = _as_tuple(model_args)
            model_name = _build_forecast_name(model, *args)
            fcsts[model_name] = self.ga.compute_forecasts(h, model, *args)
            logger.info(f'Computed forecasts for {model_name}.')
        return fcsts

    def _data_parallel_forecast(self, h):
        """Compute the forecasts by spreading chunks of series over processes."""
        fcsts = {}
        logger.info('Computing forecasts')
        # Never ask for more chunks than there are series: the extra chunks
        # would be empty and cannot be sliced out of the grouped array.
        n_chunks = min(self.n_jobs, self.ga.n_groups)
        gas = self.ga.split(n_chunks)
        with ProcessPoolExecutor(n_chunks) as executor:
            for model_args in self.models:
                model, *args = _as_tuple(model_args)
                model_name = _build_forecast_name(model, *args)
                futures = []
                for ga in gas:
                    future = executor.submit(ga.compute_forecasts, h, model, *args)
                    futures.append(future)
                # Results are gathered in submission order so they line up with `self.uids`.
                fcsts[model_name] = np.hstack([f.result() for f in futures])
                logger.info(f'Computed forecasts for {model_name}.')
        return fcsts

## Daily data

In [None]:
# Models are given either as a bare function or as a `(function, *args)` tuple, e.g. `(ses, 0.1)`.
fcst = StatsForecast(
    series,
    [adida, (ses, 0.1), historic_average, croston_classic],
    freq='D',
)
# Forecast 14 days ahead for every series.
%time res = fcst.forecast(14)
res

2022-02-17 00:19:49 __main__ INFO: Computing forecasts
2022-02-17 00:19:53 __main__ INFO: Computed forecasts for adida.
2022-02-17 00:19:53 __main__ INFO: Computed forecasts for ses_alpha-0.1.
2022-02-17 00:19:54 __main__ INFO: Computed forecasts for historic_average.
2022-02-17 00:19:54 __main__ INFO: Computed forecasts for croston_classic.


CPU times: user 5.46 s, sys: 58.9 ms, total: 5.52 s
Wall time: 5.48 s


Unnamed: 0_level_0,ds,adida,ses_alpha-0.1,historic_average,croston_classic
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2000-08-10,3.151185,3.151185,3.220805,3.151185
0,2000-08-11,3.151185,3.151185,3.220805,3.151185
0,2000-08-12,3.151185,3.151185,3.220805,3.151185
0,2000-08-13,3.151185,3.151185,3.220805,3.151185
0,2000-08-14,3.151185,3.151185,3.220805,3.151185
...,...,...,...,...,...
9999,2000-06-27,3.651947,3.651947,3.261435,3.651947
9999,2000-06-28,3.651947,3.651947,3.261435,3.651947
9999,2000-06-29,3.651947,3.651947,3.261435,3.651947
9999,2000-06-30,3.651947,3.651947,3.261435,3.651947


In [None]:
#hide
# Every series must be present, and its forecast dates must span from one day
# to 14 days after its last observed date.
test_eq(res.index.unique(), fcst.uids)
last_dates = series.groupby('unique_id')['ds'].max()
test_eq(res.groupby('unique_id')['ds'].min().values, last_dates + pd.offsets.Day())
test_eq(res.groupby('unique_id')['ds'].max().values, last_dates + 14 * pd.offsets.Day())

## Monthly data

In [None]:
# Synthetic monthly panel; the length bounds are passed as `min_length`/`max_length`.
# `equal_ends=True` presumably makes all series end on the same date — TODO confirm against `generate_series`.
monthly_series = generate_series(10_000, freq='M', min_length=10, max_length=20, equal_ends=True)
monthly_series

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-06-30,0.317078
0,2000-07-31,1.183993
0,2000-08-31,2.458650
0,2000-09-30,3.396637
0,2000-10-31,4.160418
...,...,...
9999,2001-04-30,7.087452
9999,2001-05-31,8.106541
9999,2001-06-30,9.162334
9999,2001-07-31,10.052648


In [None]:
# Same models as in the daily example, now with a month-end frequency.
fcst = StatsForecast(
    monthly_series,
    [adida, (ses, 0.1), historic_average, croston_classic],
    freq='M',
)
# Forecast 4 months ahead for every series.
%time monthly_res = fcst.forecast(4)
monthly_res

2022-02-17 00:19:55 __main__ INFO: Computing forecasts
2022-02-17 00:19:58 __main__ INFO: Computed forecasts for adida.
2022-02-17 00:19:58 __main__ INFO: Computed forecasts for ses_alpha-0.1.
2022-02-17 00:19:58 __main__ INFO: Computed forecasts for historic_average.
2022-02-17 00:19:58 __main__ INFO: Computed forecasts for croston_classic.


CPU times: user 4.11 s, sys: 5.45 ms, total: 4.12 s
Wall time: 4.12 s


Unnamed: 0_level_0,ds,adida,ses_alpha-0.1,historic_average,croston_classic
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2001-09-30,4.069533,4.101110,4.895469,4.101110
0,2001-10-31,4.069533,4.101110,4.895469,4.101110
0,2001-11-30,4.069533,4.101110,4.895469,4.101110
0,2001-12-31,4.069533,4.101110,4.895469,4.101110
1,2001-09-30,6.407427,5.631709,6.588876,5.631709
...,...,...,...,...,...
9998,2001-12-31,4.352780,5.783700,5.083481,5.783700
9999,2001-09-30,7.166113,7.166113,6.789791,7.166113
9999,2001-10-31,7.166113,7.166113,6.789791,7.166113
9999,2001-11-30,7.166113,7.166113,6.789791,7.166113


In [None]:
#hide
# The expected dates are derived from the input frame rather than from
# `fcst.last_dates`, so the check does not depend on the object under test.
last_dates = monthly_series.groupby('unique_id')['ds'].max()
test_eq(monthly_res.groupby('unique_id')['ds'].min().values, (last_dates + pd.offsets.MonthEnd()).values)
test_eq(monthly_res.groupby('unique_id')['ds'].max().values, (last_dates + 4 * pd.offsets.MonthEnd()).values)

## Integer datestamp

In [None]:
from statsforecast.utils import AirPassengers as ap

In [None]:
# Single series whose datestamps are the integers 1..len(ap), indexed by `unique_id`.
int_ds_df = (
    pd.DataFrame({'ds': np.arange(1, len(ap) + 1), 'y': ap})
    .assign(unique_id='AirPassengers')
    .set_index('unique_id')
)
int_ds_df.head()

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AirPassengers,1,112.0
AirPassengers,2,118.0
AirPassengers,3,132.0
AirPassengers,4,129.0
AirPassengers,5,121.0


In [None]:
# The last rows show the final integer datestamp of the series.
int_ds_df.tail()

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AirPassengers,140,606.0
AirPassengers,141,508.0
AirPassengers,142,461.0
AirPassengers,143,390.0
AirPassengers,144,432.0


In [None]:
# `freq` is still a required argument, but with integer datestamps `forecast`
# builds the future `ds` by counting on from the last value and does not use it.
fcst = StatsForecast(int_ds_df, models=[historic_average], freq='D')
horizon = 7
forecast = fcst.forecast(horizon)
forecast.head()

2022-02-17 00:25:11 __main__ INFO: Computing forecasts
2022-02-17 00:25:11 __main__ INFO: Computed forecasts for historic_average.


Unnamed: 0_level_0,ds,historic_average
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AirPassengers,145,280.298615
AirPassengers,146,280.298615
AirPassengers,147,280.298615
AirPassengers,148,280.298615
AirPassengers,149,280.298615


In [None]:
# The forecast datestamps must be the `horizon` integers following the last observed one.
last_date = int_ds_df['ds'].max()
test_eq(forecast['ds'].values, np.arange(last_date + 1, last_date + 1 + horizon))