In [1]:
#default_exp core

In [2]:
#hide
%load_ext autoreload
%autoreload 2

# Core

In [3]:
#hide
import warnings
# Hide every FutureWarning raised from here on.
warnings.filterwarnings('ignore', category=FutureWarning)

In [57]:
#export
import inspect
import logging
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import numpy as np
import pandas as pd
from tqdm import notebook

In [58]:
#exporti
# Logging format: timestamp, logger name and level precede every message.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
# Module-level logger used by the forecasting code below.
logger = logging.getLogger(__name__)

In [59]:
#hide
# While running the notebook, `logger` only emits messages at ERROR level or above.
logger.setLevel(logging.ERROR)

In [60]:
from fastcore.test import test_eq

from statsforecast.models import (
    adida,
    auto_arima,
    croston_classic,
    croston_optimized,
    croston_sba,
    historic_average,
    imapa,
    naive,
    random_walk_with_drift,
    seasonal_exponential_smoothing,
    seasonal_naive,
    seasonal_window_average,
    ses,
    tsb,
    window_average,
)
from statsforecast.utils import generate_series

In [189]:
#exporti
class GroupedArray:
    """Values of many groups stored back to back in a single array.

    Group ``i`` occupies ``data[indptr[i]:indptr[i + 1]]``.

    Parameters
    ----------
    data : np.ndarray
        Concatenated values of every group.
    indptr : np.ndarray
        Group boundaries; it holds one more element than there are groups.
    pid : int, default 0
        Identifier of this chunk, used to label and position its progress bar.
    """

    def __init__(self, data, indptr, pid=0):
        self.data = data
        self.indptr = indptr
        self.n_groups = self.indptr.size - 1
        self.pid = pid

    def __getitem__(self, idx):
        """Return one group (integer index) or a contiguous run of groups (slice).

        Parameters
        ----------
        idx : int, np.integer or slice
            Integers may be negative (counted from the end). Slices must have
            a step of ``None`` or ``1``; open and negative bounds are allowed.

        Returns
        -------
        np.ndarray
            For an integer: a view of the values of that group.
        tuple of (np.ndarray, np.ndarray)
            For a slice: copies of the selected values and of their
            boundaries, the latter shifted so they start at zero.

        Raises
        ------
        IndexError
            If an integer index falls outside ``[-n_groups, n_groups)``.
        ValueError
            If ``idx`` is neither an integer nor a slice, or the slice has a
            step other than 1.
        """
        if isinstance(idx, (int, np.integer)):
            if idx < 0:
                idx += self.n_groups
            # An explicit bounds check keeps negative indices from wrapping around
            # `indptr` and also terminates iteration over the groups.
            if not 0 <= idx < self.n_groups:
                raise IndexError(f'group index out of range for {self.n_groups} groups')
            return self.data[self.indptr[idx] : self.indptr[idx + 1]]
        elif isinstance(idx, slice):
            if idx.step not in (None, 1):
                raise ValueError(f'slices with a step are not supported, got step={idx.step}')
            # Resolve open/negative bounds against the number of groups, not the
            # (one element longer) boundaries array.
            start, stop, _ = idx.indices(self.n_groups)
            stop = max(start, stop)
            # `stop + 1` because n groups are delimited by n + 1 boundaries.
            new_indptr = self.indptr[start : stop + 1].copy()
            new_data = self.data[new_indptr[0] : new_indptr[-1]].copy()
            new_indptr -= new_indptr[0]
            return new_data, new_indptr
        raise ValueError(f'idx must be either int or slice, got {type(idx)}')

    def __iter__(self):
        """Yield the values of every group in order."""
        for i in range(self.n_groups):
            yield self[i]

    def __len__(self):
        """Number of groups."""
        return self.n_groups

    def __repr__(self):
        return f'GroupedArray(n_data={self.data.size:,}, n_groups={self.n_groups:,})'

    def __eq__(self, other):
        """Two grouped arrays are equal when values are close and boundaries match."""
        if not hasattr(other, 'data') or not hasattr(other, 'indptr'):
            return False
        # Different shapes would make `np.allclose` either raise or broadcast.
        if np.shape(self.data) != np.shape(other.data):
            return False
        return np.allclose(self.data, other.data) and np.array_equal(self.indptr, other.indptr)

    def compute_forecasts(self, h, func, xreg=None, level=None, *args):
        """Apply ``func`` to every group and collect ``h`` forecasts per group.

        Parameters
        ----------
        h : int
            Forecast horizon.
        func : callable
            Called as ``func(group, h, group_xreg, *args)``. When its signature
            has a ``level`` parameter and ``level`` is given, it also receives
            ``level=level`` and must return a mapping of equally long arrays.
        xreg : optional
            Indexable by group position; ``xreg[i]`` is handed to ``func``
            together with group ``i``. ``None`` is passed when omitted.
        level : optional
            Interval levels; only used when ``func`` accepts ``level``.
        *args
            Extra positional arguments forwarded to ``func``.

        Returns
        -------
        out : np.ndarray
            float32 array with ``h * n_groups`` rows; it has
            ``2 * len(level) + 1`` columns when levels are used and is
            one-dimensional otherwise.
        keys : list or None
            Keys of the mapping returned by ``func`` (column order of ``out``),
            or ``None`` when levels are not used.
        """
        # Resolve the display name before `func` may be wrapped below:
        # `functools.partial` objects do not expose `__name__`.
        func_name = getattr(func, '__name__', type(func).__name__)
        has_level = 'level' in inspect.signature(func).parameters and level is not None
        if has_level:
            out = np.full((h * self.n_groups, 2 * len(level) + 1), np.nan, dtype=np.float32)
            func = partial(func, level=level)
        else:
            out = np.full(h * self.n_groups, np.nan, dtype=np.float32)
        xr = None
        keys = None
        tqdm_text = "Process #" + f"{self.pid}".zfill(3) + f"{func_name}".center(70, '-')
        # NOTE(review): presumably a workaround so the bar renders when this runs
        # in a worker process; confirm before removing.
        print(' ', end='', flush=True)
        with notebook.tqdm(total=len(self), desc=tqdm_text, position=self.pid+1) as pbar:
            for i, grp in enumerate(self):
                if xreg is not None:
                    xr = xreg[i]
                res = func(grp, h, xr, *args)
                if has_level:
                    # The first result fixes the column order for all groups.
                    if keys is None:
                        keys = list(res.keys())
                    for j, key in enumerate(keys):
                        out[h * i : h * (i + 1), j] = res[key]
                else:
                    out[h * i : h * (i + 1)] = res
                pbar.update(1)
        return out, keys

    def split(self, n_chunks):
        """Split the groups into at most ``n_chunks`` contiguous grouped arrays.

        Empty chunks (more chunks than groups) are dropped; each remaining chunk
        keeps its position in the split as ``pid``.
        """
        chunks = np.array_split(range(self.n_groups), n_chunks)
        return [
            GroupedArray(*self[chunk[0] : chunk[-1] + 1], pid)
            for pid, chunk in enumerate(chunks)
            if chunk.size
        ]

In [190]:
#hide
# Three groups of four consecutive values each: [0..3], [4..7] and [8..11].
data = np.arange(12)
indptr = np.array([0, 4, 8, 12])
ga = GroupedArray(data, indptr)

test_eq(len(ga), 3)
# The expected array repeats the last value of every group twice (h=2).
np.testing.assert_equal(
    ga.compute_forecasts(2, naive)[0],
    np.hstack([2 * [data[i]] for i in indptr[1:] - 1]),
)
# Splitting in two puts the first two groups in one chunk and the third in the other,
# with the second chunk's boundaries starting again at zero.
splits = ga.split(2)
test_eq(splits[0], GroupedArray(data[:8], indptr[:3]))
test_eq(splits[1], GroupedArray(data[8:], np.array([0, 4])))

 

Process #000--------------------------------naive---------------------------------:   0%|          | 0/3 [00:0…

In [191]:
#exporti
def _grouped_array_from_df(df):
    df = df.set_index('ds', append=True)
    if not df.index.is_monotonic_increasing:
        df = df.sort_index()
    data = df.values.astype(np.float32)
    indices_sizes = df.index.get_level_values('unique_id').value_counts(sort=False)
    indices = indices_sizes.index
    sizes = indices_sizes.values
    cum_sizes = sizes.cumsum()
    dates = df.index.get_level_values('ds')[cum_sizes - 1]
    indptr = np.append(0, cum_sizes).astype(np.int32)
    return GroupedArray(data, indptr), indices, dates

In [192]:
#hide
# Shuffle the rows to check that the conversion restores the (unique_id, ds) ordering.
series = generate_series(10_000, n_static_features=2, equal_ends=False)
sorted_series = series.sort_values(['unique_id', 'ds'])
unsorted_series = sorted_series.sample(frac=1.0)
ga, indices, dates = _grouped_array_from_df(unsorted_series)

np.testing.assert_allclose(ga.data, sorted_series.drop(columns='ds').values)
test_eq(indices, sorted_series.index.unique(level='unique_id'))
# Every returned date must be the latest `ds` of its series.
test_eq(dates, series.groupby('unique_id')['ds'].max().values)

In [193]:
#exporti
def _build_forecast_name(model, *args) -> str:
    model_name = f'{model.__name__}'
    func_params = inspect.signature(model).parameters
    func_args = list(func_params.items())[3:]  # remove input array, horizon and xreg
    changed_params = [
        f'{name}-{value}'
        for value, (name, arg) in zip(args, func_args)
        if arg.default != value
    ]
    if changed_params:
        model_name += '_' + '_'.join(changed_params)
    return model_name

In [194]:
#hide
# Arguments that differ from the model's defaults are appended as `<name>-<value>`.
test_eq(_build_forecast_name(ses, 0.1), 'ses_alpha-0.1')
test_eq(_build_forecast_name(seasonal_window_average, 7, 4), 'seasonal_window_average_season_length-7_window_size-4')

In [195]:
#exporti
def _as_tuple(x):
    if isinstance(x, tuple):
        return x
    return (x,)

In [196]:
#hide
# Tuples pass through untouched; any other value is wrapped in a 1-tuple.
test_eq(_as_tuple((1,)), (1,))
test_eq(_as_tuple(1), (1,))

In [197]:
#export
class StatsForecast:
    """Run a collection of forecasting functions over every series of a frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by ``unique_id`` and holding a ``ds`` column; the remaining
        columns are the values handed to the models.
    models : list
        Each entry is either a model function or a tuple
        ``(function, *extra_args)``.
    freq : str or pandas offset
        Converted with ``pd.tseries.frequencies.to_offset``; used to build the
        future datestamps when ``ds`` is not an integer.
    n_jobs : int, default 1
        ``1`` computes in the current process; any other value distributes the
        series over a process pool of that size.
    """

    def __init__(self, df, models, freq, n_jobs=1):
        grouped, unique_ids, final_dates = _grouped_array_from_df(df)
        self.ga = grouped
        self.uids = unique_ids
        self.last_dates = final_dates
        self.models = models
        self.freq = pd.tseries.frequencies.to_offset(freq)
        self.n_jobs = n_jobs

    def forecast(self, h, xreg=None, level=None):
        """Forecast ``h`` steps ahead for every series with every model.

        Parameters
        ----------
        h : int
            Forecast horizon.
        xreg : pandas.DataFrame, optional
            Future values of the external regressors; must have
            ``h * n_series`` rows and as many columns as the training values.
        level : optional
            Interval levels forwarded to the models.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``unique_id`` with a ``ds`` column followed by one
            column per model output.

        Raises
        ------
        ValueError
            If ``xreg`` does not have the expected shape.
        """
        if xreg is not None:
            expected_shape = (h * len(self.ga), self.ga.data.shape[1])
            if xreg.shape != expected_shape:
                raise ValueError(f'Expected xreg to have shape {expected_shape}, but got {xreg.shape}')
            xreg, _unused_ids, _unused_dates = _grouped_array_from_df(xreg)

        compute = self._sequential_forecast if self.n_jobs == 1 else self._data_parallel_forecast
        fcsts = compute(h, xreg, level)

        stamp_dtype = self.last_dates.dtype
        if issubclass(stamp_dtype.type, np.integer):
            # Integer stamps simply continue counting after the last observation.
            future_stamps = [
                np.arange(last + 1, last + 1 + h, dtype=stamp_dtype)
                for last in self.last_dates
            ]
        else:
            future_stamps = [
                pd.date_range(last + self.freq, periods=h, freq=self.freq)
                for last in self.last_dates
            ]
        columns = {'ds': np.hstack(future_stamps)}
        columns.update(fcsts)
        index = pd.Index(np.repeat(self.uids, h), name='unique_id')
        return pd.DataFrame(columns, index=index)

    def _sequential_forecast(self, h, xreg, level):
        """Compute every model in the current process; returns ``{column: values}``."""
        results = {}
        logger.info('Computing forecasts')
        for spec in self.models:
            model, *extra = _as_tuple(spec)
            name = _build_forecast_name(model, *extra)
            values, keys = self.ga.compute_forecasts(h, model, xreg, level, *extra)
            if keys is None:
                results[name] = values
            else:
                # One output column per key returned by the model.
                results.update(
                    {f'{name}_{key}': values[:, col] for col, key in enumerate(keys)}
                )
            logger.info(f'Computed forecasts for {name}.')
        return results

    def _data_parallel_forecast(self, h, xreg, level):
        """Compute every model over chunks of series in a process pool."""
        results = {}
        logger.info('Computing forecasts')
        chunks = self.ga.split(self.n_jobs)
        if xreg is None:
            from itertools import repeat

            # Every chunk is paired with `None` when there are no regressors.
            xreg_chunks = repeat(None)
        else:
            xreg_chunks = xreg.split(self.n_jobs)
        with ProcessPoolExecutor(self.n_jobs) as pool:
            for spec in self.models:
                model, *extra = _as_tuple(spec)
                name = _build_forecast_name(model, *extra)
                pending = [
                    pool.submit(chunk.compute_forecasts, h, model, xreg_chunk, level, *extra)
                    for chunk, xreg_chunk in zip(chunks, xreg_chunks)
                ]
                values, keys = zip(*(job.result() for job in pending))
                # Every chunk reports the same keys; the first one is used.
                keys = keys[0]
                if keys is None:
                    results[name] = np.hstack(values)
                else:
                    stacked = np.vstack(values)
                    results.update(
                        {f'{name}_{key}': stacked[:, col] for col, key in enumerate(keys)}
                    )
                logger.info(f'Computed forecasts for {name}.')
        return results

## Daily data

In [198]:
# Models that take extra arguments are given as tuples: (function, *arguments).
fcst = StatsForecast(
    series,
    [adida, croston_classic, croston_optimized,
     croston_sba, historic_average, imapa, naive, 
     random_walk_with_drift, (seasonal_exponential_smoothing, 7, 0.1),
     (seasonal_naive, 7), (seasonal_window_average, 7, 4),
     (ses, 0.1), (tsb, 0.1, 0.3), (window_average, 4)],
    freq='D',
)
# Fourteen steps ahead for every series and every model.
res = fcst.forecast(14)
res

 

Process #000--------------------------------adida---------------------------------:   0%|          | 0/10000 […

 

Process #000---------------------------croston_classic----------------------------:   0%|          | 0/10000 […

 

Process #000--------------------------croston_optimized---------------------------:   0%|          | 0/10000 […

 

Process #000-----------------------------croston_sba------------------------------:   0%|          | 0/10000 […

 

Process #000---------------------------historic_average---------------------------:   0%|          | 0/10000 […

 

Process #000--------------------------------imapa---------------------------------:   0%|          | 0/10000 […

 

Process #000--------------------------------naive---------------------------------:   0%|          | 0/10000 […

 

Process #000------------------------random_walk_with_drift------------------------:   0%|          | 0/10000 […

 

Process #000--------------------seasonal_exponential_smoothing--------------------:   0%|          | 0/10000 […

 

Process #000----------------------------seasonal_naive----------------------------:   0%|          | 0/10000 […

 

Process #000-----------------------seasonal_window_average------------------------:   0%|          | 0/10000 […

 

Process #000---------------------------------ses----------------------------------:   0%|          | 0/10000 […

 

Process #000---------------------------------tsb----------------------------------:   0%|          | 0/10000 […

 

Process #000----------------------------window_average----------------------------:   0%|          | 0/10000 […

Unnamed: 0_level_0,ds,adida,croston_classic,croston_optimized,croston_sba,historic_average,imapa,naive,random_walk_with_drift,seasonal_exponential_smoothing_season_length-7_alpha-0.1,seasonal_naive_season_length-7,seasonal_window_average_season_length-7_window_size-4,ses_alpha-0.1,tsb_alpha_d-0.1_alpha_p-0.3,window_average_window_size-4
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,2000-08-10,157.559219,157.559219,157.559219,149.681259,161.040253,157.559219,215.188721,216.134415,11.505998,270.386292,258.436707,157.559219,157.559219,134.071350
0,2000-08-11,157.559219,157.559219,157.559219,149.681259,161.040253,157.559219,215.188721,217.080093,65.213455,316.671570,315.926270,157.559219,157.559219,134.071350
0,2000-08-12,157.559219,157.559219,157.559219,149.681259,161.040253,157.559219,215.188721,218.025787,109.913399,15.088776,11.100096,157.559219,157.559219,134.071350
0,2000-08-13,157.559219,157.559219,157.559219,149.681259,161.040253,157.559219,215.188721,218.971466,158.784393,67.045593,63.376709,157.559219,157.559219,134.071350
0,2000-08-14,157.559219,157.559219,157.559219,149.681259,161.040253,157.559219,215.188721,219.917160,210.499176,101.606117,111.795639,157.559219,157.559219,134.071350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999,2000-06-27,87.646744,87.646744,87.646744,83.264404,78.274399,87.646744,149.732315,149.786240,27.759584,56.131752,52.532112,87.646744,87.646744,114.524994
9999,2000-06-28,87.646744,87.646744,87.646744,83.264404,78.274399,87.646744,149.732315,149.791641,53.950005,81.664482,78.404900,87.646744,87.646744,114.524994
9999,2000-06-29,87.646744,87.646744,87.646744,83.264404,78.274399,87.646744,149.732315,149.797028,78.058998,101.653671,101.620743,87.646744,87.646744,114.524994
9999,2000-06-30,87.646744,87.646744,87.646744,83.264404,78.274399,87.646744,149.732315,149.802429,101.626221,125.049500,125.748337,87.646744,87.646744,114.524994


In [199]:
#hide
# The result keeps one block per series, in the same order as `fcst.uids`.
test_eq(res.index.unique(), fcst.uids)
last_dates = series.groupby('unique_id')['ds'].max()
# Forecast stamps must span from one day to fourteen days after each series' last date.
test_eq(res.groupby('unique_id')['ds'].min().values, last_dates + pd.offsets.Day())
test_eq(res.groupby('unique_id')['ds'].max().values, last_dates + 14 * pd.offsets.Day())

### Parallel

In [200]:
# `IN_NOTEBOOK` comes from nbdev when it is installed; without it, assume a plain script run.
# Only a failed import is tolerated: any other error should surface instead of being swallowed.
try:
    from nbdev.imports import IN_NOTEBOOK
except ImportError:
    IN_NOTEBOOK = False

# The two-process run only happens when executed as a script, never from inside a notebook.
if __name__=="__main__" and not IN_NOTEBOOK:
    fcst = StatsForecast(
        series,
        [adida, (ses, 0.1), historic_average, croston_classic],
        freq='D',
        n_jobs=2
    )
    res = fcst.forecast(14)
    print(res)

In [201]:
# Same models as the script-only cell above, here spread over eight worker processes.
fcst = StatsForecast(
    series,
    [adida, (ses, 0.1), historic_average, croston_classic],
    freq='D',
    n_jobs=8
)
res = fcst.forecast(14)
#print(res)

    

Process #000--------------------------------adida---------------------------------:   0%|          | 0/1250 [0…

 

Process #001--------------------------------adida---------------------------------:   0%|          | 0/1250 [0…

Process #002--------------------------------adida---------------------------------:   0%|          | 0/1250 [0…

 

Process #003--------------------------------adida---------------------------------:   0%|          | 0/1250 [0…

 

Process #004--------------------------------adida---------------------------------:   0%|          | 0/1250 [0…

Process #005--------------------------------adida---------------------------------:   0%|          | 0/1250 [0…

 

Process #006--------------------------------adida---------------------------------:   0%|          | 0/1250 [0…

Process #007--------------------------------adida---------------------------------:   0%|          | 0/1250 [0…

   

Process #000---------------------------------ses----------------------------------:   0%|          | 0/1250 [0…

 

Process #001---------------------------------ses----------------------------------:   0%|          | 0/1250 [0…

  

Process #002---------------------------------ses----------------------------------:   0%|          | 0/1250 [0…

Process #004---------------------------------ses----------------------------------:   0%|          | 0/1250 [0…

 

Process #003---------------------------------ses----------------------------------:   0%|          | 0/1250 [0…

Process #005---------------------------------ses----------------------------------:   0%|          | 0/1250 [0…

Process #006---------------------------------ses----------------------------------:   0%|          | 0/1250 [0…

 

Process #007---------------------------------ses----------------------------------:   0%|          | 0/1250 [0…

   

Process #000---------------------------historic_average---------------------------:   0%|          | 0/1250 [0…

 

Process #001---------------------------historic_average---------------------------:   0%|          | 0/1250 [0…

 

Process #002---------------------------historic_average---------------------------:   0%|          | 0/1250 [0…

Process #003---------------------------historic_average---------------------------:   0%|          | 0/1250 [0…

 

Process #004---------------------------historic_average---------------------------:   0%|          | 0/1250 [0…

 

Process #005---------------------------historic_average---------------------------:   0%|          | 0/1250 [0…

 

Process #006---------------------------historic_average---------------------------:   0%|          | 0/1250 [0…

Process #007---------------------------historic_average---------------------------:   0%|          | 0/1250 [0…

   

Process #000---------------------------croston_classic----------------------------:   0%|          | 0/1250 [0…

  

Process #001---------------------------croston_classic----------------------------:   0%|          | 0/1250 [0…

Process #002---------------------------croston_classic----------------------------:   0%|          | 0/1250 [0…

 

Process #003---------------------------croston_classic----------------------------:   0%|          | 0/1250 [0…

 

Process #004---------------------------croston_classic----------------------------:   0%|          | 0/1250 [0…

 

Process #005---------------------------croston_classic----------------------------:   0%|          | 0/1250 [0…

Process #006---------------------------croston_classic----------------------------:   0%|          | 0/1250 [0…

Process #007---------------------------croston_classic----------------------------:   0%|          | 0/1250 [0…

## Monthly data

In [None]:
# Series generated with monthly ('M') frequency.
monthly_series = generate_series(10_000, freq='M', min_length=10, max_length=20, equal_ends=True)
monthly_series

In [None]:
fcst = StatsForecast(
    monthly_series,
    [adida, (ses, 0.1), historic_average, croston_classic],
    freq='M'
)
# `%time` reports how long the four-step-ahead forecast takes.
%time monthly_res = fcst.forecast(4)
monthly_res

In [None]:
# Inspect the values held by the second of two chunks.
fcst.ga.split(2)[1].data

In [None]:
#hide
# Expected stamps are derived from the input frame itself (as in the daily test),
# not from the state stored inside `fcst`.
last_dates = monthly_series.groupby('unique_id')['ds'].max()
test_eq(monthly_res.groupby('unique_id')['ds'].min().values, last_dates + pd.offsets.MonthEnd())
test_eq(monthly_res.groupby('unique_id')['ds'].max().values, last_dates + 4 * pd.offsets.MonthEnd())

## Integer datestamp

In [None]:
from statsforecast.utils import AirPassengers as ap

In [None]:
# Single series whose `ds` column holds consecutive integers (1..len(ap)) instead of dates.
int_ds_df = pd.DataFrame({'ds': np.arange(1, len(ap) + 1), 'y': ap})
int_ds_df.insert(0, 'unique_id', 'AirPassengers')
int_ds_df.set_index('unique_id', inplace=True)
int_ds_df.head()

In [None]:
# Last rows of the integer-stamped series.
int_ds_df.tail()

In [None]:
# With an integer `ds`, `forecast` builds the future stamps by counting on from the last one.
fcst = StatsForecast(int_ds_df, models=[historic_average], freq='D')
horizon = 7
forecast = fcst.forecast(horizon)
forecast.head()

In [None]:
# The forecast stamps must continue right after the last observed integer.
last_date = int_ds_df['ds'].max()
test_eq(forecast['ds'].values, np.arange(last_date + 1, last_date + 1 + horizon))

## External regressors

Every column after **y** is considered an external regressor and will be passed to the models that allow them. If you use them, you must supply their future values to the `forecast` method through the `xreg` argument.

In [None]:
def linear_regression(X, h, future_xreg):
    """Least-squares fit of the first column of ``X`` on its remaining columns.

    ``h`` is accepted for signature compatibility and is not used. Returns the
    fitted coefficients applied to ``future_xreg``.
    """
    target, regressors = X[:, 0], X[:, 1:]
    solution = np.linalg.lstsq(regressors, target, rcond=None)
    return future_xreg @ solution[0]

In [None]:
# Build a separate frame for the regression example. `series` from the daily example is
# no longer rebound here, so the columns added below cannot leak into it.
series_xreg = generate_series(10_000, equal_ends=True)
series_xreg['intercept'] = 1
series_xreg['dayofweek'] = series_xreg['ds'].dt.dayofweek
# One indicator column per weekday, dropping the first to avoid duplicating the intercept.
series_xreg = pd.get_dummies(series_xreg, columns=['dayofweek'], drop_first=True)
series_xreg

In [None]:
# Hold out the last 14 distinct dates for validation; everything earlier is training data.
dates = sorted(series_xreg['ds'].unique())
valid_start = dates[-14]
train_mask = series_xreg['ds'] < valid_start
series_train = series_xreg[train_mask]
series_valid = series_xreg[~train_mask]
# Future regressors: the validation rows without the target column.
X_valid = series_valid.drop(columns=['y'])
fcst = StatsForecast(
    series_train,
    [linear_regression],
    freq='D',
)
%time xreg_res = fcst.forecast(14, xreg=X_valid)
# Attach the held-out target next to the predictions.
xreg_res['y'] = series_valid['y'].values

In [None]:
# Plot the per-date mean of every column.
xreg_res.groupby('ds').mean().plot();

## Confidence intervals

In [None]:
# AirPassengers as a single series (id 0) with integer stamps.
ap_df = pd.DataFrame({'ds': np.arange(ap.size), 'y': ap}, index=pd.Index([0] * ap.size, name='unique_id'))
fcst = StatsForecast(
    ap_df,
    [(seasonal_naive, 12), (auto_arima, 12)],
    freq='M',
)
# Request intervals at levels 80 and 95.
ap_ci = fcst.forecast(12, level=(80, 95))
ap_ci.set_index('ds').plot(marker='.', figsize=(10, 6));

## n jobs

In [None]:
# Two copies of AirPassengers stored as series 0 and 1.
# NOTE(review): `ap_df_2` is not referenced by any later cell visible here; confirm whether
# the n_jobs check below was meant to use it.
ap_df_2 = pd.DataFrame(
    {'ds': np.hstack([np.arange(ap.size), np.arange(ap.size)]), 
     'y': np.hstack([ap, ap])}, 
    index=pd.Index([0] * ap.size + [1] * ap.size, name='unique_id')
)

In [None]:
# Script-only run: `n_jobs=101` requests far more workers than there are series (one).
if __name__=="__main__" and not IN_NOTEBOOK:
    ap_df = pd.DataFrame({'ds': np.arange(ap.size), 'y': ap}, index=pd.Index([0] * ap.size, name='unique_id'))
    fcst = StatsForecast(
        ap_df,
        [(seasonal_naive, 12), (auto_arima, 12)],
        freq='M',
        n_jobs=101
    )
    ap_ci = fcst.forecast(12, level=(80, 95))
    ap_ci.set_index('ds').plot(marker='.', figsize=(10, 6))

In [None]:
#hide
def return_xreg(X, h, xreg, *args):
    """Test helper: hand back the first column of ``xreg`` as the forecast."""
    first_regressor = xreg[:, 0]
    return first_regressor

In [None]:
#hide
# Two series of ten integer-stamped rows each; `x` is the single external regressor.
df = pd.DataFrame(
    {
        'ds': np.hstack([np.arange(10), np.arange(10)]),
        'y': np.random.rand(20),
        'x': np.arange(20, dtype=np.float32),
    },
    index=pd.Index([0] * 10 + [1] * 10, name='unique_id'),
)
# First six stamps of every series for training, the remaining four for testing.
train_mask = df['ds'] < 6
train_df = df[train_mask]
test_df = df[~ train_mask]

In [None]:
#hide
# `return_xreg` echoes the regressor it receives, so the forecast must equal the
# future `x` values passed in (single-process path).
fcst = StatsForecast(
    train_df,
    [return_xreg],
    freq='M',
    n_jobs=1,
)
xreg = test_df.drop(columns='y')
res = fcst.forecast(4, xreg=xreg)
expected_res = xreg.rename(columns={'x': 'return_xreg'})
pd.testing.assert_frame_equal(res, expected_res, check_dtype=False)

In [None]:
#hide
# Same check as the single-process cell, run with two processes; script-only.
if __name__=="__main__" and not IN_NOTEBOOK:
    fcst = StatsForecast(
        train_df,
        [return_xreg],
        freq='M',
        n_jobs=2,
    )
    xreg = test_df.drop(columns='y')
    res = fcst.forecast(4, xreg=xreg)
    expected_res = xreg.rename(columns={'x': 'return_xreg'})
    pd.testing.assert_frame_equal(res, expected_res, check_dtype=False)