In [1]:
#default_exp core

In [2]:
#hide
%load_ext autoreload
%autoreload 2

# Core

In [3]:
#hide
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

from nbdev.showdoc import add_docs, show_doc
from statsforecast.models import naive, ses, seasonal_window_average

In [4]:
#export
import inspect
import logging
from copy import deepcopy
from functools import partial
from os import cpu_count
from typing import Any, Callable, List, Optional, Tuple

import numpy as np
import pandas as pd

In [5]:
#exporti
# Configure the root logging format (timestamp, logger name, level, message)
# and create the module-level logger used by the functions in this file.
logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger(__name__)

In [6]:
#hide
logger.setLevel(logging.ERROR)

In [7]:
#hide
from fastcore.test import test_eq, test_fail
from statsforecast.utils import generate_series

In [8]:
#exporti
class GroupedArray:
    """Several series stored back to back in one array, delimited by index pointers.

    `data` holds the values of every group concatenated along the first axis and
    `indptr` holds the group boundaries: group `i` is `data[indptr[i]:indptr[i + 1]]`,
    so `indptr` has one more entry than there are groups.

    When a group is 2-D, the modelling methods use column 0 as the target `y`
    and the remaining columns (if any) as the exogenous matrix `X`.
    """

    def __init__(self, data, indptr):
        self.data = data
        self.indptr = indptr
        self.n_groups = self.indptr.size - 1

    def __getitem__(self, idx):
        """Return one group for an integer `idx`, or a new `GroupedArray` for a slice.

        Integers may be Python or numpy integers and may be negative. Slices are
        over groups, may be open ended or negative, and must be contiguous
        (step 1); the result owns copies of the selected data and pointers.

        Raises `IndexError` for an out-of-range integer and `ValueError` for a
        stepped slice or any other index type.
        """
        if isinstance(idx, (int, np.integer)):
            if idx < 0:
                idx += self.n_groups
            # The class defines no `__iter__`: `for grp in self` asks for
            # positions 0, 1, 2, ... and stops at the first IndexError, so an
            # out-of-range position has to raise exactly that.
            if not 0 <= idx < self.n_groups:
                raise IndexError(f'group index out of range for {self.n_groups} groups')
            return self.data[self.indptr[idx] : self.indptr[idx + 1]]
        elif isinstance(idx, slice):
            # normalise None / negative bounds against the number of groups
            start, stop, step = idx.indices(self.n_groups)
            if step != 1:
                # a stepped slice of `indptr` would silently merge the skipped groups
                raise ValueError(f'only contiguous slices are supported, got step {step}')
            stop = max(start, stop)
            # groups start..stop-1 need pointers start..stop (one extra)
            new_indptr = self.indptr[start : stop + 1].copy()
            new_data = self.data[new_indptr[0] : new_indptr[-1]].copy()
            new_indptr -= new_indptr[0]
            return GroupedArray(new_data, new_indptr)
        raise ValueError(f'idx must be either int or slice, got {type(idx)}')

    def __len__(self):
        return self.n_groups

    def __repr__(self):
        return f'GroupedArray(n_data={self.data.size:,}, n_groups={self.n_groups:,})'

    def __eq__(self, other):
        """Two grouped arrays are equal when their data is close and their pointers match."""
        if not hasattr(other, 'data') or not hasattr(other, 'indptr'):
            return False
        # `np.allclose` broadcasts, so mismatched shapes would raise (or match
        # by accident) instead of comparing unequal
        if np.shape(self.data) != np.shape(other.data):
            return False
        return np.allclose(self.data, other.data) and np.array_equal(self.indptr, other.indptr)

    @staticmethod
    def _split_target(values):
        """Split a window into `(y, X)`: column 0 and the remaining columns (or None)."""
        if values.ndim != 2:
            return values, None
        X = values[:, 1:] if values.shape[1] > 1 else None
        return values[:, 0], X

    @staticmethod
    def _takes_level(fm, method, level):
        """Whether `level` was requested and `fm[0].<method>` has a `level` parameter."""
        if not len(level) or not len(fm):
            return False
        return 'level' in inspect.signature(getattr(fm[0], method)).parameters

    def fit(self, model):
        """Fit an independent deep copy of `model` on every group.

        Returns an object array with one fitted model per group.
        """
        fm = np.full(self.n_groups, np.nan, dtype=object)
        for i, grp in enumerate(self):
            y, X = self._split_target(grp)
            fm[i] = deepcopy(model).fit(y=y, X=X)
        return fm

    def predict(self, fm, h, X=None, level=tuple()):
        """Forecast `h` steps ahead for every group.

        `fm` are the fitted models, one per group; only `fm[0]` is inspected to
        decide whether `level` can be passed on, so all models are expected to
        be of the same class. `X`, when given, is indexed per group (`X[i]`).

        Returns a float32 array of shape `(h * n_groups, 2 * len(level) + 1)`.
        When the models do not accept `level`, only the first column is filled
        and the rest stays NaN.
        """
        has_level = self._takes_level(fm, 'predict', level)
        kwargs = {'level': level} if has_level else {}
        n_cols = 2 * len(level) + 1
        n_filled = n_cols if has_level else 1
        fcsts = np.full((h * self.n_groups, n_cols), np.nan, dtype=np.float32)
        for i in range(self.n_groups):
            X_ = X[i] if X is not None else None
            fcsts[h * i : h * (i + 1), :n_filled] = fm[i].predict(h=h, X=X_, **kwargs)
        return fcsts

    def predict_in_sample(self, fm, level=tuple()):
        """Collect the in-sample predictions of every fitted model.

        Returns a float32 array with one row per row of `data` and
        `2 * len(level) + 1` columns, filled with the same rule as `predict`.
        """
        has_level = self._takes_level(fm, 'predict_in_sample', level)
        kwargs = {'level': level} if has_level else {}
        n_cols = 2 * len(level) + 1
        n_filled = n_cols if has_level else 1
        in_sample = np.full((self.data.shape[0], n_cols), np.nan, dtype=np.float32)
        for i in range(self.n_groups):
            in_sample[self.indptr[i] : self.indptr[i + 1], :n_filled] = fm[i].predict_in_sample(**kwargs)
        return in_sample

    def fit_predict(self, model, h, X=None, level=tuple()):
        """Fit `model` on every group and forecast; returns `(fitted_models, forecasts)`."""
        fm = self.fit(model=model)
        fcsts = self.predict(fm=fm, h=h, X=X, level=level)
        return fm, fcsts

    def compute_cv(self, model, h, test_size, step_size=1, input_size=None):
        """Evaluate `model` over sliding windows taken from the end of every group.

        The last `test_size` observations are covered by windows of length `h`
        whose starts are `step_size` apart. Each window is forecast by a model
        trained on the observations before it: all of them, or only the last
        `input_size` when that is given. The same `model` instance is refitted
        for every window.

        Returns a dict with `'forecasts'` and `'y'`, both float32 arrays of
        shape `(n_groups, n_windows, h)`.

        Raises `Exception` when `test_size - h` is not a multiple of `step_size`.
        """
        if (test_size - h) % step_size:
            raise Exception('`test_size - h` should be module `step_size`')
        n_windows = int((test_size - h) / step_size) + 1
        out = np.full((self.n_groups, n_windows, h), np.nan, dtype=np.float32)
        out_test = np.full((self.n_groups, n_windows, h), np.nan, dtype=np.float32)
        for i_ts, grp in enumerate(self):
            # cutoffs are negative offsets counted from the end of the series
            for i_window, cutoff in enumerate(range(-test_size, -h + 1, step_size)):
                end_cutoff = cutoff + h
                train_start = 0 if input_size is None else cutoff - input_size
                train = grp[train_start:cutoff]
                # `grp[cutoff:0]` would be empty, so the last window is open ended
                test = grp[cutoff:] if end_cutoff == 0 else grp[cutoff:end_cutoff]
                # split the 2-D windows themselves: taking X from the already
                # flattened target would always yield None
                y_train, X_train = self._split_target(train)
                y_test, X_test = self._split_target(test)
                out[i_ts, i_window] = model.fit(y=y_train, X=X_train).predict(h=h, X=X_test)
                out_test[i_ts, i_window] = y_test
        result = {'forecasts': out, 'y': out_test}
        return result

    def split(self, n_chunks):
        """Partition the groups into at most `n_chunks` contiguous `GroupedArray`s."""
        return [self[x[0] : x[-1] + 1] for x in np.array_split(range(self.n_groups), n_chunks) if x.size]

In [9]:
#hide
#class to test the core methods
from scipy.stats import norm
class Naive:
    """Last-value forecaster used to exercise the `GroupedArray` methods in the tests."""

    def __init__(self):
        pass

    def fit(self, y: np.ndarray, X: np.ndarray = None):
        """Store the last value of `y` and the one-step-behind fitted values; `X` is ignored."""
        self.last_value = y[-1]
        # the fitted value at t is the observation at t - 1; the first one is undefined
        self.fitted_vals = np.full(y.size, np.nan, np.float32)
        self.fitted_vals[1:] = y[:-1]
        self.errors = y - self.fitted_vals
        return self

    def predict(self, h: int, X: np.ndarray = None, level = tuple()):
        """Repeat the last value `h` times.

        Returns an `(h, 1)` array, or `(h, 1 + 2 * len(level))` when `level` is
        given: the point forecast, then every lower bound, then every upper bound.
        """
        preds = np.repeat(self.last_value, h)[:, None]
        if len(level):
            # residual standard deviation: root of the mean squared error, so the
            # interval half-width is in the units of `y` (the first error is NaN)
            sigma = np.sqrt(np.mean(self.errors[1:] ** 2))
            quantiles = norm.ppf(0.5 * (1 + np.asarray(level) / 100))
            # (n_levels,) broadcasts against the (h, 1) point forecasts
            spread = quantiles * sigma
            preds = np.hstack([preds, preds - spread, preds + spread])
        return preds

    def predict_in_sample(self, level = tuple()):
        """Return the fitted values as a column; `level` is accepted but unused."""
        return self.fitted_vals[:, None]

In [10]:
#hide
# data used for tests: three series of four observations each,
# stored back to back and delimited by `indptr`
data = np.arange(12)
indptr = np.array([0, 4, 8, 12])

# test we can recover the
# number of series
ga = GroupedArray(data, indptr)
test_eq(len(ga), 3)

# test splits of data: two chunks, the first holding two series
# and the second the remaining one (with pointers rebased to 0)
splits = ga.split(2)
test_eq(splits[0], GroupedArray(data[:8], indptr[:3]))
test_eq(splits[1], GroupedArray(data[8:], np.array([0, 4])))

# fitting models for each ts
model = Naive()
fitted_models = ga.fit(model=model)
test_eq(len(fitted_models), 3)

# test forecasts: the last value of each series repeated h=2 times
exp_fcsts = np.hstack([2 * [data[i]] for i in indptr[1:] - 1])
np.testing.assert_equal(
    ga.predict(fm=fitted_models, h=2),
    exp_fcsts[:, None],
)

# test expected fitted_values: each series shifted by one step
# (consecutive values differ by 1), with NaN in the first position
exp_fitted = np.array([np.nan, 1., 1., 1.])
exp_fitted = np.hstack([ga[i] - exp_fitted for i in range(3)])
np.testing.assert_equal(
    ga.predict_in_sample(fm=fitted_models),
    exp_fitted[:, None],
)

# test fit and predict pipeline
fitted_models, fcsts = ga.fit_predict(model=model, h=2) 
test_eq(len(fitted_models), 3)
np.testing.assert_equal(exp_fcsts[:, None], fcsts)

# test levels: smoke test only, the call must run but nothing is asserted
fitted_models, fcsts = ga.fit_predict(model=model, h=2, level=(50, 90))

In [11]:
#hide
# tests for cross valiation
data = np.hstack([np.arange(10), np.arange(100, 200), np.arange(20, 40)])
indptr = np.array([0, 10, 110, 130])
ga = GroupedArray(data, indptr)

# sum ahead just returns the last value
# added with h future values 
class SumAhead:
    """Toy model for the cross validation tests.

    Forecasts `last_value + 1, ..., last_value + h`, which matches the truth
    exactly on series that grow by one each step.
    """

    def __init__(self):
        pass

    def fit(self, y, X):
        """Remember the last observation and build the fitted values; `X` is unused."""
        n_obs = y.size
        fitted = np.full(n_obs, np.nan, np.float32)
        # every position after the first receives the first observation
        # (`y[:1]` broadcasts)
        # NOTE(review): possibly meant `y[:-1]` - confirm
        fitted[1:] = y[:1]
        self.fitted_values = fitted
        self.last_value = y[-1]
        return self

    def predict(self, h, X=None):
        """Return the last observation plus the steps `1..h`; `X` is unused."""
        steps = np.arange(1, h + 1)
        return self.last_value + steps
model = SumAhead()
res_cv = ga.compute_cv(model=model, h=2, test_size=5)
test_eq(res_cv['forecasts'], res_cv['y'])

In [12]:
#hide
actual_step_size = np.unique(np.diff(res_cv['forecasts'], axis=1))
test_eq(actual_step_size, 1)

In [13]:
#hide
horizons = [1, 2, 3, 2]
test_sizes = [3, 4, 6, 6]
step_sizes = [2, 2, 3, 4]
for h, test_size, step_size in zip(horizons, test_sizes, step_sizes):
    res_cv = ga.compute_cv(model=model, h=h, test_size=test_size, step_size=step_size)
    test_eq(res_cv['forecasts'], res_cv['y'])
    actual_step_size = np.unique(np.diff(res_cv['forecasts'], axis=1))
    test_eq(actual_step_size, step_size)
    actual_n_windows = res_cv['forecasts'].shape[1]
    test_eq(actual_n_windows, int((test_size - h)/step_size) + 1)

In [14]:
#hide 
def fail_cv(h, test_size, step_size):
    """Run `compute_cv` on the notebook's `ga` and `model`, so `test_fail` can call it by keyword."""
    cv_args = dict(model=model, h=h, test_size=test_size, step_size=step_size)
    return ga.compute_cv(**cv_args)
test_fail(fail_cv, contains='module', kwargs=dict(h=2, test_size=5, step_size=2))

In [15]:
#exporti
def _grouped_array_from_df(df, sort_df):
    """Convert a frame indexed by `unique_id` with a `ds` column into a `GroupedArray`.

    `ds` is appended to the index and, when `sort_df` is true and the resulting
    index is not already increasing, the frame is sorted by it. All remaining
    columns become the float32 data of the grouped array.

    Returns `(grouped_array, ids, last_dates, index)`: the ids in the order
    their counts were computed, the `ds` value found at the end of each
    group, and the full (`unique_id`, `ds`) index.
    """
    indexed = df.set_index('ds', append=True)
    if sort_df and not indexed.index.is_monotonic_increasing:
        indexed = indexed.sort_index()
    full_index = indexed.index
    # number of rows per series, in unsorted `value_counts` order
    counts = full_index.get_level_values('unique_id').value_counts(sort=False)
    boundaries = counts.values.cumsum()
    # the row right before each boundary is the last row of a series
    last_dates = full_index.get_level_values('ds')[boundaries - 1]
    indptr = np.append(0, boundaries).astype(np.int32)
    values = indexed.values.astype(np.float32)
    return GroupedArray(values, indptr), counts.index, last_dates, full_index

In [16]:
#hide
series = generate_series(10_000, n_static_features=2, equal_ends=False)
sorted_series = series.sort_values(['unique_id', 'ds'])
unsorted_series = sorted_series.sample(frac=1.0)
ga, indices, dates, ds = _grouped_array_from_df(unsorted_series, sort_df=True)

np.testing.assert_allclose(ga.data, sorted_series.drop(columns='ds').values)
test_eq(indices, sorted_series.index.unique(level='unique_id'))
test_eq(dates, series.groupby('unique_id')['ds'].max().values)

In [17]:
#exporti
def _cv_dates(last_dates, freq, h, test_size, step_size=1):
    #assuming step_size = 1
    if (test_size - h) % step_size:
        raise Exception('`test_size - h` should be module `step_size`')
    n_windows = int((test_size - h) / step_size) + 1
    if len(np.unique(last_dates)) == 1:
        if issubclass(last_dates.dtype.type, np.integer):
            total_dates = np.arange(last_dates[0] - test_size + 1, last_dates[0] + 1)
            out = np.empty((h * n_windows, 2), dtype=last_dates.dtype)
            freq = 1
        else:
            total_dates = pd.date_range(end=last_dates[0], periods=test_size, freq=freq)
            out = np.empty((h * n_windows, 2), dtype='datetime64[s]')
        for i_window, cutoff in enumerate(range(-test_size, -h + 1, step_size), start=0):
            end_cutoff = cutoff + h
            out[h * i_window : h * (i_window + 1), 0] = total_dates[cutoff:] if end_cutoff == 0 else total_dates[cutoff:end_cutoff]
            out[h * i_window : h * (i_window + 1), 1] = np.tile(total_dates[cutoff] - freq, h)
        dates = pd.DataFrame(np.tile(out, (len(last_dates), 1)), columns=['ds', 'cutoff'])
    else:
        dates = pd.concat([_cv_dates(np.array([ld]), freq, h, test_size, step_size) for ld in last_dates])
        dates = dates.reset_index(drop=True)
    return dates

In [18]:
#hide
ds_int_cv_test = pd.DataFrame({
    'ds': np.hstack([
        [46, 47, 48],
        [47, 48, 49],
        [48, 49, 50]
    ]),
    'cutoff': [45] * 3 + [46] * 3 + [47] * 3
}, dtype=np.int64)
test_eq(ds_int_cv_test, _cv_dates(np.array([50], dtype=np.int64), 'D', 3, 5))

In [19]:
#hide
ds_int_cv_test = pd.DataFrame({
    'ds': np.hstack([
        [46, 47, 48],
        [48, 49, 50]
    ]),
    'cutoff': [45] * 3 + [47] * 3
}, dtype=np.int64)
test_eq(ds_int_cv_test, _cv_dates(np.array([50], dtype=np.int64), 'D', 3, 5, step_size=2))

In [20]:
#hide
for e_e in [True, False]:
    n_series = 2
    ga, indices, dates, ds = _grouped_array_from_df(generate_series(n_series, equal_ends=e_e), sort_df=True)
    freq = pd.tseries.frequencies.to_offset('D')
    horizon = 3
    test_size = 5
    df_dates = _cv_dates(last_dates=dates, freq=freq, h=horizon, test_size=test_size)
    test_eq(len(df_dates), n_series * horizon * (test_size - horizon + 1)) 

In [None]:
#exporti
def _build_forecast_name(model, *args, idx_remove=4) -> str:
    model_name = f'{model.__name__}'
    func_params = inspect.signature(model).parameters
    func_args = list(func_params.items())[idx_remove:]  # remove input array, horizon and xreg
    changed_params = [
        f'{name}-{value}'
        for value, (name, arg) in zip(args, func_args)
        if arg.default != value
    ]
    if changed_params:
        model_name += '_' + '_'.join(changed_params)
    return model_name

In [None]:
#hide
test_eq(_build_forecast_name(ses, 0.1), 'ses_alpha-0.1')
test_eq(_build_forecast_name(seasonal_window_average, 7, 4), 'seasonal_window_average_season_length-7_window_size-4')

In [None]:
#exporti
def _as_tuple(x):
    if isinstance(x, tuple):
        return x
    return (x,)

In [None]:
#hide
test_eq(_as_tuple((1,)), (1,))
test_eq(_as_tuple(1), (1,))

In [None]:
#exporti
def _get_n_jobs(n_groups, n_jobs, ray_address):
    if ray_address is not None:
        logger.info(
            'Using ray address,'
            'using available resources insted of `n_jobs`'
        )
        try:
            import ray
        except ModuleNotFoundError as e:
            msg = (
                '{e}. To use a ray cluster you have to install '
                'ray. Please run `pip install ray`. '
            )
            raise ModuleNotFoundError(msg) from e
        if not ray.is_initialized():
            ray.init(ray_address, ignore_reinit_error=True)
        actual_n_jobs = int(ray.available_resources()['CPU'])
    else:
        if n_jobs == -1 or (n_jobs is None):
            actual_n_jobs = cpu_count()
        else:
            actual_n_jobs = n_jobs
    return min(n_groups, actual_n_jobs)

In [None]:
#hide
#tests for more series than resources
test_eq(_get_n_jobs(10, -1, None), cpu_count()) 
test_eq(_get_n_jobs(10, None, None), cpu_count())
test_eq(_get_n_jobs(10, 2, None), 2)

In [None]:
#hide
#tests for less series than resources
test_eq(_get_n_jobs(1, -1, None), 1) 
test_eq(_get_n_jobs(1, None, None), 1)
test_eq(_get_n_jobs(2, 10, None), 2)

In [None]:
#export
class StatsForecast:
    """Run a list of models over every series of a dataframe.

    The input frame is converted once into a grouped array. Forecasts and cross
    validation results are then computed model by model, either in this
    process (`n_jobs == 1`) or over chunks of series handed to a process pool.
    """
    
    def __init__(
            self, 
            df: pd.DataFrame, # DataFrame with columns `ds` and `y`, indexed by `unique_id` 
            models: List[Tuple[Callable, Any]], # List of tuples, each containing a fn and its parameters 
            freq: str, # Frequency of the data
            n_jobs: int = 1, # Number of jobs used to parallel processing. Use `-1` to use all cores
            ray_address: Optional[str] = None,  # Optional ray address to distribute jobs
            sort_df: bool = True # Sort `df` according to index and `ds`?
        ):
        # `self.ds` keeps the full (`unique_id`, `ds`) index; it is needed to
        # label the fitted values later on
        self.ga, self.uids, self.last_dates, self.ds = _grouped_array_from_df(df, sort_df)
        self.models = models
        self.freq = pd.tseries.frequencies.to_offset(freq)
        self.n_jobs = _get_n_jobs(len(self.ga), n_jobs, ray_address)
        self.ray_address = ray_address
        self.sort_df = sort_df
        
    def forecast(
            self, 
            h: int, # Forecast horizon
            xreg: Optional[pd.DataFrame] = None, # Future exogenous regressors
            fitted: bool = False, # Save fitted values for each model?
            level: Optional[List[int]] = None, # Levels of propabilistic intervals 
        ):
        """Forecast `h` steps ahead with every model.

        `xreg`, when given, must have `h` rows per series and as many columns
        as the training data, otherwise `ValueError` is raised. With
        `fitted=True` the in-sample results are kept in `fcst_fitted_`.

        Returns a DataFrame indexed by `unique_id` with a `ds` column followed
        by the forecast columns.
        """
        if xreg is not None:
            expected_shape = (h * len(self.ga), self.ga.data.shape[1])
            if xreg.shape != expected_shape:
                raise ValueError(f'Expected xreg to have shape {expected_shape}, but got {xreg.shape}')
            xreg, _, _, _ = _grouped_array_from_df(xreg, sort_df=self.sort_df)
        forecast_kwargs = dict(
            h=h, test_size=None, step_size=None,
            input_size=None, 
            xreg=xreg, fitted=fitted, 
            level=level, mode='forecast',
        )
        if self.n_jobs == 1:
            res_fcsts = self._sequential(**forecast_kwargs)
        else:
            res_fcsts = self._data_parallel(**forecast_kwargs)
        if fitted:
            self.fcst_fitted_ = res_fcsts['fitted']
        fcsts = res_fcsts['fcsts']
        # future timestamps: consecutive integers, or `freq` steps after the last date
        if issubclass(self.last_dates.dtype.type, np.integer):
            last_date_f = lambda x: np.arange(x + 1, x + 1 + h, dtype=self.last_dates.dtype)
        else:
            last_date_f = lambda x: pd.date_range(x + self.freq, periods=h, freq=self.freq)
        if len(np.unique(self.last_dates)) == 1:
            # all series end together: build the range once and repeat it
            dates = np.tile(last_date_f(self.last_dates[0]), len(self.ga))
        else:
            dates = np.hstack([
                last_date_f(last_date)
                for last_date in self.last_dates            
            ])
        idx = pd.Index(np.repeat(self.uids, h), name='unique_id')
        return pd.DataFrame({'ds': dates, **fcsts}, index=idx)
    
    def forecast_fitted_values(self):
        """Return `y` and each model's fitted values from the last `forecast(..., fitted=True)`.

        Raises `Exception` when no fitted values have been stored yet.
        """
        if not hasattr(self, 'fcst_fitted_'):
            raise Exception('Please run `forecast` mehtod using `fitted=True`')
        fcst_fitted = {key: val['values'] for key, val in self.fcst_fitted_.items()}
        fcst_fitted['y'] = self.ga.data[:, 0]
        # `reset_index(level=1)` turns the `ds` index level back into a column
        return pd.DataFrame({**fcst_fitted}, index=self.ds).reset_index(level=1)
    
    def cross_validation(
            self, 
            h: int, # Forecast horizon 
            n_windows: int = 1, # Number of windows used for cross validation
            step_size: int = 1, # Step size between each window 
            test_size: Optional[int] = None, # Lenght of test size. If passed, set `n_windows=None`
            input_size: Optional[int] = None, # Input size for each window 
            fitted=False, # Save fitted values for each window and each model?
        ):
        """Cross validate every model over windows taken from the end of each series.

        Exactly one of `n_windows` and `test_size` must be set (the other one
        `None`); the missing one is derived from `h` and `step_size`. With
        `fitted=True` the per-window fitted results are kept in `cv_fitted_`.

        Returns a DataFrame indexed by `unique_id` with `ds`, `cutoff`, `y`
        and one column per model.

        Raises `Exception` when both or neither of `n_windows` / `test_size`
        are given, or when `test_size - h` is not a multiple of `step_size`.
        """
        # check the "neither" case first: deriving `test_size` from a missing
        # `n_windows` would otherwise fail with an unrelated TypeError
        if (n_windows is None) and (test_size is None):
            raise Exception('you must define `n_windows` or `test_size`')
        if test_size is None:
            test_size = h + step_size * (n_windows - 1)
        elif n_windows is None:
            if (test_size - h) % step_size:
                raise Exception('`test_size - h` should be module `step_size`')
            n_windows = int((test_size - h) / step_size) + 1
        else:
            raise Exception('you must define `n_windows` or `test_size` but not both')
            
        cv_kwargs = dict(
            h=h, test_size=test_size, step_size=step_size, input_size=input_size, 
            xreg=None, fitted=fitted, level=None, mode='cv',
        )
        if self.n_jobs == 1:
            res_fcsts = self._sequential(**cv_kwargs)
        else:
            res_fcsts = self._data_parallel(**cv_kwargs)
        if fitted:
            self.cv_fitted_ = res_fcsts['fitted']
            self.n_cv_ = n_windows
        fcsts = res_fcsts['fcsts']   
        dates = _cv_dates(last_dates=self.last_dates, freq=self.freq, h=h, test_size=test_size, step_size=step_size)
        dates = {'ds': dates['ds'].values, 'cutoff': dates['cutoff'].values}
        idx = pd.Index(np.repeat(self.uids, h * n_windows), name='unique_id')
        return pd.DataFrame({**dates, **fcsts}, index=idx)
    
    def cross_validation_fitted_values(self):
        """Return the fitted values of every window from the last `cross_validation(..., fitted=True)`.

        Raises `Exception` when no fitted values have been stored yet.
        """
        if not hasattr(self, 'cv_fitted_'):
            raise Exception('Please run `cross_validation` mehtod using `fitted=True`')
        # the full (`unique_id`, `ds`) index repeated once per window
        index = pd.MultiIndex.from_tuples(np.tile(self.ds, self.n_cv_), names=['unique_id', 'ds'])
        res = pd.DataFrame(index=index, columns=['cutoff', 'y'] + list(self.cv_fitted_.keys()))
        for model, res_ in self.cv_fitted_.items():
            res[model] = res_['values'].flatten('F')
        # `res_` is the entry of the last model in the loop above
        # NOTE(review): presumably `last_idxs` / `idxs` are the same for every model - confirm
        res['cutoff'] = res_['last_idxs'].flatten('F')
        res['y'] = np.tile(self.ga.data.flatten(), self.n_cv_)
        idxs = res_['idxs'].flatten('F')
        res = res.iloc[idxs].reset_index(level=1)
        # keep `ds` only on the rows flagged by `cutoff`, then back-fill so
        # every row carries the next flagged date
        res['cutoff'] = res['ds'].where(res['cutoff']).bfill()
        return res

    def _sequential(self, h, test_size, step_size, input_size, xreg, fitted, level, mode='forecast'):
        """Run every model on the whole grouped array in this process.

        Returns `{'fcsts': {column: values}, 'fitted': {model_name: ...}}`; in
        `'cv'` mode `fcsts` also gets a leading `'y'` entry with the test values.
        """
        # NOTE(review): relies on `self.ga` providing `compute_forecasts` and a
        # `compute_cv(h, test_size, model, ...)` call order - confirm against
        # the grouped array class in use
        result = {'fcsts': {}, 'fitted': {}}
        logger.info('Computing forecasts')
        for model_args in self.models:
            model, *args = _as_tuple(model_args)
            model_name = _build_forecast_name(model, *args)
            if mode == 'forecast':
                res_fcsts = self.ga.compute_forecasts(h, model, xreg, fitted, level, *args)
                values = res_fcsts['forecasts']
                keys = res_fcsts['keys']
            elif mode == 'cv':
                res_fcsts = self.ga.compute_cv(h, test_size, model, step_size, input_size, fitted, *args)
                values = res_fcsts['forecasts']
                test_values = res_fcsts['y']
                keys = None
            if keys is not None:
                # one output column per key, named `<model>_<key>`
                for j, key in enumerate(keys):
                    result['fcsts'][f'{model_name}_{key}'] = values[:, j]
            else:
                result['fcsts'][model_name] = values.flatten()
            if fitted:
                result['fitted'][model_name] = res_fcsts['fitted']
            logger.info(f'Computed forecasts for {model_name}.')
        if mode == 'cv':
            result['fcsts'] = {'y': test_values.flatten(), **result['fcsts']}
        return result
    
    def _data_parallel(self, h, test_size, step_size, input_size, xreg, fitted, level, mode='forecast'):
        """Run every model on chunks of series spread over a pool of `n_jobs` workers.

        Uses ray's multiprocessing pool when a `ray_address` was given and the
        standard library pool otherwise. Returns the same structure as
        `_sequential`, with the per-chunk results stacked back together.
        """
        result = {'fcsts': {}, 'fitted': {}}
        logger.info('Computing forecasts')
        gas = self.ga.split(self.n_jobs)
        if xreg is not None:
            xregs = xreg.split(self.n_jobs)
        else:
            from itertools import repeat
            
            # endless stream of None so the `zip` below is driven by `gas`
            xregs = repeat(None)
        
        if self.ray_address is not None:
            try:
                from ray.util.multiprocessing import Pool
            except ModuleNotFoundError as e:
                msg = (
                    f'{e}. To use a ray cluster you have to install '
                    'ray. Please run `pip install ray`. '
                )
                raise ModuleNotFoundError(msg) from e
            kwargs = dict(ray_address=self.ray_address)
        else:
            from multiprocessing import Pool
            kwargs = dict()
        
        with Pool(self.n_jobs, **kwargs) as executor:
            for model_args in self.models:
                model, *args = _as_tuple(model_args)
                model_name = _build_forecast_name(model, *args)
                # submit one task per chunk, then wait for all of them
                futures = []
                for ga, xr in zip(gas, xregs):
                    if mode == 'forecast':
                        future = executor.apply_async(ga.compute_forecasts, (h, model, xr, fitted, level, *args,))
                    elif mode == 'cv':
                        future = executor.apply_async(ga.compute_cv, (h, test_size, model, step_size, input_size, fitted, *args))
                    futures.append(future)
                if mode == 'forecast':
                    res_fcsts = [f.get() for f in futures]
                    values = [d['forecasts'] for d in res_fcsts]
                    keys = [d['keys'] for d in res_fcsts]
                    # only the first chunk's keys are used for the column names
                    keys = keys[0]
                elif mode == 'cv':
                    res_fcsts = [f.get() for f in futures]
                    values = [d['forecasts'] for d in res_fcsts]
                    test_values = [d['y'] for d in res_fcsts]
                    keys = None
                if keys is not None:
                    values = np.vstack(values)
                    for j, key in enumerate(keys):
                        result['fcsts'][f'{model_name}_{key}'] = values[:, j]
                else:
                    values = np.hstack([val.flatten() for val in values])
                    result['fcsts'][model_name] = values.flatten()
                if fitted:
                    # concatenate every fitted entry across chunks, key by key
                    res = {}
                    for k in res_fcsts[0]['fitted'].keys():
                        res[k] = np.concatenate([d['fitted'][k] for d in res_fcsts])
                    result['fitted'][model_name] = res
                logger.info(f'Computed forecasts for {model_name}.')
        if mode == 'cv':
            # test values come from the last model processed
            test_values = np.vstack(test_values)
            result['fcsts'] = {'y': test_values.flatten(), **result['fcsts']}
        return result

In [None]:
#hide
add_docs(
    StatsForecast, "Compute forecasts using distinct models in paralell.",
    forecast="Compute forecasts",
    cross_validation="Perform cross validation",
    forecast_fitted_values="Return fitted values for each model",
    cross_validation_fitted_values="Return fitted values for the cross validation phase"
)

In [None]:
show_doc(StatsForecast)

The class `StatsForecast` receives a pandas dataframe with columns `ds` (indicating the timestamp of each observation) and `y` (the target variable). It must be indexed by the identifier of each time series (named `unique_id`). 

The class can compute forecasts for several models. The `models` argument is a list of tuples. Each tuple consists of a function (model) and its hyperparameters.

In [None]:
from statsforecast.models import (
    adida,
    auto_arima,
    croston_classic,
    croston_optimized,
    croston_sba,
    ets,
    historic_average,
    imapa,
    naive,
    random_walk_with_drift,
    seasonal_exponential_smoothing,
    seasonal_naive,
    seasonal_window_average,
    ses,
    tsb,
    window_average,
)
from statsforecast.utils import generate_series

series = generate_series(10_000, n_static_features=2, equal_ends=False)

models = [
    naive, adida, croston_classic, croston_optimized,
    croston_sba, historic_average, imapa, naive, 
    random_walk_with_drift, (seasonal_exponential_smoothing, 7, 0.1),
    (seasonal_naive, 7), (seasonal_window_average, 7, 4),
    (ses, 0.1), (tsb, 0.1, 0.3), (window_average, 4)
]

fcst = StatsForecast(
    df=series,
    models=models,
    freq='D',
)

In [None]:
show_doc(StatsForecast.forecast)

With the `StatsForecast.forecast` method you can compute the forecasts for each model in `models`:

In [None]:
res = fcst.forecast(14)
res

In [None]:
#hide
test_eq(res.index.unique(), fcst.uids)
last_dates = series.groupby('unique_id')['ds'].max()
test_eq(res.groupby('unique_id')['ds'].min().values, last_dates + pd.offsets.Day())
test_eq(res.groupby('unique_id')['ds'].max().values, last_dates + 14 * pd.offsets.Day())

In [None]:
#hide
#test for equal ends time series
series_eq_ends = generate_series(100, equal_ends=True)

fcst = StatsForecast(
    series_eq_ends,
    [adida, croston_classic, croston_optimized,
     croston_sba, historic_average, imapa, naive, 
     random_walk_with_drift, (seasonal_exponential_smoothing, 7, 0.1),
     (seasonal_naive, 7), (seasonal_window_average, 7, 4),
     (ses, 0.1), (tsb, 0.1, 0.3), (window_average, 4)],
    freq='D',
)
res = fcst.forecast(14)

test_eq(res.index.unique(), fcst.uids)
last_dates = series_eq_ends.groupby('unique_id')['ds'].max()
test_eq(res.groupby('unique_id')['ds'].min().values, last_dates + pd.offsets.Day())
test_eq(res.groupby('unique_id')['ds'].max().values, last_dates + 14 * pd.offsets.Day())

In [None]:
#hide
#tests for monthly data
monthly_series = generate_series(10_000, freq='M', min_length=10, max_length=20, equal_ends=True)
monthly_series

fcst = StatsForecast(
    monthly_series,
    [adida, (ses, 0.1), historic_average, croston_classic],
    freq='M'
)
%time monthly_res = fcst.forecast(4)
monthly_res

last_dates = monthly_series.groupby('unique_id')['ds'].max()
test_eq(monthly_res.groupby('unique_id')['ds'].min().values, fcst.last_dates + pd.offsets.MonthEnd())
test_eq(monthly_res.groupby('unique_id')['ds'].max().values, fcst.last_dates + 4 * pd.offsets.MonthEnd())

In [None]:
show_doc(StatsForecast.forecast_fitted_values)

Additionally, you can compute the fitted values for each model. To get them, you need to pass `fitted=True` to the `StatsForecast.forecast` method and then use the `StatsForecast.forecast_fitted_values` method.

In [None]:
fcst = StatsForecast(
    df=series,
    models=[naive],
    freq='D',
)
forecasts = fcst.forecast(14, fitted=True)
fitted = fcst.forecast_fitted_values()

In [None]:
#hide
#tests for fitted values
def test_fcst_fitted(n_jobs=1):
    """Check that the fitted-values frame reproduces the `ds` and `y` columns of `series`."""
    sf = StatsForecast(
        series,
        [naive],
        freq='D',
        n_jobs=n_jobs,
    )
    # the forecasts are not inspected; the call is what stores the fitted values
    sf.forecast(14, fitted=True)
    in_sample = sf.forecast_fitted_values()
    expected_by_col = (
        ('ds', series['ds']),
        ('y', series['y'].astype(np.float32)),
    )
    for col, expected in expected_by_col:
        test_eq(expected, in_sample[col])
test_fcst_fitted()

In [None]:
show_doc(StatsForecast.cross_validation)

You can also perform cross validation with `StatsForecast`, using `StatsForecast.cross_validation`.

In [None]:
fcst = StatsForecast(
    df=series,
    models=[naive],
    freq='D',
)
forecasts_cv = fcst.cross_validation(14, n_windows=2)

In [None]:
#hide
#test for cross_validation
series_cv = pd.DataFrame({
    'ds': np.hstack([
        pd.date_range(end='2021-01-01', freq='D', periods=10),
        pd.date_range(end='2022-01-01', freq='D', periods=100),
        pd.date_range(end='2020-01-01', freq='D', periods=20)
    ]),
    'y': np.hstack([np.arange(10.), np.arange(100, 200), np.arange(20, 40)])
}, index=pd.Index(
    data=np.hstack([np.zeros(10), np.zeros(100) + 1, np.zeros(20) + 2]),
    name='unique_id'
))

fcst = StatsForecast(
    series_cv,
    [sum_ahead, naive],
    freq='D'
)
res_cv = fcst.cross_validation(h=2, test_size=5, n_windows=None)
test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))

n_windows = fcst.cross_validation(h=2, n_windows=2).groupby('unique_id').size().unique()
test_eq(n_windows, 2 * 2)
test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))

n_windows = fcst.cross_validation(h=3, n_windows=3, step_size=3, fitted=True).groupby('unique_id').size().unique()
test_eq(n_windows, 3 * 3)
test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))

In [None]:
#hide
#test for equal ends cross_validation
series_cv = pd.DataFrame({
    'ds': np.hstack([
        pd.date_range(end='2022-01-01', freq='D', periods=10),
        pd.date_range(end='2022-01-01', freq='D', periods=100),
        pd.date_range(end='2022-01-01', freq='D', periods=20)
    ]),
    'y': np.hstack([np.arange(10), np.arange(100, 200), np.arange(20, 40)])
}, index=pd.Index(
    data=np.hstack([np.zeros(10), np.zeros(100) + 1, np.zeros(20) + 2]),
    name='unique_id'
))

fcst = StatsForecast(
    series_cv,
    [sum_ahead],
    freq='D',
)
res_cv = fcst.cross_validation(h=2, test_size=5, n_windows=None)
test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))

n_windows = fcst.cross_validation(h=2, n_windows=2).groupby('unique_id').size().unique()
test_eq(n_windows, 2 * 2)
test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))

n_windows = fcst.cross_validation(h=3, n_windows=3, step_size=3).groupby('unique_id').size().unique()
test_eq(n_windows, 3 * 3)
test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))

In [None]:
show_doc(StatsForecast.cross_validation_fitted_values)

To recover the fitted values for each window and each model in the cross validation phase, you have to pass `fitted=True` to the `StatsForecast.cross_validation` method and then use `StatsForecast.cross_validation_fitted_values`.

In [None]:
fcst = StatsForecast(
    df=series[['ds', 'y']],
    models=[naive],
    freq='D',
)
forecasts_cv = fcst.cross_validation(7, n_windows=2, fitted=True)
fitted_cv = fcst.cross_validation_fitted_values()

In [None]:
#hide
#tests for fitted values cross_validation
def test_cv_fitted(n_jobs=1):
    """Check the fitted values stored during cross validation.

    For every series and every cutoff returned by
    `cross_validation_fitted_values`, the `ds`/`y` columns must equal the
    rows of `series_cv` for that series with `ds` up to and including the
    cutoff (dtypes are not compared).

    Parameters
    ----------
    n_jobs : int
        Forwarded to `StatsForecast`.
    """
    cv_models = [sum_ahead, naive]
    resids_fcst = StatsForecast(series_cv, cv_models, freq='D', n_jobs=n_jobs)
    # The forecasts themselves are not inspected; the call is required so
    # that the fitted values are available afterwards.
    resids_fcst.cross_validation(h=2, n_windows=4, fitted=True)
    resids_cv = resids_fcst.cross_validation_fitted_values()
    compared_cols = ['ds', 'y']
    for uid in resids_cv.index.unique():
        uid_fitted = resids_cv.loc[uid]
        for cutoff in uid_fitted['cutoff'].unique():
            fitted_window = uid_fitted.query('cutoff == @cutoff')[compared_cols]
            train_window = series_cv.query('ds <= @cutoff & unique_id == @uid')[compared_cols]
            pd.testing.assert_frame_equal(
                fitted_window,
                train_window,
                check_dtype=False,
            )
test_cv_fitted()

In [None]:
#hide
#tests for parallel processing
# Detect whether the code is being executed inside a notebook. Only a failed
# import is treated as "not in a notebook"; any other error is allowed to
# surface instead of being silently swallowed.
try:
    from nbdev.imports import IN_NOTEBOOK
except ImportError:
    IN_NOTEBOOK = False

# The multiprocessing checks only run when this file is executed as a script
# outside a notebook.
if __name__=="__main__" and not IN_NOTEBOOK:
    # Forecast and cross validation with two worker processes.
    fcst = StatsForecast(
        series,
        [adida, (ses, 0.1), historic_average, croston_classic],
        freq='D',
        n_jobs=2
    )
    res = fcst.forecast(14)
    res_cv = fcst.cross_validation(h=3, test_size=10, n_windows=None)
    print(res)
    print(res_cv)

    # Cross validation on the equal-ends data: the mean difference between
    # the actuals and the `sum_ahead` column must be exactly zero.
    fcst = StatsForecast(
        series_cv,
        [sum_ahead],
        freq='D',
    )
    res_cv = fcst.cross_validation(h=2, test_size=5, n_windows=None)
    test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))

    # Fitted-values tests with two jobs.
    test_fcst_fitted(n_jobs=2)
    test_cv_fitted(n_jobs=2)
    # check n_windows argument: h * n_windows result rows per series
    n_windows = fcst.cross_validation(h=2, n_windows=2).groupby('unique_id').size().unique()
    test_eq(n_windows, 2 * 2)
    # NOTE(review): this re-checks the earlier `res_cv`, not the run above.
    test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))
    # check step_size argument
    n_windows = fcst.cross_validation(h=3, n_windows=3, step_size=3).groupby('unique_id').size().unique()
    test_eq(n_windows, 3 * 3)
    # NOTE(review): again evaluated on the earlier `res_cv`.
    test_eq(0., np.mean(res_cv['y'] - res_cv['sum_ahead']))

## Integer datestamp

The `StatsForecast` class can also receive integers as datestamp, the following example shows how to do it.

In [None]:
from statsforecast.utils import AirPassengers as ap

In [None]:
# Single series named 'AirPassengers' whose datestamps are the integers
# 1..len(ap); the id column is attached and moved to the index in one chain.
int_ds_df = (
    pd.DataFrame({'ds': np.arange(1, len(ap) + 1), 'y': ap})
    .assign(unique_id='AirPassengers')
    .set_index('unique_id')
)
int_ds_df.head()

In [None]:
# Last rows of the frame; `ds` runs up to len(ap).
int_ds_df.tail()

In [None]:
# `freq='D'` is still passed even though `ds` holds integers in this example.
fcst = StatsForecast(int_ds_df, models=[historic_average], freq='D')
horizon = 7
forecast = fcst.forecast(horizon)
forecast.head()

In [None]:
# The forecast datestamps must be the `horizon` consecutive integers that
# follow the last observed `ds`.
last_date = int_ds_df['ds'].max()
test_eq(forecast['ds'].values, np.arange(last_date + 1, last_date + 1 + horizon))

In [None]:
# Cross validation on the integer-datestamp data.
int_ds_cv = fcst.cross_validation(h=7, test_size=8, n_windows=None)
int_ds_cv

## External regressors

Every column after **y** is considered an external regressor and will be passed to the models that allow them. If you use them you must supply the future values to the `StatsForecast.forecast` method.

In [None]:
def linear_regression(X, h, future_xreg, residuals):
    """Least-squares regression of the target on the external regressors.

    Parameters
    ----------
    X : 2-d array whose first column is the target and whose remaining
        columns are the regressors.
    h : unused by this model.
    future_xreg : 2-d array with the regressor values to predict for.
    residuals : unused by this model.

    Returns
    -------
    dict with a single key, 'mean', holding `future_xreg` multiplied by the
    fitted coefficients.
    """
    target, regressors = X[:, 0], X[:, 1:]
    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution is needed.
    solution = np.linalg.lstsq(regressors, target, rcond=None)[0]
    predictions = future_xreg @ solution
    return {'mean': predictions}

In [None]:
# NOTE: the chained assignment also rebinds `series`. Both names refer to the
# same DataFrame until `pd.get_dummies` below, so `series` receives the
# `intercept` and `dayofweek` columns as well.
series_xreg = series = generate_series(10_000, equal_ends=True)
# Constant column used as a regressor (named as the regression intercept).
series_xreg['intercept'] = 1
series_xreg['dayofweek'] = series_xreg['ds'].dt.dayofweek
# One-hot encode the day of week, dropping the first level.
series_xreg = pd.get_dummies(series_xreg, columns=['dayofweek'], drop_first=True)
series_xreg

In [None]:
# Hold out the last 14 distinct dates as the validation period; this matches
# the 14-step forecast requested below.
dates = sorted(series_xreg['ds'].unique())
valid_start = dates[-14]
train_mask = series_xreg['ds'] < valid_start
series_train = series_xreg[train_mask]
series_valid = series_xreg[~train_mask]
# Future regressor values: the validation rows without the target column.
X_valid = series_valid.drop(columns=['y'])
fcst = StatsForecast(
    series_train,
    [linear_regression],
    freq='D',
)
%time xreg_res = fcst.forecast(14, xreg=X_valid)
# Actuals are attached positionally, so this relies on the forecast rows
# being in the same order as `series_valid`.
xreg_res['y'] = series_valid['y'].values

In [None]:
# Average the columns of `xreg_res` per date across all series and plot them.
xreg_res.groupby('ds').mean().plot()

In [None]:
# Cross validation of the regression model on the training data.
xreg_res_cv = fcst.cross_validation(h=3, test_size=5, n_windows=None)

In [None]:
#hide
# the following cells contain tests for external regressors

In [None]:
#hide
def return_xreg(X, h, xreg, *args):
    """Test model that echoes the first column of the future regressors.

    `X`, `h` and any extra positional arguments are ignored. The result is a
    dict with the single key 'mean'.
    """
    first_regressor = xreg[:, 0]
    return dict(mean=first_regressor)

In [None]:
#hide
# Two series (ids 0 and 1), each with integer datestamps 0..9. `y` is random
# and unseeded; `x` is the external regressor with the values 0..19.
df = pd.DataFrame(
    {
        'ds': np.hstack([np.arange(10), np.arange(10)]),
        'y': np.random.rand(20),
        'x': np.arange(20, dtype=np.float32),
    },
    index=pd.Index([0] * 10 + [1] * 10, name='unique_id'),
)
# Datestamps 0..5 of each series are used for training, 6..9 for testing.
train_mask = df['ds'] < 6
train_df = df[train_mask]
test_df = df[~train_mask]

In [None]:
#hide
# `return_xreg` echoes the first future regressor, so the forecast is expected
# to equal the test regressors with column `x` renamed to the model name.
fcst = StatsForecast(
    train_df,
    [return_xreg],
    freq='M',
    n_jobs=1,
)
# Future regressors: the test rows without the target.
xreg = test_df.drop(columns='y')
res = fcst.forecast(4, xreg=xreg)
expected_res = xreg.rename(columns={'x': 'return_xreg'})
pd.testing.assert_frame_equal(res, expected_res, check_dtype=False)

In [None]:
#hide
# Same external-regressor check with two jobs; only runs when executed as a
# script outside a notebook.
if __name__=="__main__" and not IN_NOTEBOOK:
    fcst = StatsForecast(
        train_df,
        [return_xreg],
        freq='M',
        n_jobs=2,
    )
    # Future regressors: the test rows without the target.
    xreg = test_df.drop(columns='y')
    res = fcst.forecast(4, xreg=xreg)
    expected_res = xreg.rename(columns={'x': 'return_xreg'})
    pd.testing.assert_frame_equal(res, expected_res, check_dtype=False)

## Confidence intervals

You can pass the argument `level` to the `StatsForecast.forecast` method to calculate confidence intervals. Not all models can calculate them at the moment, so intervals are only returned for the models that have them implemented.

In [None]:
# Single series (id 0) built from the AirPassengers data with integer datestamps.
ap_df = pd.DataFrame({'ds': np.arange(ap.size), 'y': ap}, index=pd.Index([0] * ap.size, name='unique_id'))
fcst = StatsForecast(
    ap_df,
    [(seasonal_naive, 12), (auto_arima, 12)],  # (model, argument) tuples; 12 is presumably the season length
    freq='M',
)
# Forecast 12 steps ahead, requesting the 80 and 95 levels.
ap_ci = fcst.forecast(12, level=(80, 95))
ap_ci.set_index('ds').plot(marker='.', figsize=(10, 6))

In [None]:
#hide
#test number of jobs greater than the available cores
# Only runs when executed as a script outside a notebook. `n_jobs=101` is
# assumed to exceed the number of cores of the machine running the test.
if __name__=="__main__" and not IN_NOTEBOOK:
    ap_df = pd.DataFrame({'ds': np.arange(ap.size), 'y': ap}, index=pd.Index([0] * ap.size, name='unique_id'))
    fcst = StatsForecast(
        ap_df,
        [(seasonal_naive, 12), (auto_arima, 12)],
        freq='M',
        n_jobs=101
    )
    # The run is expected to complete without raising; no values are asserted.
    ap_ci = fcst.forecast(12, level=(80, 95))
    ap_ci.set_index('ds').plot(marker='.', figsize=(10, 6))

In [None]:
#hide
# The following cells contain parallel backends
# we should create a separate submodule for this
# for the moment we skip them from docs

In [None]:
#exporti
class ParallelBackend:
    """Base backend: runs `StatsForecast` without passing `n_jobs` or `ray_address`.

    Both methods expect `df` to carry a `unique_id` column, which is moved to
    the index before the `StatsForecast` object is built.
    """

    def forecast(self, df, models, freq, **kwargs: Any) -> Any:
        """Return `StatsForecast(...).forecast(**kwargs)` for the given data."""
        indexed_df = df.set_index("unique_id")
        return StatsForecast(indexed_df, models, freq).forecast(**kwargs)

    def cross_validation(self, df, models, freq, **kwargs: Any) -> Any:
        """Return `StatsForecast(...).cross_validation(**kwargs)` for the given data."""
        indexed_df = df.set_index("unique_id")
        return StatsForecast(indexed_df, models, freq).cross_validation(**kwargs)

In [None]:
#exporti
class MultiprocessBackend(ParallelBackend):
    """Backend that forwards a fixed `n_jobs` to every `StatsForecast` it builds.

    Both methods expect `df` to carry a `unique_id` column, which is moved to
    the index before the `StatsForecast` object is built.
    """

    def __init__(self, n_jobs: int) -> None:
        self.n_jobs = n_jobs
        super().__init__()

    def forecast(self, df, models, freq, **kwargs: Any) -> Any:
        """Return `forecast(**kwargs)` of a `StatsForecast` built with `n_jobs`."""
        indexed_df = df.set_index("unique_id")
        runner = StatsForecast(indexed_df, models, freq, n_jobs=self.n_jobs)
        return runner.forecast(**kwargs)

    def cross_validation(self, df, models, freq, **kwargs: Any) -> Any:
        """Return `cross_validation(**kwargs)` of a `StatsForecast` built with `n_jobs`."""
        indexed_df = df.set_index("unique_id")
        runner = StatsForecast(indexed_df, models, freq, n_jobs=self.n_jobs)
        return runner.cross_validation(**kwargs)

In [None]:
#exporti
class RayBackend(ParallelBackend):
    """Backend that forwards a fixed `ray_address` to every `StatsForecast` it builds.

    Both methods expect `df` to carry a `unique_id` column, which is moved to
    the index before the `StatsForecast` object is built.
    """

    def __init__(self, ray_address) -> None:
        self.ray_address = ray_address

    def forecast(self, df, models, freq, **kwargs: Any) -> Any:
        """Return `forecast(**kwargs)` of a `StatsForecast` built with `ray_address`."""
        model = StatsForecast(df.set_index("unique_id"), models, freq, ray_address=self.ray_address)
        return model.forecast(**kwargs)

    def cross_validation(self, df, models, freq, **kwargs: Any) -> Any:
        """Return `cross_validation(**kwargs)` of a `StatsForecast` built with `ray_address`."""
        model = StatsForecast(df.set_index("unique_id"), models, freq, ray_address=self.ray_address)
        # `df`, `models` and `freq` were already consumed by the constructor;
        # only the keyword arguments belong to the method call, as in the
        # other backends.
        return model.cross_validation(**kwargs)

In [None]:
#exporti
def forecast(
    df,
    models,
    freq,
    h,
    xreg=None,
    level=None,
    parallel: Optional["ParallelBackend"] = None,
):
    """Forecast through a backend.

    `df`, `models` and `freq` are passed positionally to the backend's
    `forecast` method; `h`, `xreg` and `level` are passed as keywords.
    A `ParallelBackend` is created when `parallel` is None.
    """
    if parallel is None:
        parallel = ParallelBackend()
    forecast_kwargs = dict(h=h, xreg=xreg, level=level)
    return parallel.forecast(df, models, freq, **forecast_kwargs)

In [None]:
#exporti
def cross_validation(
    df,
    models,
    freq,
    h,
    n_windows=1,
    step_size=1,
    test_size=None,
    input_size=None,
    parallel: Optional["ParallelBackend"] = None,
):
    """Cross validate through a backend.

    `df`, `models` and `freq` are passed positionally to the backend's
    `cross_validation` method; `h`, `n_windows`, `step_size`, `test_size`
    and `input_size` are passed as keywords. A `ParallelBackend` is created
    when `parallel` is None.
    """
    if parallel is None:
        parallel = ParallelBackend()
    cv_kwargs = dict(
        h=h,
        n_windows=n_windows,
        step_size=step_size,
        test_size=test_size,
        input_size=input_size,
    )
    return parallel.cross_validation(df, models, freq, **cv_kwargs)