In [None]:
#| default_exp utils

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

# Utils

> The `core.StatsForecast` class allows you to efficiently fit multiple `StatsForecast` models for large sets of time series. It operates on a pandas DataFrame `df` that identifies individual series and datestamps with the `unique_id` and `ds` columns, while the `y` column denotes the target time series variable. To assist development, we declare useful datasets that we use throughout `StatsForecast`'s unit tests.

In [None]:
#| export
import inspect
import math
import os
import warnings
from collections import namedtuple
from functools import wraps
from typing import Dict

import numpy as np
import pandas as pd
from numba import njit
from scipy.stats import norm

from utilsforecast.compat import DataFrame
from utilsforecast.data import generate_series as utils_generate_series

In [None]:
#| exporti
# Global variables
# Numba compilation switches, read from the environment when this module is imported.
# The raw string is passed to `bool`, so ANY non-empty value (even '0') enables a switch.
NOGIL = bool(os.getenv('NIXTLA_NUMBA_RELEASE_GIL', ''))
# Old name of the cache switch: still honored below, but warns when it is set.
LEGACY_CACHE = bool(os.getenv('NUMBA_CACHE', ''))
if LEGACY_CACHE:
    warnings.warn(
        'The NUMBA_CACHE environment variable has been renamed to NIXTLA_NUMBA_CACHE. '
        'Please set that one instead.',
        FutureWarning,
    )
# Caching is enabled when either the new or the legacy variable is set.
CACHE = bool(os.getenv('NIXTLA_NUMBA_CACHE', '')) or LEGACY_CACHE
# Record type with the fields `x`, `fn`, `nit` and `simplex`.
# NOTE(review): presumably the result of an optimizer defined elsewhere — confirm against its users.
results = namedtuple("results", "x fn nit simplex")

In [None]:
#| exporti
@njit(nogil=NOGIL, cache=CACHE)
def restrict_to_bounds(x, lower, upper):
    """Clamp every element of `x` to its own `[lower[i], upper[i]]` range.

    Parameters
    ----------
    x : np.ndarray
        Values to restrict. Elements are accessed as `x[i]` for `i < x.size`,
        so a 1-d array is assumed.
    lower : np.ndarray
        Per-element lower bounds, indexed like `x`.
    upper : np.ndarray
        Per-element upper bounds, indexed like `x`.

    Returns
    -------
    np.ndarray
        New array created with `np.full_like(x, ...)`; `x` is not modified.
    """
    # Output buffer; every position is overwritten by one of the branches below.
    new_x = np.full_like(x, fill_value=np.nan, dtype=x.dtype)
    for i in range(x.size):
        lo = lower[i]
        up = upper[i]
        if x[i] < lo:
            new_x[i] = lo
        elif x[i] > up:
            new_x[i] = up
        else:
            # In range, or both comparisons were false (e.g. NaN): keep the value.
            new_x[i] = x[i]
    return new_x

In [None]:
#| hide
import matplotlib.pyplot as plt

from nbdev.showdoc import add_docs, show_doc
from fastcore.test import test_eq

# 1. Synthetic Panel Data

In [None]:
#| export
def generate_series(
    n_series: int,
    freq: str = 'D',
    min_length: int = 50,
    max_length: int = 500,
    n_static_features: int = 0,
    equal_ends: bool = False,
    engine: str = 'pandas',
    seed: int = 0,
) -> DataFrame:
    """Generate Synthetic Panel Series.

    Creates `n_series` series of frequency `freq` whose lengths fall in the interval
    [`min_length`, `max_length`].
    When `n_static_features > 0` every series also receives static features with random values.
    When `equal_ends == True` all the series finish on the same date.

    Parameters
    ----------
    n_series : int
        Number of series in the synthetic panel.
    freq : str (default='D')
        Frequency of the data, 'D' or 'M'.
    min_length : int (default=50)
        Minimum length of the series in the panel.
    max_length : int (default=500)
        Maximum length of the series in the panel.
    n_static_features : int (default=0)
        Number of static exogenous variables attached to each series.
    equal_ends : bool (default=False)
        Whether all series should end on the same date stamp `ds`.
    engine : str (default='pandas')
        Output Dataframe type ('pandas' or 'polars').
    seed : int (default=0)
        Random seed used for generating the data.

    Returns
    -------
    pandas or polars DataFrame
        Synthetic panel with columns [`unique_id`, `ds`, `y`] and exogenous.
    """
    # Every argument is forwarded unchanged, by keyword, to the utilsforecast generator.
    generator_kwargs = dict(
        n_series=n_series,
        freq=freq,
        min_length=min_length,
        max_length=max_length,
        n_static_features=n_static_features,
        equal_ends=equal_ends,
        engine=engine,
        seed=seed,
    )
    return utils_generate_series(**generator_kwargs)

In [None]:
show_doc(generate_series, title_level=3)

In [None]:
# Build a two-series panel with the default settings and show the first four rows of each series.
synthetic_panel = generate_series(n_series=2)
synthetic_panel.groupby('unique_id', observed=True).head(4)

# 2. AirPassengers Data

The classic Box & Jenkins airline data. Monthly totals of international airline passengers, 1949 to 1960.

It has been used as a reference in several forecasting libraries: because the series shows clear trend and seasonality, it offers a nice opportunity to quickly showcase a model's predictive performance.

In [None]:
#| export
# Monthly totals of international airline passengers (Box & Jenkins airline data),
# stored as a flat float array in chronological order.
AirPassengers = np.array([112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,
                          118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
                          114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,
                          162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
                          209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
                          272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
                          302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,
                          315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,
                          318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
                          348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,
                          362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
                          342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
                          417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
                          432.])

In [None]:
#| export
# Long-format frame for the AirPassengers series with the `unique_id`, `ds`, `y` columns:
# one row per observation, `ds` generated at month-end frequency from '1949-01-01'.
# Note: `np.ones` makes `unique_id` the float 1.0 on every row.
AirPassengersDF = pd.DataFrame({'unique_id': np.ones(len(AirPassengers)),
                                'ds': pd.date_range(start='1949-01-01',
                                                    periods=len(AirPassengers), freq=pd.offsets.MonthEnd()),
                                'y': AirPassengers})

In [None]:
from statsforecast.utils import AirPassengersDF

In [None]:
AirPassengersDF.head(12)

In [None]:
# Plot the observed AirPassengers series over time (no predictions or intervals are drawn here).
fig, ax = plt.subplots(1, 1, figsize = (20, 7))
plot_df = AirPassengersDF.set_index('ds')

plot_df[['y']].plot(ax=ax, linewidth=2)
ax.set_title('AirPassengers Forecast', fontsize=22)
ax.set_ylabel('Monthly Passengers', fontsize=20)
ax.set_xlabel('Timestamp [t]', fontsize=20)
ax.legend(prop={'size': 15})
ax.grid()

## Model utils

In [None]:
#| exporti
def _repeat_val_seas(season_vals: np.ndarray, h: int) -> np.ndarray:
    repeats = math.ceil(h / season_vals.size)
    return np.tile(season_vals, repeats)[:h]

def _ensure_float(x: np.ndarray) -> np.ndarray:
    if x.dtype not in (np.float32, np.float64):
        x = x.astype(np.float32)
    return x

def _seasonal_naive(
    y: np.ndarray,  # time series
    h: int,  # forecasting horizon
    fitted: bool,  # fitted values
    season_length: int,  # season length
) -> Dict[str, np.ndarray]:
    """Seasonal naive forecast: repeat the last observed season over the horizon.

    Parameters
    ----------
    y : np.ndarray
        Time series; converted with `_ensure_float` before use.
    h : int
        Forecasting horizon.
    fitted : bool
        Whether to also return the in-sample fitted values.
    season_length : int
        Number of observations per season.

    Returns
    -------
    dict
        `'mean'` holds the `h` forecasts. When `fitted` is true, `'fitted'` holds the
        series shifted by `season_length`, with NaN in the first `season_length` slots.
    """
    y = _ensure_float(y)
    n_obs = y.size
    # Template for one season: the most recent observations go first and, when the
    # series is shorter than a season, the remaining slots stay NaN.
    n_known = min(season_length, n_obs)
    last_season = np.full(season_length, np.nan, dtype=y.dtype)
    last_season[:n_known] = y[-n_known:]
    result = {"mean": _repeat_val_seas(season_vals=last_season, h=h)}
    if not fitted:
        return result
    # Each fitted value is the observation one season earlier; the first season has none.
    in_sample = np.full_like(y, np.nan)
    if n_obs > season_length:
        in_sample[season_length:] = y[:n_obs - season_length]
    result["fitted"] = in_sample
    return result

def _repeat_val(val: float, h: int) -> np.ndarray:
    return np.full(h, val)

def _naive(
    y: np.ndarray, # time series
    h: int, # forecasting horizon
    fitted: bool, # fitted values
) -> Dict[str, np.ndarray]:
    """Naive forecast: repeat the last observed value over the horizon.

    Parameters
    ----------
    y : np.ndarray
        Time series (1-d, with at least one observation since `y[-1]` is read).
    h : int
        Forecasting horizon.
    fitted : bool
        Whether to also return the in-sample fitted values.

    Returns
    -------
    dict
        `'mean'` holds `h` copies of `y[-1]`. When `fitted` is true, `'fitted'` holds
        the series shifted by one step, with NaN in the first position.
    """
    fcst = {'mean': _repeat_val(val=y[-1], h=h)}
    if fitted:
        # NaN marks the first observation, which has no predecessor. Integer and
        # boolean arrays cannot store NaN, so the fitted values are only kept in
        # `y`'s dtype when it is floating/complex; otherwise float64 is used.
        fitted_dtype = y.dtype if np.issubdtype(y.dtype, np.inexact) else np.float64
        fitted_vals = np.full_like(y, np.nan, dtype=fitted_dtype)
        # One-step shift: the fitted value at t is the observation at t - 1.
        fitted_vals[1:] = y[:-1]
        fcst['fitted'] = fitted_vals
    return fcst

In [None]:
#| hide
# test seasonal naive
# `y` repeats a pattern of period 12, so a 12-step forecast with season_length=12
# must reproduce the last 12 observations.
y = np.array([0.50187596, 0.40536128, 0.33436676, 0.27868117, 0.25251294,
       0.18961286, 0.07082107, 2.58699709, 3.06466854, 2.25150509,
       1.33027107, 0.73332616, 0.50187596, 0.40536128, 0.33436676,
       0.27868117, 0.25251294, 0.18961286, 0.07082107, 2.58699709,
       3.06466854, 2.25150509, 1.33027107, 0.73332616, 0.50187596,
       0.40536128, 0.33436676, 0.27868117, 0.25251294, 0.18961286,
       0.07082107, 2.58699709, 3.06466854, 2.25150509, 1.33027107,
       0.73332616, 0.50187596, 0.40536128, 0.33436676, 0.27868117,
       0.25251294, 0.18961286, 0.07082107, 2.58699709, 3.06466854,
       2.25150509, 1.33027107, 0.73332616, 0.50187596, 0.40536128,
       0.33436676, 0.27868117, 0.25251294, 0.18961286, 0.07082107,
       2.58699709, 3.06466854, 2.25150509, 1.33027107, 0.73332616,
       0.50187596, 0.40536128, 0.33436676, 0.27868117, 0.25251294,
       0.18961286, 0.07082107, 2.58699709, 3.06466854, 2.25150509,
       1.33027107, 0.73332616, 0.50187596, 0.40536128, 0.33436676,
       0.27868117, 0.25251294, 0.18961286, 0.07082107, 2.58699709,
       3.06466854, 2.25150509, 1.33027107, 0.73332616, 0.50187596,
       0.40536128, 0.33436676, 0.27868117, 0.25251294, 0.18961286,
       0.07082107, 2.58699709, 3.06466854, 2.25150509, 1.33027107,
       0.73332616, 0.50187596, 0.40536128, 0.33436676, 0.27868117,
       0.25251294, 0.18961286, 0.07082107, 2.58699709, 3.06466854,
       2.25150509, 1.33027107, 0.73332616, 0.50187596, 0.40536128,
       0.33436676, 0.27868117, 0.25251294, 0.18961286, 0.07082107,
       2.58699709, 3.06466854, 2.25150509, 1.33027107, 0.73332616,
       0.50187596, 0.40536128, 0.33436676, 0.27868117, 0.25251294,
       0.18961286])
seas_naive_fcst = dict(_seasonal_naive(y=y, h=12, season_length=12, fitted=True))['mean']
np.testing.assert_array_almost_equal(seas_naive_fcst, y[-12:])


# Same check on a second series that also repeats with period 12.
y = np.array([0.05293832, 0.10395079, 0.25626143, 0.61529232, 1.08816604,
       0.54493457, 0.43415014, 0.47676606, 5.32806397, 3.00553563,
       0.04473598, 0.04920475, 0.05293832, 0.10395079, 0.25626143,
       0.61529232, 1.08816604, 0.54493457, 0.43415014, 0.47676606,
       5.32806397, 3.00553563, 0.04473598, 0.04920475, 0.05293832,
       0.10395079, 0.25626143, 0.61529232, 1.08816604, 0.54493457,
       0.43415014, 0.47676606, 5.32806397, 3.00553563, 0.04473598,
       0.04920475, 0.05293832, 0.10395079, 0.25626143, 0.61529232,
       1.08816604, 0.54493457, 0.43415014, 0.47676606, 5.32806397,
       3.00553563, 0.04473598, 0.04920475, 0.05293832, 0.10395079,
       0.25626143, 0.61529232, 1.08816604])
seas_naive_fcst = dict(_seasonal_naive(y=y, h=12, season_length=12, fitted=True))['mean']
np.testing.assert_array_almost_equal(seas_naive_fcst, y[-12:])

# Fitted values on an integer series shorter than two seasons: the first 12 entries
# are NaN and the remaining 11 are the series shifted by one season.
y = np.arange(23)
seas_naive_fcst = _seasonal_naive(y, h=12, fitted=True, season_length=12)
np.testing.assert_equal(seas_naive_fcst['fitted'], np.hstack([np.full(12, np.nan), y[:11]]))

In [None]:
#| exporti
# Functions used for calculating prediction intervals 
def _quantiles(level): 
    level = np.asarray(level)
    z = norm.ppf(0.5+level/200)   
    return z

def _calculate_intervals(out, level, h, sigmah):
    """Build normal prediction intervals around `out['mean']`.

    Parameters
    ----------
    out : dict
        Forecast output; only `out['mean']` is read.
    level : iterable
        Interval levels in percent; each one produces a `lo-{level}` and a `hi-{level}` entry.
    h : int
        Forecasting horizon (number of columns of the quantile grid).
    sigmah : array-like or scalar
        Forecast standard deviation, multiplied with the quantile grid.

    Returns
    -------
    dict
        All `lo-{level}` entries followed by all `hi-{level}` entries.
    """
    z = _quantiles(np.asarray(level))
    # One row per level: that level's quantile repeated across the horizon.
    z_grid = np.repeat(z, h).reshape(z.shape[0], h)
    center = out['mean']
    lower = center - z_grid * sigmah
    upper = center + z_grid * sigmah
    pred_int = {f'lo-{lv}': lower[row] for row, lv in enumerate(level)}
    pred_int.update({f'hi-{lv}': upper[row] for row, lv in enumerate(level)})
    return pred_int

def _calculate_sigma(residuals, n): 
    if n>0:
        sigma = np.nansum(residuals ** 2) 
        sigma = sigma / n
        sigma = np.sqrt(sigma)
    else:
        sigma = 0
    return sigma

In [None]:
#| exporti
class ConformalIntervals:
    """Container for the settings used to compute conformal intervals.

    Attributes set by the constructor: `n_windows`, `h` and `method`.
    """

    def __init__(
        self,
        n_windows: int = 2,
        h: int = 1,
        method: str = "conformal_distribution",
    ):
        """Validate and store the configuration.

        Raises
        ------
        ValueError
            If `n_windows` is smaller than 2 or `method` is not an allowed method.
        """
        allowed_methods = ["conformal_distribution"]
        too_few_windows = n_windows < 2
        if too_few_windows:
            raise ValueError(
                "You need at least two windows to compute conformal intervals"
            )
        unknown_method = method not in allowed_methods
        if unknown_method:
            raise ValueError(f"method must be one of {allowed_methods}")
        self.n_windows, self.h, self.method = n_windows, h, method

In [None]:
#| exporti
def _old_kw_to_pos(old_names, new_positions):
    def decorator(f):
        @wraps(f)
        def inner(*args, **kwargs):
            arg_names = inspect.getfullargspec(f).args
            new_args = list(args)
            for old_name, pos in zip(old_names, new_positions):
                if old_name in kwargs:
                    new_name = arg_names[pos]
                    warnings.warn(
                        f"`{old_name}` has been deprecated, please use `{new_name}` instead.",
                        FutureWarning,
                    )
                    if len(new_args) > pos:
                        new_args = [
                            *new_args[:pos],
                            kwargs[old_name],
                            *new_args[pos + 1 :],
                        ]
                    else:
                        new_args = list(new_args)
                        for i in range(len(new_args), pos):
                            new_args.append(kwargs.pop(arg_names[i]))
                        new_args.append(kwargs.pop(old_name))
            return f(*new_args, **kwargs)
        return inner
    return decorator

In [None]:
#| hide
# `d` is the deprecated name of the argument at position 0 (`a`) and `e` the
# deprecated name of the argument at position 2 (`c`).
@_old_kw_to_pos(['d', 'e'], [0, 2])
def f(a, b, c, *, d=None, e=None):
    return a + b + c

# Calls that only use the current names behave as usual.
test_eq(f(1, 2, 3), 6)
test_eq(f(a=1, b=2, c=3), 6)
# Calls with the deprecated keywords give the same result and issue one warning per old name.
with warnings.catch_warnings(record=True) as issued_warnings:
    warnings.filterwarnings('always')
    test_eq(f(1, b=2, e=3), 6)
    test_eq(f(d=1, b=2, e=3), 6)
assert any('`d` has been deprecated, please use `a` instead.' in str(w.message) for w in issued_warnings)
assert any('`e` has been deprecated, please use `c` instead.' in str(w.message) for w in issued_warnings)