In [1]:
#default_exp utils

In [2]:
#hide
%load_ext autoreload
%autoreload 2

# Utils

> Common utilities.

In [3]:
#export
import random
from itertools import chain

import numpy as np
import pandas as pd

In [4]:
#export
def generate_series(
    n_series: int,
    freq: str = 'D',
    min_length: int = 50,
    max_length: int = 500,
    n_static_features: int = 0,
    equal_ends: bool = False,
    seed: int = 0,
) -> pd.DataFrame:
    """Generate `n_series` synthetic seasonal series of frequency `freq` in long format.

    Each series has a random length drawn from the interval
    [`min_length`, `max_length`]. The target `y` is a sawtooth with the
    seasonal period of `freq` (7 for 'D', 12 for 'M') plus uniform noise in
    [0, 0.5).

    Parameters
    ----------
    n_series : int
        Number of series to generate.
    freq : str
        Pandas frequency alias; only 'D' and 'M' are supported.
    min_length, max_length : int
        Inclusive bounds for the length of each series.
    n_static_features : int
        If > 0, each series gets that many categorical columns
        (`static_0`, `static_1`, ...) holding one random integer in
        [0, 100] per series. `y` is additionally scaled by
        `1 + category code` of `static_0`.
    equal_ends : bool
        If True all series end on the same date; otherwise they all start
        on '2000-01-01'.
    seed : int
        Seed for every random draw, so equal arguments give equal output.

    Returns
    -------
    pd.DataFrame
        Indexed by the ordered categorical `unique_id`, with columns `ds`,
        `y` and the optional `static_<i>` features.

    Raises
    ------
    KeyError
        If `freq` is not one of the supported frequencies.
    """
    seasonalities = {'D': 7, 'M': 12}
    season = seasonalities[freq]

    rng = np.random.RandomState(seed)
    series_lengths = rng.randint(min_length, max_length + 1, n_series)
    total_length = series_lengths.sum()

    dates = pd.date_range('2000-01-01', periods=max_length, freq=freq).values
    uids = np.repeat(np.arange(n_series), series_lengths)
    # Position of every row inside its own series: global row number minus
    # the row at which that series starts.
    first_rows = np.cumsum(series_lengths) - series_lengths
    steps = np.arange(total_length) - np.repeat(first_rows, series_lengths)
    if equal_ends:
        # Shift each series so that its last row lands on the last date.
        steps = steps + np.repeat(max_length - series_lengths, series_lengths)
    ds = dates[steps]
    # The seasonal pattern runs over the concatenated rows, not per series.
    y = np.arange(total_length) % season + rng.rand(total_length) * 0.5
    series = pd.DataFrame({'unique_id': uids, 'ds': ds, 'y': y})
    for i in range(n_static_features):
        # A private generator per feature: reseeding the global `random`
        # module with the same seed on every iteration made all static
        # columns identical and clobbered the caller's random state.
        # Feature 0 keeps the exact values the global reseed used to give.
        feature_seed = None if seed is None else seed + i
        feature_rng = random.Random(feature_seed)
        values = np.array(
            [feature_rng.randint(0, 100) for _ in range(n_series)], dtype=int
        )
        series[f'static_{i}'] = np.repeat(values, series_lengths)
        series[f'static_{i}'] = series[f'static_{i}'].astype('category')
        if i == 0:
            series['y'] = series['y'] * (1 + series[f'static_{i}'].cat.codes)
    series['unique_id'] = series['unique_id'].astype('category')
    series['unique_id'] = series['unique_id'].cat.as_ordered()
    series = series.set_index('unique_id')
    return series

In [5]:
# Generate 100 daily series using the default length range and seed (no static features).
data = generate_series(100)
data

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-01-01,0.497650
0,2000-01-02,1.290925
0,2000-01-03,2.207184
0,2000-01-04,3.237349
0,2000-01-05,4.311755
...,...,...
99,2000-06-25,6.477150
99,2000-06-26,0.431850
99,2000-06-27,1.447339
99,2000-06-28,2.081776


In [6]:
# Same generator at monthly frequency: timestamps fall on month ends and the seasonal period is 12.
monthly_data = generate_series(100, freq='M')
monthly_data

Unnamed: 0_level_0,ds,y
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2000-01-31,0.497650
0,2000-02-29,1.290925
0,2000-03-31,2.207184
0,2000-04-30,3.237349
0,2000-05-31,4.311755
...,...,...
99,2014-09-30,10.477150
99,2014-10-31,11.431850
99,2014-11-30,0.447339
99,2014-12-31,1.081776
