In [None]:
# default_exp data.dataloaders

In [None]:
#hide
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the library are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
#export
from typing import Collection, Dict, List, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import torch

from nixtla.data.datasets import Tourism

# Dataloaders

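> Helpers that turn long-format time series dataframes into left-padded tensors and batch them for N-BEATS training and inference.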

In [None]:
#export
def uids_ts_from_df(df: pd.DataFrame,
                    id_cols: Collection[str]) -> Tuple[Tuple[str, ...], Tuple[np.ndarray, ...]]:
    """Split a long-format dataframe into series identifiers and their target values.

    Parameters
    ----------
    df: pd.DataFrame
        Frame holding a `y` column plus the column(s) named by `id_cols`.
    id_cols: Collection[str]
        Column name(s) forwarded to `DataFrame.groupby` to identify each series.

    Returns
    -------
    Tuple[Tuple[str, ...], Tuple[np.ndarray, ...]]
        `(unique_ids, time_series)`. Group keys appear in order of first
        appearance (`sort=False`), and `time_series[i]` holds the `y` values of
        the group `unique_ids[i]`. Both tuples are empty when `df` has no groups,
        so the result can always be unpacked into two names.
    """
    uids, series = [], []
    # sort=False keeps the groups in the order they first appear in `df`.
    for uid, group in df.groupby(id_cols, sort=False):
        uids.append(uid)
        series.append(group['y'].values)
    # Materialise real tuples: the annotated return type is a tuple, and a bare
    # `zip` could only be consumed once and failed to unpack on empty input.
    return tuple(uids), tuple(series)

In [None]:
# Load the Tourism dataset using 'data' as the target directory (the log output
# of this cell shows the archive being downloaded and decompressed under it).
tourism_dataset = Tourism.load('data')
# Keep only the 'Yearly' group; later cells use it as the example data.
tourism_dataset_yearly = tourism_dataset.get_group('Yearly')
# Preview the long-format target frame (columns: unique_id, ds, y).
tourism_dataset_yearly.Y.head()

100%|██████████| 356k/356k [00:00<00:00, 1.17MiB/s]
INFO:nixtla.data.datasets:Successfully downloaded 27-3-Athanasopoulos1.zip, 356334, bytes.
INFO:nixtla.data.datasets:Successfully decompressed data/tourism/datasets/27-3-Athanasopoulos1.zip


Unnamed: 0,unique_id,ds,y
0,Y1,1,25092.2284
1,Y1,2,24271.5134
2,Y1,3,25828.9883
3,Y1,4,27697.5047
4,Y1,5,27956.2276


In [None]:
# Split the long-format frame into aligned series ids and value arrays.
unique_ids, time_series = uids_ts_from_df(tourism_dataset_yearly.Y, id_cols='unique_id')
# Show the first two ids together with their series.
unique_ids[:2], time_series[:2]

(('Y1', 'Y2'),
 (array([25092.2284, 24271.5134, 25828.9883, 27697.5047, 27956.2276,
         29924.4321, 30216.8321, 32613.4968, 36053.1674, 38472.7532,
         38420.894 ]),
  array([ 887896.51,  887068.98,  971549.04, 1064206.39, 1195560.94,
         1351933.55, 1372823.36, 1532533.61, 1587760.62, 1617737.85,
         1499631.11])))

In [None]:
# Reference implementation: recover every series through a plain index lookup.
df = tourism_dataset_yearly.Y.set_index('unique_id')
uids = df.index.unique()
ts = [df.loc[unique_id, 'y'] for unique_id in uids]

# The value comparison below pairs series by position, so the ids must match
# in order (not merely as sets) and both collections must have the same length;
# otherwise `zip` would silently truncate or compare misaligned series.
assert list(unique_ids) == list(uids)
assert len(time_series) == len(ts)
assert all(np.allclose(time_serie, serie) for time_serie, serie in zip(time_series, ts))

In [None]:
#exporti
def _nbeats_tensors_from_time_series(time_series: Sequence[np.ndarray],
                                     input_size: int,
                                     output_size: Optional[int] = None) -> Tuple[torch.Tensor,torch.Tensor]:
    n_series = len(time_series)
    if output_size is None:
        max_timesteps = input_size
    else:
        max_ts_length = max(time_serie.size for time_serie in time_series)
        max_timesteps = max(max_ts_length, input_size + output_size)
    series = torch.zeros((n_series, max_timesteps), dtype=torch.float32)
    mask = torch.zeros((n_series, max_timesteps), dtype=torch.float32)
    for i, time_serie in enumerate(time_series):
        idx = time_serie.size
        if output_size is None:
            idx = min(idx, input_size)
            input_arr = time_serie[-idx:]
        else:
            input_arr = time_serie
        series[i, -idx:] = torch.from_numpy(input_arr)
        mask[i, -idx:] = 1

    return series, mask

In [None]:
#hide
# Build the training-mode tensors (an `output_size` is given, so series are kept whole).
input_size, output_size = 4, 1
series, mask = _nbeats_tensors_from_time_series(time_series, input_size, output_size)
max_ts_length = max(ts.size for ts in time_series)
expected_shape = (len(time_series), max(max_ts_length, input_size + output_size))

# One row per series, wide enough for the longest series and for one full window.
assert tuple(series.shape) == expected_shape
assert mask.shape == series.shape

# Non-zero entries must coincide with the mask.
# NOTE(review): counting `series > 0` presumes every observation is strictly positive.
observed_per_series = (series > 0).sum(1)
assert np.allclose(observed_per_series, mask.sum(1))

# Padding is zero, so the grand total equals the sum of the raw observations.
raw_total = sum(time_serie.sum() for time_serie in time_series)
assert np.allclose(series.sum(), raw_total)

In [None]:
#export
class NBeatsDataLoader:
    """Iterable yielding batches of N-BEATS inputs from a collection of series.

    The series are packed once, at construction time, into a left-padded tensor
    and a matching observation mask by `_nbeats_tensors_from_time_series`.

    * Training mode (`output_size` given): every iteration draws, for each series
      in a batch, one random cut point over the padded width and yields
      `(insample_y, insample_mask, outsample_y, outsample_mask)`, where the
      insample tensors hold the `input_size` steps before the cut and the
      outsample tensors the `output_size` steps from the cut onwards. Cut points
      are drawn over the whole padded width, so a window can overlap (or lie
      entirely in) the padding; the masks tell observations and padding apart.
    * Inference mode (`output_size is None`): yields `(y, mask)` with the last
      `input_size` steps of each series, always in the original series order
      (`shuffle` does not reorder these batches).

    Parameters
    ----------
    time_series: Sequence[np.ndarray]
        One 1-d array per series.
    input_size: int
        Number of lags in the model input.
    output_size: Optional[int]
        Forecast horizon; `None` selects inference mode.
    batch_size: int
        Maximum number of series per batch. Must be positive.
    shuffle: bool
        Whether to permute the series on every pass in training mode.
    seed: int
        Seed of the `np.random.RandomState` used for shuffling and cut points.

    Raises
    ------
    ValueError
        If `batch_size` is not positive.
    """

    def __init__(self,
                 time_series: Sequence[np.ndarray],
                 input_size: int,
                 output_size: Optional[int] = None,
                 batch_size: int = 1024,
                 shuffle: bool = False,
                 seed: int = 0):
        # Fail early: a negative step makes `range` empty (the loader would
        # silently yield nothing) and zero only fails once iteration starts.
        if batch_size <= 0:
            raise ValueError(f'batch_size must be a positive integer, got {batch_size}')
        self.series, self.mask = _nbeats_tensors_from_time_series(time_series, input_size, output_size)
        self.input_size, self.output_size = input_size, output_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.rng = np.random.RandomState(seed)

    def __iter__(self):
        n_series, max_timesteps = self.series.shape
        training = self.output_size is not None
        # The permutation is drawn even in inference mode so the generator state
        # advances exactly as before, regardless of the mode.
        if self.shuffle:
            indices = self.rng.permutation(n_series)
        else:
            indices = np.arange(n_series)
        if training:
            # Offsets of every window position relative to its cut point.
            window_offsets = np.arange(-self.input_size, self.output_size)
        for i in range(0, n_series, self.batch_size):
            if training:
                batch_indices = indices[i:i+self.batch_size].reshape(-1, 1)
                cut_points = self.rng.randint(self.input_size, max_timesteps-self.output_size+1, batch_indices.size)
                # (batch, input_size + output_size) matrix of column indices,
                # built with one broadcast instead of a Python list of slices.
                window_indices = cut_points.reshape(-1, 1) + window_offsets
                rows = torch.as_tensor(batch_indices, dtype=torch.long)
                cols = torch.as_tensor(window_indices, dtype=torch.long)
                y = self.series[rows, cols]
                mask = self.mask[rows, cols]
                insample_y, outsample_y = y[:, :self.input_size], y[:, self.input_size:]
                insample_mask, outsample_mask = mask[:, :self.input_size], mask[:, self.input_size:]
                yield insample_y, insample_mask, outsample_y, outsample_mask
            else:
                y = self.series[i : i+self.batch_size]
                mask = self.mask[i : i+self.batch_size]
                yield y, mask

In [None]:
input_size = 4
output_size = 1
batch_size = 4

for shuffle in (True, False):
    train_dl = NBeatsDataLoader(time_series, input_size=input_size, output_size=output_size,
                                batch_size=batch_size, shuffle=shuffle)
    batch = next(iter(train_dl))

    # Replay the loader's random draws with an identically seeded generator:
    # first the (optional) permutation of the series, then the cut points.
    n_series = len(time_series)
    max_timesteps = max(max(ts.size for ts in time_series), input_size + output_size)
    rng = np.random.RandomState(0)
    batch_indices = rng.permutation(n_series)[:batch_size] if shuffle else np.arange(batch_size)
    cut_points = rng.randint(input_size, max_timesteps-output_size+1, batch_size)

    # Scratch buffers holding the expected left-padded series and its mask.
    tmp_y = torch.zeros(max_timesteps)
    tmp_mask = torch.zeros(max_timesteps)

    for i, cut_point in enumerate(cut_points):
        time_serie = time_series[batch_indices[i]]
        ts_size = time_serie.size
        insample_y, insample_mask, outsample_y, outsample_mask = [tensor[i] for tensor in batch]

        # Rebuild the padded row for this series from scratch.
        tmp_y.zero_()
        tmp_mask.zero_()
        tmp_y[-ts_size:] += time_serie
        tmp_mask[-ts_size:] += 1

        # The window is split at the cut point: lags before it, horizon after it.
        insample_slice = slice(cut_point-input_size, cut_point)
        outsample_slice = slice(cut_point, cut_point+output_size)
        expected = (tmp_y[insample_slice], tmp_mask[insample_slice],
                    tmp_y[outsample_slice], tmp_mask[outsample_slice])
        assert torch.allclose(insample_y, expected[0])
        assert torch.allclose(insample_mask, expected[1])
        assert torch.allclose(outsample_y, expected[2])
        assert torch.allclose(outsample_mask, expected[3])

In [None]:
input_size = 4
batch_size = 4

for shuffle in (True, False):
    # Forward `shuffle` so both settings are really exercised (it used to be
    # dropped, running the same check twice). Without an `output_size` the
    # loader yields the series in their original order either way, which is
    # why the expected values below are read from `time_series[i]`.
    test_dl = NBeatsDataLoader(time_series, input_size=input_size, batch_size=batch_size,
                               shuffle=shuffle)
    batch = next(iter(test_dl))

    # Scratch buffers holding the expected input window and its mask.
    tmp_y = torch.zeros(input_size)
    tmp_mask = torch.zeros(input_size)

    for i in range(batch[0].shape[0]):
        tmp_y.zero_()
        tmp_mask.zero_()
        time_serie = time_series[i]
        y, mask = [tensor[i] for tensor in batch]
        # Series shorter than `input_size` are left-padded with zeros, so only
        # the trailing `n_obs` positions carry data and a non-zero mask.
        n_obs = min(time_serie.size, input_size)
        if n_obs > 0:
            tmp_y[-n_obs:] += time_serie[-n_obs:]
            tmp_mask[-n_obs:] += 1

        assert torch.allclose(y, tmp_y)
        assert torch.allclose(mask, tmp_mask)