In [None]:
# default_exp data.tsloader

# TimeSeriesLoader
> Data Loader for Time Series data

In [None]:
#hide
from nbdev import *
%load_ext autoreload
%autoreload 2

In [None]:
#export
import warnings
from collections.abc import Mapping
from typing import Dict, List, Optional, Union

import numpy as np
import torch as t
from fastcore.foundation import patch
from torch.utils.data import DataLoader

from neuralforecast.data.tsdataset import TimeSeriesDataset, WindowsDataset

## Inherited `DataLoader` from `pytorch` 

In [None]:
#export
class TimeSeriesLoader(DataLoader):
    """PyTorch `DataLoader` that collates time series items with its own
    collate function and can resample each collated batch to a fixed
    number of windows."""

    def __init__(self, dataset: Union[TimeSeriesDataset, WindowsDataset],
                 eq_batch_size: bool = False,
                 n_windows: Optional[int] = None,
                 **kwargs) -> None:
        """Wraps the pytorch `DataLoader` with a special collate function
        for the `TimeSeriesDataset` outputs.

        The TimeSeriesDataset constructs all the trainable windows
        of `batch_size` series. The number of windows can be greater
        or smaller than the `batch_size`. For this reason,
        an additional boolean parameter, `eq_batch_size` is included
        that if `True` samples `batch_size` windows randomly,
        while `False` returns all windows (unless `n_windows` is set).

        Windows are sampled with replacement only when fewer windows are
        available than the number requested.

        Parameters
        ----------
        dataset: TimeSeriesDataset or WindowsDataset
            Stored time series.
        eq_batch_size: bool
            If `True` samples `batch_size` windows randomly,
            while `False` or `batch_size=None` returns all windows.
            Takes precedence over `n_windows`.
        n_windows: int
            Number of windows to sample after
            batching batch_size series. Only used when
            `eq_batch_size` is `False`.
        **kwargs
            Forwarded unchanged to `torch.utils.data.DataLoader`
            (e.g. `batch_size`, `shuffle`). A `collate_fn` entry, if
            present, is always replaced by this loader's `_collate_fn`.
        """
        # This loader relies on its own collate function, so whatever
        # `collate_fn` the caller supplied is overridden without a warning.
        kwargs['collate_fn'] = self._collate_fn
        super().__init__(dataset=dataset, **kwargs)
        self.eq_batch_size = eq_batch_size
        self.n_windows = n_windows
        # Window indices drawn for the batch being collated; `None` means
        # "keep every window".
        self.w_idxs: Optional[np.ndarray] = None

In [None]:
#export
@patch
def _check_batch_size(self: TimeSeriesLoader, batch: t.Tensor) -> t.Tensor:
    """Return `batch` restricted to the currently sampled window indices.

    When no indices have been sampled (`self.w_idxs` is `None`) the batch
    is handed back untouched.
    """
    if self.w_idxs is None:
        return batch
    return batch[self.w_idxs]

In [None]:
#export
@patch
def _collate_fn(self: TimeSeriesLoader, batch: Union[List, Dict[str, t.Tensor], t.Tensor]):
    """Collate a list of dataset items into a single batch.

    A list of tensors is concatenated and then filtered through
    `_check_batch_size`. A list of mappings first draws the window indices
    to keep (based on the total size of the `'Y'` entries) and then
    collates every key separately through `self.collate_fn`.

    Notes
    -----
    [1] Adapted from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py.
    """
    first = batch[0]

    if isinstance(first, t.Tensor):
        shared_out = None
        if t.utils.data.get_worker_info() is not None:
            # Inside a worker process: concatenate straight into a shared
            # memory tensor so the result is not copied a second time.
            total_elems = sum(item.numel() for item in batch)
            shared_storage = first.storage()._new_shared(total_elems)
            shared_out = first.new(shared_storage)
        return self._check_batch_size(t.cat(batch, out=shared_out))

    if isinstance(first, Mapping):
        available = sum(sample['Y'].size(0) for sample in batch)
        # `eq_batch_size` asks for `batch_size` windows, otherwise
        # `n_windows` is used; `None` leaves `self.w_idxs` as it was.
        requested = self.batch_size if self.eq_batch_size else self.n_windows
        if requested is not None:
            # Replacement is only needed when there are too few windows.
            self.w_idxs = np.random.choice(available, size=requested,
                                           replace=(available < requested))
        return {key: self.collate_fn([sample[key] for sample in batch]) for key in first}

    raise TypeError(f'Unknown {type(first)}')

### Tests WindowsDataset and TimeSeriesDataset

In [None]:
from neuralforecast.data.utils import create_synthetic_tsdata

In [None]:
def test_eq_batch_size(dataset, batch_size, loader_class):
    # Check returns batch_size tensors
    loader = loader_class(dataset=dataset, batch_size=batch_size, eq_batch_size=True)
    sizes = [batch['Y'].size(0) == batch_size for batch in loader]
    
    assert all(sizes), 'Unexpected batch sizes.'

In [None]:
def test_n_windows(dataset, batch_size, n_windows, loader_class):
    # Check returns batch_size tensors
    loader = loader_class(dataset=dataset, batch_size=batch_size, eq_batch_size=False, n_windows=n_windows)
    sizes = [batch['Y'].size(0) == n_windows for batch in loader]
    
    assert all(sizes), 'Unexpected n_window sizes.'

In [None]:
def test_eq_batch_size_order(dataset, batch_size, loader_class):
    #This test only works for TimeSeriesDataset class
    loader = loader_class(dataset=dataset, batch_size=batch_size, eq_batch_size=True)
    for batch in loader:
        idxs = batch['idxs']
        dataset_batch = dataset[idxs.numpy().tolist()]
        for key in batch.keys():
            assert t.equal(batch[key], dataset_batch[key]), (
                f'Batch and dataset batch differ, key {key}'
            )

#### Complete timeseries dataset

In [None]:
# Build a synthetic dataset and wrap it in a shuffling TimeSeriesLoader
# that keeps every window (eq_batch_size=False, no n_windows).
Y_df, X_df, S_df = create_synthetic_tsdata(sort=True)
dataset = TimeSeriesDataset(S_df=S_df, Y_df=Y_df, X_df=X_df,
                            input_size=5,
                            output_size=2)
dataloader = TimeSeriesLoader(dataset=dataset, batch_size=12, 
                              eq_batch_size=False, shuffle=True)

# Smoke test: consume every batch once to check that collation runs
# without raising.
for batch in dataloader:
    batch

In [None]:
# With eq_batch_size=True, the 'Y' of every batch must have exactly 32 rows.
test_eq_batch_size(dataset, 32, TimeSeriesLoader)

In [None]:
# Every batch must equal the dataset indexed at the batch's own 'idxs'.
test_eq_batch_size_order(dataset, 32, TimeSeriesLoader)

#### Windowed timeseries dataset

In [None]:
# Same synthetic data, now as a WindowsDataset; the loader is asked to
# sample 1024 windows per batch (n_windows with eq_batch_size=False).
Y_df, X_df, S_df = create_synthetic_tsdata(sort=True)
dataset = WindowsDataset(S_df=S_df, Y_df=Y_df, X_df=X_df,
                         input_size=5,
                         output_size=2,
                         sample_freq=1,
                         complete_windows=False)
dataloader = TimeSeriesLoader(dataset=dataset, batch_size=12, 
                              eq_batch_size=False, shuffle=True,
                              n_windows=1024)

# Smoke test: consume every batch once to check that collation runs
# without raising.
for batch in dataloader:
    batch

In [None]:
# With eq_batch_size=True, the 'Y' of every batch must have exactly 32 rows.
test_eq_batch_size(dataset, 32, TimeSeriesLoader)

In [None]:
# With n_windows=1024, the 'Y' of every batch must have exactly 1024 rows.
test_n_windows(dataset, 32, 1024, TimeSeriesLoader)

## Faster implementation

In [None]:
#export
class FastTimeSeriesLoader:
    """
    A DataLoader-like object for a set of tensors that can be much faster than
    TensorDataset + DataLoader because dataloader grabs individual indices of
    the dataset and calls cat (slow).
    Source: https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6
    
    Notes
    -----
    [1] Adapted from https://github.com/hcarlens/pytorch-tabular/blob/master/fast_tensor_data_loader.py.
    """
    def __init__(self, dataset: Union[TimeSeriesDataset, WindowsDataset],
                 batch_size: int = 32,
                 eq_batch_size: bool = False,
                 n_windows: Optional[int] = None,
                 shuffle: bool = False) -> None:
        """Initialize a FastTimeSeriesLoader.
        
        The TimeSeriesDataset constructs all the trainable windows 
        of `batch_size` series. The number of windows can be greater 
        or smaller than the `batch_size`. For this reason, 
        an additional boolean parameter, `eq_batch_size` is included 
        that if `True` samples `batch_size` windows randomly, 
        while `False` returns all windows (unless `n_windows` is set).
        
        Parameters
        -----------
        dataset: TimeSeriesDataset or WindowsDataset
            Stored time series.
        batch_size: int
            Number of dataset items fetched per batch. Must be positive.
        eq_batch_size: bool
            If `True` samples `batch_size` windows randomly from each
            batch. Takes precedence over `n_windows`.
        n_windows: int
            Number of windows to sample after
            batching batch_size series. Only used when
            `eq_batch_size` is `False`.
        shuffle: bool 
            If `True`, a new random permutation of the item indices is
            drawn whenever an iterator is created out of this object.
            The dataset itself is not modified.

        Raises
        ------
        ValueError
            If `batch_size` is `None` or smaller than 1.
        """
        # A zero batch size breaks the batch count and a negative one makes
        # the iteration cursor move backwards forever, so reject both early.
        if batch_size is None or batch_size < 1:
            raise ValueError(
                f'`batch_size` must be a positive integer, got {batch_size!r}.'
            )

        self.dataset = dataset
        self.dataset_len = len(dataset)
        self.batch_size = batch_size
        self.eq_batch_size = eq_batch_size
        self.n_windows = n_windows
        self.shuffle = shuffle
        self.idxs = np.arange(self.dataset_len)
        # Iteration cursor. Initialised here so that `next()` also works
        # before the first `iter()` call.
        self.i = 0

        # Ceiling division: a trailing partial batch counts as one batch.
        self.n_batches = -(-self.dataset_len // self.batch_size)
        # Window indices drawn for the current batch; `None` means
        # "keep every window".
        self.w_idxs: Optional[np.ndarray] = None

In [None]:
#export
@patch
def __iter__(self: FastTimeSeriesLoader):
    if self.shuffle:
        self.idxs = np.random.permutation(self.dataset_len)

    self.i = 0
    return self

In [None]:
#export
@patch
def _check_batch_size(self: FastTimeSeriesLoader, batch: t.Tensor):
    complete_batch = batch
    if self.w_idxs is not None:
        complete_batch = batch[self.w_idxs]
    return complete_batch

In [None]:
#export
@patch
def __next__(self: FastTimeSeriesLoader):
    if self.i >= self.dataset_len:
        raise StopIteration
    idxs = self.idxs[self.i:(self.i + self.batch_size)].tolist()
    batch = self.dataset[idxs]
    self.i += self.batch_size
    
    n_windows = batch['Y'].size(0)
    if self.eq_batch_size and self.batch_size is not None:
        self.w_idxs = np.random.choice(n_windows, size=self.batch_size, 
                                       replace=(n_windows < self.batch_size))
    
    if not self.eq_batch_size and self.n_windows is not None:
        self.w_idxs = np.random.choice(n_windows, size=self.n_windows, 
                                       replace=(n_windows < self.n_windows))
    
    return {key: self._check_batch_size(batch[key]) for key in batch}

In [None]:
#export
@patch
def __len__(self: FastTimeSeriesLoader):
    return self.n_batches

### Tests WindowsDataset and TimeSeriesDataset

#### Complete timeseries dataset

In [None]:
# Build a synthetic dataset and wrap it in a shuffling FastTimeSeriesLoader
# that keeps every window (eq_batch_size=False, no n_windows).
Y_df, X_df, S_df = create_synthetic_tsdata(sort=True)
dataset = TimeSeriesDataset(S_df=S_df, Y_df=Y_df, X_df=X_df,
                            input_size=5,
                            output_size=2)
dataloader = FastTimeSeriesLoader(dataset=dataset, batch_size=12, 
                                  eq_batch_size=False, shuffle=True)

# Smoke test: consume every batch once to check that batching runs
# without raising.
for batch in dataloader:
    batch

In [None]:
# With eq_batch_size=True, the 'Y' of every batch must have exactly 32 rows.
test_eq_batch_size(dataset, 32, FastTimeSeriesLoader)

In [None]:
# Every batch must equal the dataset indexed at the batch's own 'idxs'.
test_eq_batch_size_order(dataset, 32, FastTimeSeriesLoader)

#### Windowed timeseries dataset

In [None]:
# Same synthetic data, now as a WindowsDataset; the loader is asked to
# sample 1024 windows per batch (n_windows with eq_batch_size=False).
Y_df, X_df, S_df = create_synthetic_tsdata(sort=True)
dataset = WindowsDataset(S_df=S_df, Y_df=Y_df, X_df=X_df,
                         input_size=5,
                         output_size=2,
                         sample_freq=1,
                         complete_windows=False)
dataloader = FastTimeSeriesLoader(dataset=dataset, batch_size=12, 
                                  eq_batch_size=False, shuffle=True,
                                  n_windows=1024)

In [None]:
# Smoke test: consume every batch once to check that batching runs
# without raising.
for batch in dataloader:
    batch

In [None]:
# With eq_batch_size=True, the 'Y' of every batch must have exactly 32 rows.
test_eq_batch_size(dataset, 32, FastTimeSeriesLoader)

In [None]:
# With n_windows=1024, the 'Y' of every batch must have exactly 1024 rows.
test_n_windows(dataset, 32, 1024, FastTimeSeriesLoader)

### Performance comparison

In [None]:
# Both loaders get the same dataset and settings so that the timings
# below compare like with like.
dataloader = TimeSeriesLoader(dataset=dataset, batch_size=12, n_windows=1024, shuffle=True)
fast_dataloader = FastTimeSeriesLoader(dataset=dataset, batch_size=12, n_windows=1024, shuffle=True)

In [None]:
%timeit -n 50 -r 3  [batch for batch in dataloader]

10.5 ms ± 199 µs per loop (mean ± std. dev. of 3 runs, 50 loops each)


In [None]:
%timeit -n 50 -r 3 [batch for batch in fast_dataloader]

2.45 ms ± 105 µs per loop (mean ± std. dev. of 3 runs, 50 loops each)
