In [1]:
# default_exp data.ts_loader_pinche_on

In [2]:
#hide
%load_ext autoreload
%autoreload 2

In [6]:
#export
import numpy as np
import pandas as pd
import random
import torch as t
import copy
from fastcore.foundation import patch
from nixtla.data.ts_dataset import TimeSeriesDataset
from collections import defaultdict

In [29]:
#export
class TimeSeriesLoader(object):
    """Endless batch generator that cuts fixed-size windows out of a time-series dataset.

    Every batch is a ``defaultdict`` of numpy arrays with the keys
    ``insample_y``, ``insample_x_t``, ``insample_mask``, ``outsample_y``,
    ``outsample_x_t``, ``outsample_mask`` and ``x_s``; the per-series samples
    are stacked along a new first axis.

    ``ts_dataset.ts_tensor`` is indexed as ``[series, channel, time]`` and the
    position of the ``'outsample_mask'`` channel is looked up in
    ``ts_dataset.t_cols``.

    The loader has two modes, toggled with ``train()`` / ``eval()``:

    * train (default): ``batch_size`` series are drawn at random (with
      replacement) and each one is cut at a random admissible time index.
    * eval: every series is visited once per batch, in order, and cut at
      ``max(max_len - offset, input_size)``.
    """

    def __init__(self,
                 ts_dataset: 'TimeSeriesDataset',
                 model: str,
                 offset: int,
                 window_sampling_limit: int,
                 input_size: int,
                 output_size: int,
                 idx_to_sample_freq: int,  # TODO: not active yet
                 batch_size: int,
                 train_loader: bool):
        """Store the configuration and precompute the admissible cut points.

        Parameters
        ----------
        ts_dataset:
            Dataset the windows are read from. It is kept by reference, not copied.
        model:
            Name of the batch layout to build. Only ``'nbeats'`` is implemented.
        offset:
            Number of time steps at the end of the tensor that are held out.
            Must be smaller than ``ts_dataset.max_len``.
        window_sampling_limit:
            Cut points are only taken from the last ``window_sampling_limit``
            time steps before the offset.
        input_size:
            Length of the insample (lookback) window.
        output_size:
            Length of the outsample (forecast) window.
        idx_to_sample_freq:
            Stored but not used yet.
        batch_size:
            Number of windows per batch in train mode.
        train_loader:
            ``True`` to use the time steps flagged in ``ts_dataset.ts_train_mask``,
            ``False`` to use the complementary (validation) time steps.
        """
        self.ts_dataset = ts_dataset  # Kept by reference
        self.model = model
        self.offset = offset
        self.window_sampling_limit = window_sampling_limit
        self.input_size = input_size
        self.output_size = output_size
        self.idx_to_sample_freq = idx_to_sample_freq
        self.batch_size = batch_size
        self.train_loader = train_loader
        self._is_train = True

        assert offset < self.ts_dataset.max_len, 'Offset must be smaller than max_len'

        self.window_sampling_idx = self._update_windows_sampling_idx()

    def _update_windows_sampling_idx(self):
        """Return, for every series, the list of time indices usable as cut points.

        A time index qualifies when it lies in
        ``[max(max_len - offset - window_sampling_limit, 0), max_len - offset)``,
        its ``outsample_mask`` channel is positive, and it belongs to the split
        selected by ``train_loader``.
        """
        # Restrict the masks to the sampling range defined by offset and window_sampling_limit
        last_ds = self.ts_dataset.max_len - self.offset
        first_ds = max(last_ds - self.window_sampling_limit, 0)
        outsample_mask_col = self.ts_dataset.t_cols.index('outsample_mask')
        filtered_outsample_mask = self.ts_dataset.ts_tensor[:, outsample_mask_col, first_ds:last_ds]
        filtered_ts_train_mask = self.ts_dataset.ts_train_mask[first_ds:last_ds]

        # Keep the train time steps or their complement (validation)
        if self.train_loader:
            split_mask = filtered_outsample_mask * filtered_ts_train_mask
        else:
            split_mask = filtered_outsample_mask * (1 - filtered_ts_train_mask)
        indices = np.argwhere(split_mask > 0)

        # Positions are relative to the filtered slice; shift them back to global time
        indices[:, 1] += first_ds

        # One list of admissible cut points per series
        return [list(indices[indices[:, 0] == i][:, 1])
                for i in range(self.ts_dataset.n_series)]

    def __iter__(self):
        """Yield batches forever; the consumer decides when to stop."""
        while True:
            if self._is_train:
                sampled_ts_indices = np.random.randint(self.ts_dataset.n_series, size=self.batch_size)
            else:
                sampled_ts_indices = range(self.ts_dataset.n_series)

            # Collect the per-series samples key by key ...
            batch_dict = defaultdict(list)
            for index in sampled_ts_indices:
                batch_i = self.__get_item__(index)
                for key in batch_i:
                    batch_dict[key].append(batch_i[key])

            # ... and stack each key into a single array
            batch = defaultdict(list)
            for key, values in batch_dict.items():
                batch[key] = np.stack(values)

            yield batch

    def __get_item__(self, index):
        """Build the sample of series ``index`` in the layout required by ``self.model``.

        Raises
        ------
        AssertionError
            If ``self.model`` is ``'esrnn'`` (not implemented) or unknown.
        """
        if self.model == 'nbeats':
            return self._nbeats_batch(index)
        # Raised explicitly instead of through `assert` so the check is not
        # stripped under `python -O`; the exception type is kept for callers.
        if self.model == 'esrnn':
            raise AssertionError('hacer esrnn')
        raise AssertionError('error')

    def _nbeats_batch(self, index):
        """Cut one insample/outsample window pair out of series ``index``.

        Returns a dict with ``insample_y`` / ``outsample_y`` (first remaining
        channel), ``insample_x_t`` / ``outsample_x_t`` (the other remaining
        channels), the two masks and the static features ``x_s``. Windows
        shorter than ``input_size`` / ``output_size`` are zero-padded: the
        insample on the left, the outsample on the right.
        """
        insample = np.zeros((self.ts_dataset.n_channels-1, self.input_size), dtype=float)
        insample_mask = np.zeros(self.input_size)
        outsample = np.zeros((self.ts_dataset.n_channels-1, self.output_size), dtype=float)
        outsample_mask = np.zeros(self.output_size)

        ts = self.ts_dataset.ts_tensor[index]
        len_ts = self.ts_dataset.len_series[index]
        # First usable time step: start of the series or start of the
        # sampling range, whichever comes later. TODO: precompute in loader
        init_ts = max(self.ts_dataset.max_len-len_ts,
                      self.ts_dataset.max_len-self.offset-self.window_sampling_limit)

        assert self.ts_dataset.max_len-self.offset > init_ts, f'Offset too big for serie {index}'
        if self._is_train:
            # Sample among the cut points available for series "index"
            cut_point = np.random.choice(self.window_sampling_idx[index], 1)[0]
        else:
            cut_point = max(self.ts_dataset.max_len-self.offset, self.input_size)

        # Drop the last two channels (masks, per the original note) and then row 0.
        # NOTE(review): the original note says row 0 is the outsample Y; the
        # channel layout is not visible here, confirm against the dataset.
        insample_window = ts[:-2, max(0, cut_point - self.input_size):cut_point]
        insample_window = np.delete(insample_window, 0, 0)
        n_insample = insample_window.shape[1]
        # Guard the zero case: `x[-0:]` is the WHOLE array, not an empty slice
        if n_insample > 0:
            insample[:, -n_insample:] = insample_window
        # Fewer than input_size valid steps when cut_point is close to init_ts
        # (series are padded); none at all when cut_point <= init_ts.
        n_insample_valid = min(self.input_size, cut_point - init_ts)
        if n_insample_valid > 0:
            insample_mask[-n_insample_valid:] = 1.0

        # Training targets may not reach into the held-out offset region
        if self._is_train:
            last_available = self.ts_dataset.max_len - self.offset
        else:
            last_available = self.ts_dataset.max_len
        # Drop the last two channels and then row 1.
        # NOTE(review): a different row than for the insample window; confirm intended.
        outsample_window = ts[:-2, cut_point:min(last_available, cut_point + self.output_size)]
        outsample_window = np.delete(outsample_window, 1, 0)

        # Available target steps are valid only if they belong to this loader's split
        n_outsample = outsample_window.shape[1]
        outsample[:, :n_outsample] = outsample_window
        split_mask = self.ts_dataset.ts_train_mask[cut_point:(cut_point+n_outsample)]
        if not self.train_loader:
            split_mask = 1 - split_mask
        outsample_mask[:n_outsample] = split_mask

        insample_y = insample[0, :]
        insample_x_t = insample[1:, :]

        outsample_y = outsample[0, :]
        outsample_x_t = outsample[1:, :]

        x_s = self.ts_dataset.x_s[index, :]

        sample = {'insample_y':insample_y, 'insample_x_t':insample_x_t, 'insample_mask':insample_mask,
                  'outsample_y':outsample_y, 'outsample_x_t':outsample_x_t, 'outsample_mask':outsample_mask,
                  'x_s':x_s}

        return sample

    def update_offset(self, offset):
        """Change the held-out offset and rebuild the admissible cut points."""
        if offset == self.offset:
            return # Avoid extra computation
        self.offset = offset
        # The cut points depend on the offset; without this refresh the loader
        # would keep sampling from the range computed for the previous offset.
        self.window_sampling_idx = self._update_windows_sampling_idx()

    def get_meta_data_var(self, var):
        """Forward the meta-data lookup for ``var`` to the underlying dataset."""
        return self.ts_dataset.get_meta_data_var(var)

    def get_n_variables(self):
        """Return ``(ts_dataset.n_x_t - 1, ts_dataset.n_s_t)``."""
        return self.ts_dataset.n_x_t-1, self.ts_dataset.n_s_t

    def get_n_series(self):
        """Return the number of series in the dataset."""
        return self.ts_dataset.n_series

    def get_max_len(self):
        """Return the length of the time axis of the dataset tensor."""
        return self.ts_dataset.max_len

    def get_n_channels(self):
        """Return the number of channels in each window (``ts_dataset.n_channels - 1``)."""
        return self.ts_dataset.n_channels-1

    def get_frequency(self):
        """Return the frequency stored in the dataset."""
        return self.ts_dataset.frequency

    def train(self):
        """Switch to train mode: random series and random cut points."""
        self._is_train = True

    def eval(self):
        """Switch to eval mode: every series once, cut at the offset."""
        self._is_train = False

In [30]:
# Demo: load the first EPF group from the local 'data' directory and display
# its time-series tensor, the array TimeSeriesLoader cuts its windows from.
# The import stays in this cell so it is not part of the exported module.
from nixtla.data.datasets.epf import EPF, EPFInfo
dataset = EPF.load(directory='data', group=EPFInfo.groups[0])
dataset.ts_tensor

Processing dataframes ...
Creating ts tensor ...


array([[[3.1050e+01, 3.0470e+01, 2.8920e+01, ..., 2.6820e+01,
         2.6650e+01, 2.5680e+01],
        [4.2497e+04, 4.1463e+04, 4.0812e+04, ..., 4.5471e+04,
         4.4386e+04, 4.3017e+04],
        [2.7980e+03, 2.4170e+03, 2.0360e+03, ..., 2.1290e+03,
         1.8270e+03, 1.6890e+03],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00,
         0.0000e+00, 0.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e+00, ..., 1.0000e+00,
         1.0000e+00, 1.0000e+00],
        [0.0000e+00, 1.0000e+00, 1.0000e+00, ..., 1.0000e+00,
         1.0000e+00, 1.0000e+00]]])

In [31]:
# Training loader over the demo dataset: 8-step lookback, 4-step horizon,
# 1024 windows per batch, cut points limited to the last 1000 time steps.
ts_loader = TimeSeriesLoader(
    ts_dataset=dataset,
    model='nbeats',
    offset=0,
    window_sampling_limit=1000,
    input_size=8,
    output_size=4,
    idx_to_sample_freq=1,
    batch_size=1024,
    train_loader=True,
)

In [32]:
# Draw a single batch (the loader iterates forever) and unpack its arrays.
dataloader = iter(ts_loader)
batch = next(dataloader)
(insample_y, insample_x_t, insample_mask,
 outsample_x_t, outsample_y, outsample_mask) = (
    batch[name] for name in ('insample_y', 'insample_x_t', 'insample_mask',
                             'outsample_x_t', 'outsample_y', 'outsample_mask')
)

In [33]:
# Lookback targets of the batch, one row per sampled series index.
# NOTE(review): the recorded values (~5e4) are on a different scale than
# outsample_y (~3e1); confirm which ts_tensor channel the loader keeps as insample_y.
insample_y

array([[48120., 52172., 52840., ..., 51408., 51353., 51506.],
       [57691., 56917., 56719., ..., 56336., 56412., 56555.],
       [55610., 56645., 57669., ..., 55793., 54120., 52557.],
       ...,
       [49373., 49872., 51044., ..., 52385., 51018., 49533.],
       [58271., 57240., 55642., ..., 49097., 47188., 46064.],
       [53185., 54105., 53209., ..., 52588., 52731., 52750.]])

In [34]:
# Forecast targets of the batch, one row per sampled series index.
outsample_y

array([[41.21, 41.69, 42.88, 42.87],
       [38.95, 39.64, 39.41, 37.91],
       [32.58, 31.23, 30.97, 30.19],
       ...,
       [31.45, 30.99, 30.18, 29.94],
       [30.21, 30.64, 31.12, 32.3 ],
       [33.05, 33.38, 33.6 , 32.98]])

In [27]:
# Mask over the forecast horizon of each sampled window; zero where the target
# step is missing or excluded by the train mask.
outsample_mask

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       ...,
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [28]:
# Mean of the mask over the whole batch, i.e. the share of kept target steps
# (assuming ts_train_mask holds 0/1 values; confirm against the dataset).
outsample_mask.mean()

0.9990234375