In [1]:
# default_exp data.tsloader_pinche

In [2]:
#hide
%load_ext autoreload
%autoreload 2

In [3]:
#export
import numpy as np
import pandas as pd
import random
import torch as t
import copy
from fastcore.foundation import patch
from nixtla.data.tsdataset import TimeSeriesDataset
from collections import defaultdict

In [4]:
#export
#TODO: figure out how to send batches per epoch (the original note ends mid-sentence)
class TimeSeriesLoader(object):
    """Endless batch generator that samples fixed-size windows from a time series dataset.

    Every batch is a dict of tensors with the keys ``s_matrix``, ``insample_y``,
    ``insample_x``, ``insample_mask``, ``outsample_y``, ``outsample_x`` and
    ``outsample_mask``. In train mode (see ``train()``/``eval()``) the series and
    the cut points are sampled at random; in eval mode every series is returned
    once per batch with the cut point placed at ``max_len - offset``.
    """

    def __init__(self,
                 ts_dataset: 'TimeSeriesDataset',
                 model: str,
                 offset: int,
                 window_sampling_limit: int,
                 input_size: int,
                 output_size: int,
                 idx_to_sample_freq: int,
                 batch_size: int,
                 is_train_loader: bool):
        """
        Parameters
        ----------
        ts_dataset: TimeSeriesDataset
            Dataset to sample from. This class reads its ``ts_tensor``,
            ``ts_train_mask``, ``t_cols``, ``max_len``, ``n_series``,
            ``n_channels``, ``len_series`` and ``s_matrix`` attributes.
        model: str
            Batch format to produce. Only ``'nbeats'`` is implemented.
        offset: int
            Number of trailing time steps (counted back from ``max_len``)
            excluded from window sampling. Must be smaller than ``max_len``.
        window_sampling_limit: int
            Maximum number of time steps before ``max_len - offset`` from which
            cut points may be drawn.
        input_size: int
            Length of the insample (input) window.
        output_size: int
            Length of the outsample (forecast) window.
        idx_to_sample_freq: int
            Stored on the instance; not used by the code in this class.
        batch_size: int
            Number of series drawn (with replacement) per batch in train mode.
        is_train_loader: bool
            If True, cut points come from positions where ``ts_train_mask`` is
            positive; otherwise from positions where ``1 - ts_train_mask`` is.
        """
        # Dataloader attributes
        self.model = model
        self.window_sampling_limit = window_sampling_limit
        self.input_size = input_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.idx_to_sample_freq = idx_to_sample_freq
        self.offset = offset
        self.ts_dataset = ts_dataset
        self.t_cols = self.ts_dataset.t_cols
        self.is_train_loader = is_train_loader
        self._is_train = True

        # Dataloader protections
        assert offset < self.ts_dataset.max_len, \
            f'Offset {offset} must be smaller than max_len {self.ts_dataset.max_len}'

        self.window_sampling_idx = self._update_windows_sampling_idx()

        # TODO: report the shapes of the dataset tensors here.

    def _update_windows_sampling_idx(self):
        """Return, for every series, the list of time positions usable as cut points.

        Only positions inside ``[max_len - offset - window_sampling_limit,
        max_len - offset)`` are considered, and only where the series'
        ``outsample_mask`` channel and the train (or validation) mask are both
        positive. Positions are expressed in global (unfiltered) coordinates.
        """
        # Restrict the masks to the sampling range defined by offset and window_sampling_limit
        last_ds = self.ts_dataset.max_len - self.offset
        first_ds = max(last_ds - self.window_sampling_limit, 0)
        outsample_channel = self.ts_dataset.t_cols.index('outsample_mask')
        filtered_outsample_mask = self.ts_dataset.ts_tensor[:, outsample_channel, first_ds:last_ds]
        filtered_ts_train_mask = self.ts_dataset.ts_train_mask[first_ds:last_ds]

        # Train loaders sample where the train mask is on, validation loaders where it is off
        if self.is_train_loader:
            sampling_mask = filtered_outsample_mask * filtered_ts_train_mask
        else:
            sampling_mask = filtered_outsample_mask * (1 - filtered_ts_train_mask)
        indices = np.argwhere(sampling_mask > 0)

        # Shift positions relative to the filtered range back to global positions
        indices[:, 1] += first_ds

        # One list of cut points per series (column 0 is the series, column 1 the position)
        return [list(indices[indices[:, 0] == i, 1])
                for i in range(self.ts_dataset.n_series)]

    def __iter__(self):
        """Yield batches forever.

        In train mode ``batch_size`` series indices are drawn uniformly with
        replacement; in eval mode each series appears exactly once per batch.
        The generator never terminates, so the consumer decides when to stop.
        """
        while True:
            if self._is_train:
                sampled_ts_indices = np.random.randint(self.ts_dataset.n_series, size=self.batch_size)
            else:
                sampled_ts_indices = range(self.ts_dataset.n_series)

            # Collect the per-series samples key by key ...
            batch_dict = defaultdict(list)
            for index in sampled_ts_indices:
                batch_i = self.__get_item__(index)
                for key, value in batch_i.items():
                    batch_dict[key].append(value)

            # ... and stack every key into a single tensor with the batch as first axis
            batch = defaultdict(list)
            for key, values in batch_dict.items():
                batch[key] = t.Tensor(np.stack(values))

            yield batch

    def __get_item__(self, index):
        """Build the sample for series ``index`` in the format required by ``self.model``.

        Raises
        ------
        AssertionError
            If ``self.model`` is not ``'nbeats'``. Raised explicitly (instead of
            through an ``assert`` statement) so the check survives ``python -O``.
        """
        if self.model == 'nbeats':
            return self._nbeats_batch(index)
        elif self.model == 'esrnn':
            # ESRNN batches are not implemented yet
            raise AssertionError('hacer esrnn')
        else:
            raise AssertionError('error')

    def _nbeats_batch(self, index):
        """Build one insample/outsample window pair for series ``index``.

        Windows shorter than ``input_size``/``output_size`` are zero padded
        (insample on the left, outsample on the right) and the padding is
        flagged with zeros in the corresponding mask.

        Returns
        -------
        dict with numpy arrays under the keys ``s_matrix``, ``insample_y``,
        ``insample_x``, ``insample_mask``, ``outsample_y``, ``outsample_x`` and
        ``outsample_mask``.
        """
        # The dataset channels are y, the X columns, insample_mask and outsample_mask;
        # the two mask channels are not part of the sampled windows.
        n_features = self.ts_dataset.n_channels - 2
        insample = np.zeros((n_features, self.input_size), dtype=float)
        insample_mask = np.zeros(self.input_size)
        outsample = np.zeros((n_features, self.output_size), dtype=float)
        outsample_mask = np.zeros(self.output_size)

        ts = self.ts_dataset.ts_tensor[index]
        len_ts = self.ts_dataset.len_series[index]
        max_len = self.ts_dataset.max_len
        last_ds = max_len - self.offset
        # First usable position of the series. TODO: precompute in the loader
        init_ts = max(max_len - len_ts, last_ds - self.window_sampling_limit)

        assert last_ds > init_ts, f'Offset too big for serie {index}'
        if self._is_train:
            cut_point = np.random.choice(self.window_sampling_idx[index],1)[0] # Sampling from available cuts for ts "index"
        else:
            cut_point = max(last_ds, self.input_size)

        # Channels before 'insample_mask' are y and the exogenous variables
        mask_channel = self.t_cols.index('insample_mask')

        insample_window = ts[:mask_channel, max(0, cut_point - self.input_size):cut_point]
        n_insample = insample_window.shape[1]
        # Guard against an empty window: `-0:` would address the whole array
        if n_insample > 0:
            insample[:, -n_insample:] = insample_window

        # Number of real observations in the window; the cut point can be close
        # to init_ts because series are padded. Clamped at 0 for the same
        # `-0:` reason as above.
        n_observed = max(0, min(self.input_size, cut_point - init_ts))
        if n_observed > 0:
            insample_mask[-n_observed:] = 1.0

        # In train mode the outsample window may not cross the offset boundary;
        # in eval mode it may run up to the end of the series.
        outsample_end = last_ds if self._is_train else max_len
        outsample_window = ts[:mask_channel, cut_point:min(outsample_end, cut_point + self.output_size)]
        n_outsample = outsample_window.shape[1]

        outsample[:, :n_outsample] = outsample_window
        # Available outsample positions are weighted by the dataset train mask
        outsample_mask[:n_outsample] = self.ts_dataset.ts_train_mask[cut_point:(cut_point + n_outsample)]

        insample_y = insample[self.t_cols.index('y'), :]
        insample_x = insample[1:, :]

        outsample_y = outsample[self.t_cols.index('y'), :]
        outsample_x = outsample[1:, :]

        s_matrix = self.ts_dataset.s_matrix[index, :]

        batch = {'s_matrix': s_matrix,
                 'insample_y': insample_y, 'insample_x':insample_x, 'insample_mask':insample_mask,
                 'outsample_y': outsample_y, 'outsample_x':outsample_x, 'outsample_mask':outsample_mask}
        return batch

    def update_offset(self, offset):
        """Change the offset and rebuild the sampling indices that depend on it.

        Does nothing when ``offset`` equals the current value. Like the
        constructor, asserts that ``offset`` is smaller than ``max_len``.
        """
        if offset == self.offset:
            return # Avoid extra computation
        assert offset < self.ts_dataset.max_len, \
            f'Offset {offset} must be smaller than max_len {self.ts_dataset.max_len}'
        self.offset = offset
        self.window_sampling_idx = self._update_windows_sampling_idx()

    def get_meta_data_col(self, col):
        """Delegate to ``ts_dataset.get_meta_data_col(col)``."""
        return self.ts_dataset.get_meta_data_col(col)

    def get_n_variables(self):
        """Return the dataset's ``(n_x, n_s)`` pair."""
        return self.ts_dataset.n_x, self.ts_dataset.n_s

    def get_n_series(self):
        """Return the dataset's ``n_series``."""
        return self.ts_dataset.n_series

    def get_max_len(self):
        """Return the dataset's ``max_len``."""
        return self.ts_dataset.max_len

    def get_n_channels(self):
        """Return the dataset's ``n_channels``."""
        return self.ts_dataset.n_channels

    def get_X_cols(self):
        """Return the dataset's ``X_cols``."""
        return self.ts_dataset.X_cols

    def get_frequency(self):
        """Return the dataset's ``frequency``."""
        return self.ts_dataset.frequency

    def train(self):
        """Switch to train mode: random series and random cut points."""
        self._is_train = True

    def eval(self):
        """Switch to eval mode: every series once, cut point at ``max_len - offset``."""
        self._is_train = False

In [5]:
from nixtla.data.datasets.epf import EPF, EPFInfo
# Load the first group listed in EPFInfo from the local 'data' directory.
Y_df, X_df = EPF.load(group=EPFInfo.groups[0], directory='data')

# Report which group was loaded and the shape of each returned frame.
print("EPFInfo.groups[0]", EPFInfo.groups[0])
for frame_label, frame in (("Y_df.shape", Y_df), ("X_df.shape", X_df)):
    print(frame_label, frame.shape)

EPFInfo.groups[0] NP
Y_df.shape (34944, 3)
X_df.shape (34944, 11)


In [6]:
# Number of trailing rows held out of training: 365 * 24
# (presumably one year of hourly observations; confirm against the EPF data frequency).
N_HOLDOUT_STEPS = 365 * 24

# 1 marks rows available for training, 0 marks the held-out tail.
train_outsample_mask = np.ones(len(Y_df))
train_outsample_mask[-N_HOLDOUT_STEPS:] = 0
# ndarray.sum() is vectorised; the builtin sum() iterates the array element by element.
print("sum(train_outsample_mask)", train_outsample_mask.sum())
epf_dataset = TimeSeriesDataset(Y_df=Y_df, S_df=None, X_df=X_df, ts_train_mask=train_outsample_mask)

sum(train_outsample_mask) 26184.0
Processing dataframes ...
Creating ts tensor ...


In [7]:
# Training loader over the EPF dataset: 72-step inputs (3*24), 24-step outputs,
# cut points drawn from at most the last 365*4*24 time steps, no offset.
ts_loader = TimeSeriesLoader(
    ts_dataset=epf_dataset,
    model='nbeats',
    is_train_loader=True,
    batch_size=2048,
    input_size=3 * 24,
    output_size=24,
    window_sampling_limit=365 * 4 * 24,
    offset=0,
    idx_to_sample_freq=1,
)

In [8]:
import time 

# Time only the creation of the first batch. perf_counter() is a monotonic,
# high-resolution clock intended for measuring intervals, unlike time.time().
start = time.perf_counter()
dataloader = iter(ts_loader)
batch = next(dataloader)
elapsed = time.perf_counter() - start

# Unpack the tensors of the batch for inspection in the cells below.
insample_y = batch['insample_y']
insample_x = batch['insample_x']
insample_mask = batch['insample_mask']
outsample_y = batch['outsample_y']
outsample_x = batch['outsample_x']
outsample_mask = batch['outsample_mask']

print("DataloaderPinche batch time:", elapsed)
print("epf_dataset.t_cols\n", epf_dataset.t_cols)
print("ts_loader.input_size", ts_loader.input_size)
print("ts_loader.output_size", ts_loader.output_size)
print("insample_y.shape", insample_y.shape)
print("insample_x.shape", insample_x.shape)
print("outsample_y.shape", outsample_y.shape)
print("outsample_x.shape", outsample_x.shape)

DataloaderPinche batch time: 4.033509969711304
epf_dataset.t_cols
 ['y', 'Exogenous1', 'Exogenous2', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'insample_mask', 'outsample_mask']
ts_loader.input_size 72
ts_loader.output_size 24
insample_y.shape torch.Size([2048, 72])
insample_x.shape torch.Size([2048, 9, 72])
outsample_y.shape torch.Size([2048, 24])
outsample_x.shape torch.Size([2048, 9, 24])


In [None]:
# Display the insample target values of the batch (one row per sampled window).
insample_y

In [None]:
# Display the outsample target values of the batch (one row per sampled window).
outsample_y

In [None]:
# Display the outsample mask of the batch; zeros flag padded or masked-out positions.
outsample_mask

In [None]:
# Mean of the outsample mask over the whole batch; with the 0/1 train mask built
# above this is the share of outsample positions that are kept.
outsample_mask.mean()