In [1]:
# default_exp data.ts_dataset

In [2]:
#hide
%load_ext autoreload
%autoreload 2

In [3]:
#export
import numpy as np
import pandas as pd
import random
import torch as t

from fastcore.foundation import patch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [4]:
#export
class TimeSeriesDataset(Dataset):
    """Pack a panel of time series into a single dense, left-padded numpy tensor.

    Every series is right-aligned inside ``ts_tensor`` (shape
    ``n_series x len(t_cols) x max_len``); shorter series are padded with zeros
    on the left. The channels, in the order given by ``t_cols``, are: ``'y'``,
    the temporal exogenous variables of ``X_t_df`` (if any), ``'insample_mask'``
    and ``'outsample_mask'``.

    Attributes set by ``__init__``:
        output_size: value received in the constructor (stored, not used here).
        n_series: number of distinct ``unique_id`` values in ``y_df``.
        max_len: length of the longest series.
        n_channels: number of data channels (``'y'`` plus temporal exogenous).
        frequency: result of ``pd.infer_freq`` on the first rows of ``y_df['ds']``.
        n_x_t / n_s_t: number of temporal / static exogenous columns.
        ts_tensor, x_s, len_series: arrays built by ``_create_tensor``.
        meta_data: one dict per series with ``'unique_id'`` and ``'last_ds'``.
        t_cols: channel names of ``ts_tensor``.
        ts_train_mask: train mask of length ``max_len`` (all ones by default).
    """
    def __init__(self, # TODO: add type hints
                 y_df,
                 output_size = None,
                 X_t_df = None,
                 X_s_df = None,
                 ts_train_mask = None):
        """Build the dataset tensors from long-format dataframes.

        Parameters
        ----------
        y_df: pd.DataFrame with columns ``'unique_id'``, ``'ds'`` and ``'y'``.
            Rows of the same ``unique_id`` must be contiguous and the column
            sorted, because series boundaries are located with ``searchsorted``.
        output_size: stored as an attribute; not used inside this class.
        X_t_df: optional pd.DataFrame with ``'unique_id'``, ``'ds'`` and one
            column per temporal exogenous variable. It is sliced with the row
            positions computed on ``y_df``, so it must be row-aligned with it.
        X_s_df: optional pd.DataFrame with ``'unique_id'`` and one column per
            static exogenous variable (one row per series).
        ts_train_mask: optional sequence of length ``max_len``; defaults to ones.

        Raises
        ------
        AssertionError: on wrong input types, missing columns, or a
            ``ts_train_mask`` whose length differs from ``max_len``.
        """
        # isinstance (instead of an exact type comparison) also accepts DataFrame subclasses
        assert isinstance(y_df, pd.DataFrame)
        assert all([(col in y_df) for col in ['unique_id', 'ds', 'y']])
        if X_t_df is not None:
            assert isinstance(X_t_df, pd.DataFrame)
            assert all([(col in X_t_df) for col in ['unique_id', 'ds']])

        print('Processing dataframes ...')
        ts_data, x_s, self.meta_data, self.t_cols = self._df_to_lists(y_df=y_df, X_s_df=X_s_df, X_t_df=X_t_df)

        # Attributes
        self.output_size = output_size
        self.n_series = len(ts_data)
        self.max_len = max([len(ts['y']) for ts in ts_data])
        self.n_channels = len(ts_data[0])
        self.frequency = pd.infer_freq(y_df.head()['ds']) #TODO: improve, can die with head

        self.n_x_t, self.n_s_t = 0, 0
        if X_t_df is not None:
            self.n_x_t = X_t_df.shape[1]-2 # 2 for unique_id and ds
        if X_s_df is not None:
            self.n_s_t = X_s_df.shape[1]-1 # 1 for unique_id

        print('Creating ts tensor ...')
        self.ts_tensor, self.x_s, self.len_series = self._create_tensor(ts_data, x_s)

        if ts_train_mask is not None:
            assert len(ts_train_mask)==self.max_len, f'Outsample mask must have length {self.max_len}'
        else:
            ts_train_mask = np.ones(self.max_len)

        self._declare_outsample_train_mask(ts_train_mask)


    def _df_to_lists(self, y_df, X_s_df, X_t_df):
        """Split the long-format dataframes into per-series python structures.

        Returns
        -------
        ts_data: list with one dict per series mapping channel name -> values
            (``'y'`` first, then the temporal exogenous variables).
        x_s: list with one dict per series mapping static variable -> values.
        meta_data: list with one dict per series (``'unique_id'``, ``'last_ds'``).
        t_cols: channel names of the final tensor, masks included.
        """
        unique_ids = y_df['unique_id'].unique()

        if X_t_df is not None:
            X_t_vars = [col for col in X_t_df.columns if col not in ['unique_id','ds']]
        else:
            X_t_vars = []

        if X_s_df is not None:
            X_s_vars = [col for col in X_s_df.columns if col not in ['unique_id']]
        else:
            X_s_vars = []

        ts_data = []
        x_s = []
        meta_data = []
        for u_id in unique_ids:
            # Row bounds of the series. `.item()` replaces `np.asscalar`, which was
            # removed from NumPy, and works for both scalar and 1-element results.
            top_row = np.asarray(y_df['unique_id'].searchsorted(u_id, side='left')).item()
            bottom_row = np.asarray(y_df['unique_id'].searchsorted(u_id, side='right')).item()
            # searchsorted yields positions, so slice positionally whatever the index is
            y_rows = y_df.iloc[top_row:bottom_row]
            last_ds_i = y_rows['ds'].max()

            # Y values
            ts_data_i = {'y': y_rows['y'].values}
            # X_t values (X_t_df is assumed to be row-aligned with y_df)
            for X_t_var in X_t_vars:
                ts_data_i[X_t_var] = X_t_df.iloc[top_row:bottom_row][X_t_var].values
            ts_data.append(ts_data_i)

            # Static data
            s_data_i = defaultdict(list)
            for X_s_var in X_s_vars:
                s_data_i[X_s_var] = X_s_df.loc[X_s_df['unique_id']==u_id, X_s_var].values
            x_s.append(s_data_i)

            # Metadata
            meta_data_i = {'unique_id': u_id,
                           'last_ds': last_ds_i}
            meta_data.append(meta_data_i)

        t_cols = ['y'] + X_t_vars + ['insample_mask', 'outsample_mask']

        return ts_data, x_s, meta_data, t_cols

    def _create_tensor(self, ts_data, x_s):
        """Stack the per-series data into dense arrays.

        Returns
        -------
        ts_tensor: n_series x (n_channels + 2) x max_len, right-aligned,
            zero-padded on the left; the last two channels are the masks.
        static_tensor: n_series x n_static_variables.
        len_series: array with the number of observations of each series.
        """
        ts_tensor = np.zeros((self.n_series, self.n_channels + 2, self.max_len)) # +2 for the masks
        static_tensor = np.zeros((self.n_series, len(x_s[0])))

        insample_idx = self.t_cols.index('insample_mask')
        outsample_idx = self.t_cols.index('outsample_mask')

        len_series = []
        for idx in range(self.n_series):
            ts_idx = np.array(list(ts_data[idx].values()))
            n_obs = ts_idx.shape[1]
            # Explicit start positions instead of negative offsets: a negative
            # offset of 0 (`-0:`) would select the whole row rather than nothing.
            first_obs = self.max_len - n_obs
            ts_tensor[idx, :insample_idx, first_obs:] = ts_idx
            ts_tensor[idx, insample_idx, first_obs:] = 1
            # To avoid sampling windows without inputs to predict, the first
            # observation is excluded (a series of length 1 gets an empty mask).
            # Outsample mask will be later completed with the 'train_mask'
            ts_tensor[idx, outsample_idx, first_obs + 1:] = 1
            # Each static value arrives as a 1-element array; flatten them so that
            # several static variables fit into one row of the tensor.
            static_values = [np.ravel(value) for value in x_s[idx].values()]
            if static_values:
                static_tensor[idx, :] = np.concatenate(static_values)
            len_series.append(n_obs)

        return ts_tensor, static_tensor, np.array(len_series)

    def _declare_outsample_train_mask(self, ts_train_mask):
        """Store the train mask.

        NOTE: only the attribute is updated. The code that would combine the
        mask with the 'outsample_mask' channel and build the sampling masks is
        disabled below, so ``ts_tensor`` is left untouched and no
        ``train_sampling_mask`` / ``validation_sampling_mask`` attribute is set.
        """
        # Update attribute and ts_tensor
        self.ts_train_mask = ts_train_mask
        # Broadcasting self.ts_train_mask to n_series to get outsample_mask = (outsample_mask AND train_mask)
        #outsample_train_mask = self.ts_tensor[:, self.t_cols.index('outsample_mask'), :] * self.ts_train_mask
        #tensor_validation_mask = self.ts_tensor[:, self.t_cols.index('outsample_mask'), :] * (1-self.ts_train_mask)
        #self.ts_tensor[:, self.t_cols.index('outsample_mask'), :] = outsample_train_mask

        # Array with max of outsample mask per window
        # self.train_sampling_mask = np.zeros((self.n_series, self.max_len))
        # self.validation_sampling_mask = np.zeros((self.n_series, self.max_len))
        # for i in range(1, self.max_len): #0 is not sampleable
        #     train_outsample_mask = tensor_train_mask[:, i:(i + self.output_size)]
        #     self.train_sampling_mask[:,i] = np.max(train_outsample_mask, axis=1)

        #     validation_outsample_mask = tensor_validation_mask[:, i:(i + self.output_size)]
        #     self.validation_sampling_mask[:,i] = np.max(validation_outsample_mask, axis=1)

    def get_meta_data_var(self, var):
        """Return the value stored under key ``var`` in every series' metadata, in series order."""
        var_values = [x[var] for x in self.meta_data]
        return var_values

    def get_filtered_tensor(self, offset, output_size, window_sampling_limit, ts_idxs=None):
        """Return a time window of ``ts_tensor`` plus the right padding it needs.

        The window contains everything the tensor holds in that range, the
        future included, because future exogenous variables are used; masking
        is applied afterwards by the caller.

        Parameters
        ----------
        offset: number of time steps counted back from the end of the tensor.
        output_size: number of time steps added after ``max_len - offset``.
        window_sampling_limit: number of time steps kept before the outsample part.
        ts_idxs: optional indexer selecting a subset of series; all if None.

        Returns
        -------
        filtered_tensor: ``ts_tensor[ts_idxs, :, first_ds:last_outsample_ds]``.
        right_padding: how many steps the requested window runs past ``max_len``.
        """
        last_outsample_ds = self.max_len - offset + output_size
        first_ds = max(last_outsample_ds - window_sampling_limit - output_size, 0)
        if ts_idxs is None:
            filtered_tensor = self.ts_tensor[:, :, first_ds:last_outsample_ds]
        else:
            filtered_tensor = self.ts_tensor[ts_idxs, :, first_ds:last_outsample_ds]
        right_padding = max(last_outsample_ds - self.max_len, 0) #To padd with zeros if there is "nothing" to the right
        return filtered_tensor, right_padding

In [5]:
# Demo: load the first group listed in TourismInfo.groups from the local 'data'
# directory and display its tensor.
# NOTE(review): Tourism.load presumably returns a TimeSeriesDataset (the cells below
# read .ts_tensor / .t_cols / .ts_train_mask on it) -- confirm in nixtla.data.datasets.tourism.
from nixtla.data.datasets.tourism import Tourism, TourismInfo
tourism_dataset = Tourism.load(directory='data', group=TourismInfo.groups[0])
tourism_dataset.ts_tensor

In [8]:
# Names of the channels (second axis) of ts_tensor, in order.
tourism_dataset.t_cols

['y', 'insample_mask', 'outsample_mask']

In [9]:
# Train mask over the time axis; TimeSeriesDataset defaults it to all ones when none is passed.
tourism_dataset.ts_train_mask

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [14]:
# Last channel of the first series: 'outsample_mask' is the final entry of t_cols.
tourism_dataset.ts_tensor[0,-1,:]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [10]:
# First channel ('y') of the first series; values are right-aligned and zero-padded on the left.
tourism_dataset.ts_tensor[0,0,:]

array([    0.    ,     0.    ,     0.    ,     0.    ,     0.    ,
           0.    ,     0.    ,     0.    ,     0.    ,     0.    ,
           0.    ,     0.    ,     0.    ,     0.    ,     0.    ,
           0.    ,     0.    ,     0.    ,     0.    ,     0.    ,
           0.    ,     0.    ,     0.    ,     0.    ,     0.    ,
           0.    ,     0.    ,     0.    ,     0.    ,     0.    ,
           0.    ,     0.    , 25092.2284, 24271.5134, 25828.9883,
       27697.5047, 27956.2276, 29924.4321, 30216.8321, 32613.4968,
       36053.1674, 38472.7532, 38420.894 , 36555.6156, 37385.6371,
       38431.9699, 40345.33  ])

In [17]:
# NOTE(review): `train_sampling_mask` is only assigned in code that is commented out in
# TimeSeriesDataset._declare_outsample_train_mask, so the output shown below presumably
# comes from an earlier kernel state; on a fresh run this lookup is expected to fail -- confirm.
tourism_dataset.train_sampling_mask[0,:]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
# def rolling_sum(a, window_size=4) :
#     ret = np.cumsum(np.concatenate((M, np.zeros(shape=(len(M), window_size-1))), axis=1), 
#                     axis=1, dtype=float)
#     ret[:, window_size:] = ret[:, window_size:] - ret[:, :-window_size]
#     return ret[:, window_size-1:]

# M = np.array([[0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  1.],
#               [0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
#               [1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.]])

# print(rolling_sum(M))