In [1]:
# default_exp data.ontsdataset

In [2]:
#hide
%load_ext autoreload
%autoreload 2

In [3]:
#export
import numpy as np
import pandas as pd
import random
import torch as t

from fastcore.foundation import patch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [4]:
#export
class TimeSeriesDataset(Dataset):
    """Dense, left-padded array view over a panel of time series.

    The input dataframes are converted into two numpy arrays:

    * ``ts_tensor`` of shape ``(n_series, n_channels, max_len)``. Channels follow
      the order of ``t_cols``: ``'y'``, every exogenous column of ``X_df``,
      ``'insample_mask'`` and ``'outsample_mask'``. Series shorter than
      ``max_len`` are padded with zeros on the left.
    * ``s_matrix`` of shape ``(n_series, n_s)`` holding the static features.
    """
    def __init__(self,
                 Y_df: pd.DataFrame,
                 X_df: pd.DataFrame=None,
                 f_cols: list=None,
                 S_df: pd.DataFrame=None,
                 ts_train_mask: list=None,
                 loss_weights: list=None):
        """Build the tensors from the input dataframes.

        Parameters
        ----------
        Y_df: pd.DataFrame
            Target data with columns ``['unique_id', 'ds', 'y']``. The rows of a
            series are located with ``searchsorted`` on ``unique_id``, so the
            frame must be sorted by ``unique_id``.
        X_df: pd.DataFrame, optional
            Exogenous data with columns ``['unique_id', 'ds', ...]``. It is sliced
            with the row positions found in ``Y_df``, so it must be row-aligned
            with ``Y_df``. Its first exogenous column must be ``'ejecutado'``.
        f_cols: list, optional
            Column names stored in ``self.f_cols`` and checked by ``get_f_idxs``.
            Required whenever ``X_df`` is given.
        S_df: pd.DataFrame, optional
            Static features with a ``'unique_id'`` column plus one column per feature.
        ts_train_mask: list, optional
            Per-timestep mask of length ``max_len``; defaults to all ones.
        loss_weights: list, optional
            Per-timestep weights of length ``max_len``; defaults to all ones.
        """
        assert isinstance(Y_df, pd.DataFrame)
        assert all([(col in Y_df) for col in ['unique_id', 'ds', 'y']])
        if X_df is not None:
            assert isinstance(X_df, pd.DataFrame)
            assert all([(col in X_df) for col in ['unique_id', 'ds']])
            assert f_cols is not None, "Define f_cols"

        print('Processing dataframes ...')
        # Pandas dataframes to data lists
        ts_data, s_data, self.meta_data, self.t_cols, self.X_cols = self._df_to_lists(Y_df=Y_df, S_df=S_df, X_df=X_df)

        # Dataset attributes
        self.n_series   = len(ts_data)
        self.max_len    = max([len(ts['y']) for ts in ts_data])
        self.n_channels = len(self.t_cols) # y, X_cols, insample_mask and outsample_mask
        self.frequency  = pd.infer_freq(Y_df.head()['ds']) #TODO: improve, can die with head
        self.f_cols     = f_cols

        # Number of X and S features
        self.n_x = 0 if X_df is None else len(self.X_cols)
        self.n_s = 0 if S_df is None else S_df.shape[1]-1 # -1 for unique_id

        print('Creating ts tensor ...')
        # Balances panel and creates
        # numpy  s_matrix of shape (n_series, n_s)
        # numpy ts_tensor of shape (n_series, n_channels, max_len) n_channels = y + X_cols + masks
        self.ts_tensor, self.s_matrix, self.len_series = self._create_tensor(ts_data, s_data)

        if ts_train_mask is None: ts_train_mask = np.ones(self.max_len)
        assert len(ts_train_mask)==self.max_len, f'Outsample mask must have {self.max_len} length'
        self.ts_train_mask = ts_train_mask

        if loss_weights is None: loss_weights = np.ones(self.max_len)
        assert len(loss_weights)==self.max_len, f'Loss weights must have {self.max_len} length'
        self.loss_weights = loss_weights

    def _df_to_lists(self, Y_df, S_df, X_df):
        """Split the dataframes into one record per series.

        Returns
        -------
        ts_data: list of dict
            One dict per series mapping ``'y'`` and every exogenous column to its values.
        s_data: list of dict
            One dict per series mapping every static column to its values.
        meta_data: list of dict
            One dict per series with its ``'unique_id'`` and ``'last_ds'``.
        t_cols: list
            Channel names: ``'y'``, the exogenous columns and the two masks.
        X_cols: list
            Exogenous column names without the leading ``'ejecutado'`` column.
        """
        id_col = Y_df['unique_id']
        unique_ids = id_col.unique()

        X_cols = [] if X_df is None else [col for col in X_df.columns if col not in ['unique_id','ds']]
        S_cols = [] if S_df is None else [col for col in S_df.columns if col not in ['unique_id']]

        ts_data = []
        s_data = []
        meta_data = []
        for u_id in unique_ids:
            # np.asscalar was removed from NumPy; .item() handles both the scalar and
            # the one-element-array results that searchsorted may return.
            top_row = int(np.asarray(id_col.searchsorted(u_id, 'left')).item())
            bottom_row = int(np.asarray(id_col.searchsorted(u_id, 'right')).item())
            Y_rows = Y_df.iloc[top_row:bottom_row]

            # Y values
            ts_data_i = {'y': Y_rows['y'].values}

            # X values (X_df is sliced with the same row positions as Y_df)
            if X_cols:
                X_rows = X_df.iloc[top_row:bottom_row]
                for X_col in X_cols:
                    ts_data_i[X_col] = X_rows[X_col].values
            ts_data.append(ts_data_i)

            # S values
            s_data.append({S_col: S_df.loc[S_df['unique_id']==u_id, S_col].values
                           for S_col in S_cols})

            # Metadata
            meta_data.append({'unique_id': u_id,
                              'last_ds': Y_rows['ds'].max()})

        t_cols = ['y'] + X_cols + ['insample_mask', 'outsample_mask']

        # Only enforce the 'ejecutado' convention when exogenous columns exist, so
        # that the default X_df=None does not fail on an empty list.
        if X_cols:
            assert X_cols[0] == 'ejecutado', 'First exogenous must be ejecutado'
            X_cols = X_cols[1:] # First variable is ejecutado we skip it

        return ts_data, s_data, meta_data, t_cols, X_cols

    def _create_tensor(self, ts_data, s_data):
        """Stack the per-series records into dense arrays.

        Returns
        -------
        ts_tensor: np.ndarray of shape (n_series, n_channels, max_len),
            n_channels = y + X_cols + masks.
        s_matrix: np.ndarray of shape (n_series, n_s).
        len_series: np.ndarray with the number of observations of each series.
        """
        s_matrix  = np.zeros((self.n_series, self.n_s))
        ts_tensor = np.zeros((self.n_series, self.n_channels, self.max_len))

        insample_idx  = self.t_cols.index('insample_mask')
        outsample_idx = self.t_cols.index('outsample_mask')

        len_series = []
        for idx in range(self.n_series):
            ts_idx = np.array(list(ts_data[idx].values()))
            n_obs = ts_idx.shape[1]
            # Explicit start positions instead of negative slices: `-0:` would
            # select the whole row rather than nothing.
            first_obs = self.max_len - n_obs

            ts_tensor[idx, :insample_idx, first_obs:] = ts_idx
            ts_tensor[idx,  insample_idx, first_obs:] = 1

            # To avoid sampling windows without inputs available to predict we shift -1
            # outsample_mask will be completed with the train_mask, this ensures available data
            ts_tensor[idx, outsample_idx, first_obs+1:] = 1

            # Each static value arrives as a one-element array; flatten to (n_s,)
            s_matrix[idx, :] = np.ravel(list(s_data[idx].values()))
            len_series.append(n_obs)

        return ts_tensor, s_matrix, np.array(len_series)

    def get_meta_data_col(self, col):
        """Return the value stored under `col` in every series' metadata record."""
        return [record[col] for record in self.meta_data]

    def get_filtered_ts_tensor(self, offset, output_size, window_sampling_limit, ts_idxs=None):
        """Slice the time axis of ``ts_tensor`` around a forecast origin.

        This returns everything the tensor holds, the future included, because
        future exogenous variables are used. Masking is applied afterwards.

        Parameters
        ----------
        offset: int
            Number of trailing timesteps excluded before ``output_size`` is added.
        output_size: int
            Number of timesteps of the forecast horizon.
        window_sampling_limit: int
            Maximum number of timesteps kept before the forecast horizon.
        ts_idxs: optional
            Index of the series to keep; all series when None.

        Returns
        -------
        filtered_ts_tensor, right_padding, ts_train_mask, loss_weights
            ``right_padding`` is the number of requested timesteps that fall past
            the end of the tensor; the other three are cut to the same time range.
        """
        last_outsample_ds = self.max_len - offset + output_size
        first_ds = max(last_outsample_ds - window_sampling_limit - output_size, 0)
        time_window = slice(first_ds, last_outsample_ds)

        if ts_idxs is None:
            filtered_ts_tensor = self.ts_tensor[:, :, time_window]
        else:
            filtered_ts_tensor = self.ts_tensor[ts_idxs, :, time_window]
        right_padding = max(last_outsample_ds - self.max_len, 0) #To padd with zeros if there is "nothing" to the right
        ts_train_mask = self.ts_train_mask[time_window]
        loss_weights = self.loss_weights[time_window]

        assert np.sum(np.isnan(filtered_ts_tensor))<1.0, \
            f'The balanced balanced filtered_tensor has {np.sum(np.isnan(filtered_ts_tensor))} nan values'
        return filtered_ts_tensor, right_padding, ts_train_mask, loss_weights

    def get_f_idxs(self, cols):
        """Return the positions of `cols` within ``self.X_cols``.

        Raises an AssertionError when any of `cols` is not listed in ``self.f_cols``.
        """
        # f_cols is None when the dataset was built without exogenous data
        available_cols = self.f_cols if self.f_cols is not None else []
        assert all(col in available_cols for col in cols), f'Some variables in {cols} are not available in f_cols.'
        f_idxs = [self.X_cols.index(col) for col in cols]
        return f_idxs

In [6]:
from nixtla.data.datasets.on import load_on_data

Y_insample_df, X_insample_df, Y_outsample_df, X_outsample_df, f_cols = load_on_data('2020-11-1')

# f_cols must be forwarded: the constructor asserts it is set whenever X_df is given,
# so omitting it makes this cell fail on a fresh kernel.
# NOTE(review): ts_train_mask must have max_len entries; len(Y_insample_df) only matches
# because this data appears to hold a single series -- confirm before reusing with panels.
dataset = TimeSeriesDataset(Y_df=Y_insample_df, S_df=None, X_df=X_insample_df, f_cols=f_cols,
                            ts_train_mask=np.ones(len(Y_insample_df)))
print("dataset.ts_tensor.shape", dataset.ts_tensor.shape)

Processing dataframes ...
Creating ts tensor ...
dataset.ts_tensor.shape (1, 332, 104641)


In [7]:
# Channel names of ts_tensor, in order: 'y', the exogenous columns, then the two mask channels.
dataset.t_cols

['y',
 'ejecutado',
 'DECIMAL_HOUR',
 'DAY_OF_YEAR',
 'DAY_OF_WEEK',
 'IS_WEEKDAY',
 'IS_HOLIDAY',
 'L_1',
 'L_2',
 'L_3',
 'L_4',
 'L_5',
 'L_6',
 'L_7',
 'PE_1',
 'PD_1',
 'PS_1',
 'PE_7',
 'PD_7',
 'PS_7',
 'PD_0',
 'PS_0',
 'P_1',
 'P_2',
 'P_3',
 'P_4',
 'P_5',
 'P_6',
 'P_7',
 'AVG_1',
 'AVG_2',
 'AVG_3',
 'AVG_4',
 'AVG_5',
 'AVG_6',
 'AVG_7',
 'MOV_7',
 'MOV_14',
 'DELTA_1',
 'DELTA_7',
 'PEAK_TIME_1',
 'PEAK_TIME_2',
 'PEAK_TIME_3',
 'PEAK_TIME_4',
 'PEAK_TIME_5',
 'PEAK_TIME_6',
 'PEAK_TIME_7',
 'PE_PP1',
 'PE_PP2',
 'PE_PP3',
 'PE_PP4',
 'PE_PP5',
 'PE_PP6',
 'PE_PP7',
 'PE_PP8',
 'PE_PP9',
 'PE_PP10',
 'PE_PP11',
 'PE_PP12',
 'PE_PP13',
 'PE_PP14',
 'PE_PP15',
 'PE_PP16',
 'PE_PP17',
 'PE_PP18',
 'PE_PP19',
 'PE_PP20',
 'PE_PP21',
 'PE_PP22',
 'PE_PP23',
 'PE_PP24',
 'PE_PP25',
 'PE_PP26',
 'PE_PP27',
 'PE_PP28',
 'PE_PP29',
 'PE_PP30',
 'PE_PP31',
 'PE_PP32',
 'PE_PP33',
 'PE_PP34',
 'PE_PP35',
 'PE_PP36',
 'PE_PP37',
 'PE_PP38',
 'PE_PP39',
 'PE_PP40',
 'PE_PP41',
 'PE_PP

In [8]:
# Per-timestep training mask, stored exactly as passed to the constructor (all ones here).
dataset.ts_train_mask

array([1., 1., 1., ..., 1., 1., 1.])

In [9]:
# Target ('y') channel of the first series in the tensor.
dataset.ts_tensor[0, dataset.t_cols.index('y'), :]

array([5545.98338, 5448.66753, 5392.92989, ..., 5886.83837, 5825.31859,
       5730.61466])

In [10]:
# Returns (filtered tensor, right_padding, train-mask slice, loss-weight slice).
# output_size exceeds offset by 2, so the requested window ends 2 steps past the
# end of the tensor and right_padding is 2.
dataset.get_filtered_ts_tensor(offset=10, output_size=12, window_sampling_limit=36)

(array([[[6.44549894e+03, 6.42243587e+03, 6.39460725e+03, ...,
          5.88683837e+03, 5.82531859e+03, 5.73061466e+03],
         [6.42098288e+03, 6.41277336e+03, 6.41277336e+03, ...,
          5.90676623e+03, 5.90676623e+03, 5.79786651e+03],
         [1.27500000e+01, 1.30000000e+01, 1.32500000e+01, ...,
          2.35000000e+01, 2.37500000e+01, 0.00000000e+00],
         ...,
         [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
          0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
         [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
          1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
         [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
          1.00000000e+00, 1.00000000e+00, 1.00000000e+00]]]),
 2,
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 array([1., 1., 1., 1., 1., 1., 1., 1