In [1]:
# default_exp data.tsdataset

In [2]:
#hide
%load_ext autoreload
%autoreload 2

In [3]:
#export
import numpy as np
import pandas as pd
import random
import torch as t

from fastcore.foundation import patch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [4]:
#export
# TODO: resolve the t_cols / X_cols duplication: t_cols is used in the dataloader, X_cols to index with f_cols
#       idea: keep only X_cols and account for 'y' and 'insample_mask' in the dataloader
# TODO: parallelize and improve _df_to_lists, probably with a multiprocessing Pool
#       if the panel is balanced, np.reshape does the trick <- think about it
class TimeSeriesDataset(Dataset):
    """Panel of time series stored as a dense, left-padded numpy tensor.

    Attributes set by the constructor
    ---------------------------------
    ts_tensor: np.ndarray of shape (n_series, n_channels, max_len).
        Channels follow `t_cols`: 'y', the X columns, 'insample_mask', 'outsample_mask'.
        Series shorter than `max_len` are padded with zeros on the left.
    s_matrix: np.ndarray of shape (n_series, n_s) with the static features.
    len_series: np.ndarray with the number of observations of each series.
    meta_data: list of dicts with 'unique_id' and 'last_ds' per series.
    t_cols, X_cols, f_cols, n_series, n_channels, n_x, n_s, max_len, frequency.
    """
    def __init__(self,
                 Y_df: pd.DataFrame,
                 X_df: pd.DataFrame=None,
                 S_df: pd.DataFrame=None,
                 mask_df: pd.DataFrame=None,
                 f_cols: list=None):
        """
        Parameters
        ----------
        Y_df: pd.DataFrame
            Target series with columns 'unique_id', 'ds' and 'y'.
        X_df: pd.DataFrame, optional
            Exogenous series with columns 'unique_id', 'ds' and one column per variable.
            Rows are read by position, so they must be aligned with the rows of Y_df.
        S_df: pd.DataFrame, optional
            Static features with a 'unique_id' column and one column per feature.
        mask_df: pd.DataFrame, optional
            Frame with a 'mask' column used as outsample mask. Rows are read by
            position, so they must be aligned with the rows of Y_df.
            When omitted the outsample mask is 1 for every observation.
        f_cols: list, optional
            Names of the X columns accepted by `get_f_idxs`.
        """
        # isinstance also accepts DataFrame subclasses
        assert isinstance(Y_df, pd.DataFrame)
        assert all([(col in Y_df) for col in ['unique_id', 'ds', 'y']])
        if X_df is not None:
            assert isinstance(X_df, pd.DataFrame)
            assert all([(col in X_df) for col in ['unique_id', 'ds']])

        print('Processing dataframes ...')
        # Pandas dataframes to data lists
        ts_data, s_data, self.meta_data, self.t_cols, self.X_cols \
                         = self._df_to_lists(Y_df=Y_df, S_df=S_df, X_df=X_df, mask_df=mask_df)

        # Dataset attributes
        self.n_series   = len(ts_data)
        self.max_len    = max([len(ts['y']) for ts in ts_data])
        self.n_channels = len(self.t_cols) # y, X_cols, insample_mask and outsample_mask
        self.frequency  = pd.infer_freq(Y_df.head()['ds']) #TODO: improve, can die with head
        self.f_cols     = f_cols

        # Number of X and S features (X_cols is empty when X_df is None)
        self.n_x = len(self.X_cols)
        self.n_s = 0 if S_df is None else S_df.shape[1]-1 # -1 for unique_id

        print('Creating ts tensor ...')
        # Balances panel and creates
        # numpy  s_matrix of shape (n_series, n_s)
        # numpy ts_tensor of shape (n_series, n_channels, max_len) n_channels = y + X_cols + masks
        self.ts_tensor, self.s_matrix, self.len_series = self._create_tensor(ts_data, s_data)

    def _df_to_lists(self, Y_df, S_df, X_df, mask_df):
        """Split the input frames into per-series python structures.

        Returns
        -------
        ts_data: list with one dict per series mapping every name in t_cols to a 1-d array.
        s_data: list with one dict per series mapping every static column to its values.
        meta_data: list with one dict per series ('unique_id', 'last_ds').
        t_cols: ['y'] + X_cols + ['insample_mask', 'outsample_mask'].
        X_cols: columns of X_df other than 'unique_id' and 'ds'.
        """
        n_rows = len(Y_df)

        # Series ids in order of first appearance and the integer code of every row
        codes, unique_ids = pd.factorize(Y_df['unique_id'].values)
        assert (codes >= 0).all(), 'Y_df has missing unique_id values'

        # A stable sort of the codes groups the row positions of each series while
        # keeping their original relative order; unlike searchsorted this does not
        # require Y_df to be sorted by unique_id.
        row_order = np.argsort(codes, kind='stable')
        ends      = np.cumsum(np.bincount(codes, minlength=len(unique_ids)))
        starts    = np.concatenate(([0], ends[:-1])).astype(int)

        if X_df is not None:
            assert len(X_df) >= n_rows, \
                f'X_df has {len(X_df)} rows, fewer than the {n_rows} rows of Y_df'
            X_cols = [col for col in X_df.columns if col not in ['unique_id','ds']]
        else:
            X_cols = []

        if S_df is not None:
            S_cols = [col for col in S_df.columns if col not in ['unique_id']]
        else:
            S_cols = []

        if mask_df is not None:
            assert len(mask_df) >= n_rows, \
                f'mask_df has {len(mask_df)} rows, fewer than the {n_rows} rows of Y_df'
            mask_values = mask_df['mask'].values
        else:
            mask_values = None

        # Column arrays are extracted once, outside of the loop over series
        y_values = Y_df['y'].values
        ds       = Y_df['ds']
        x_values = {X_col: X_df[X_col].values for X_col in X_cols}

        ts_data = []
        s_data = []
        meta_data = []
        for i, u_id in enumerate(unique_ids):
            rows = row_order[starts[i]:ends[i]]

            # Y values
            y_true = y_values[rows]
            ts_data_i = {'y': y_true}

            # X values
            for X_col in X_cols:
                ts_data_i[X_col] = x_values[X_col][rows]

            # Mask values: without mask_df every observation is marked with 1
            if mask_values is None:
                outsample_mask = np.ones(len(y_true))
            else:
                outsample_mask = mask_values[rows]
            ts_data_i['insample_mask']  = np.ones(len(y_true))
            ts_data_i['outsample_mask'] = outsample_mask
            ts_data.append(ts_data_i)

            # S values
            s_data_i = defaultdict(list)
            if S_cols:
                s_rows = S_df['unique_id']==u_id
                for S_col in S_cols:
                    s_data_i[S_col] = S_df.loc[s_rows, S_col].values
            s_data.append(s_data_i)

            # Metadata
            last_ds_i  = ds.iloc[rows].max()
            meta_data_i = {'unique_id': u_id,
                           'last_ds': last_ds_i}
            meta_data.append(meta_data_i)

        t_cols = ['y'] + X_cols + ['insample_mask', 'outsample_mask']

        return ts_data, s_data, meta_data, t_cols, X_cols

    def _create_tensor(self, ts_data, s_data):
        """Stack the per-series data into dense arrays.

        s_matrix of shape (n_series, n_s)
        ts_tensor of shape (n_series, n_channels, max_len) n_channels = y + X_cols + masks
        len_series of shape (n_series,) with the length of every series
        """
        s_matrix  = np.zeros((self.n_series, self.n_s))
        ts_tensor = np.zeros((self.n_series, self.n_channels, self.max_len))

        len_series = []
        for idx in range(self.n_series):
            # Left padded time series tensor
            # TODO: Maybe we can place according to ds
            # Channels are stacked explicitly in t_cols order
            ts_idx = np.array([ts_data[idx][col] for col in self.t_cols])

            # Both masks are built in _df_to_lists, so every channel is copied at once
            ts_tensor[idx, :, -ts_idx.shape[1]:] = ts_idx

            # Each static column holds the value(s) found for this series; they are
            # flattened into one row instead of relying on implicit array-to-scalar
            # coercion. A series with a missing or repeated row in S_df fails here
            # with a shape mismatch.
            s_values = [np.ravel(values) for values in s_data[idx].values()]
            if s_values:
                s_matrix[idx, :] = np.concatenate(s_values)
            len_series.append(ts_idx.shape[1])

        return ts_tensor, s_matrix, np.array(len_series)

    def get_meta_data_col(self, col):
        """Return the values of the metadata field `col`, one per series."""
        col_values = [x[col] for x in self.meta_data]
        return col_values

    def get_filtered_ts_tensor(self, offset, output_size, window_sampling_limit, ts_idxs=None):
        """Slice a time window out of ts_tensor.

        The window contains everything stored in the tensor, the future included,
        because future exogenous variables are used. Masking is applied afterwards.

        Parameters
        ----------
        offset: int
            Number of time steps, counted from the end of the tensor, left out of the window.
        output_size: int
            Number of time steps added after the offset position.
        window_sampling_limit: int
            Maximum number of time steps kept before the last `output_size` steps of the window.
        ts_idxs: optional
            Index of the series to keep, all the series when None.

        Returns
        -------
        filtered_ts_tensor: np.ndarray, ts_tensor restricted to the selected series and window.
        right_padding: int, number of window steps that fall beyond max_len.
        """
        last_outsample_ds = self.max_len - offset + output_size
        first_ds = max(last_outsample_ds - window_sampling_limit - output_size, 0)
        if ts_idxs is None:
            filtered_ts_tensor = self.ts_tensor[:, :, first_ds:last_outsample_ds]
        else:
            filtered_ts_tensor = self.ts_tensor[ts_idxs, :, first_ds:last_outsample_ds]
        right_padding = max(last_outsample_ds - self.max_len, 0) #To padd with zeros if there is "nothing" to the right

        n_nan = np.sum(np.isnan(filtered_ts_tensor))
        assert n_nan<1.0, \
            f'The balanced balanced filtered_tensor has {n_nan} nan values'
        return filtered_ts_tensor, right_padding

    def get_f_idxs(self, cols):
        """Return the position in X_cols of every column in `cols`.

        Raises AssertionError when a column is not listed in f_cols.
        """
        # f_cols is optional: treat None as "no columns available" instead of
        # failing with a TypeError on the membership test
        available_cols = [] if self.f_cols is None else self.f_cols
        assert all(col in available_cols for col in cols), f'Some variables in {cols} are not available in f_cols.'
        f_idxs = [self.X_cols.index(col) for col in cols]
        return f_idxs


# MASK example and test

In [5]:
from nixtla.data.datasets.epf import EPF, EPFInfo

def get_last_n_hours_mask_df(Y_df, n_hours):
    """Create the outsample mask that holds out the last rows of every series.

    Parameters
    ----------
    Y_df: pd.DataFrame
        Frame with at least the columns 'unique_id' and 'ds'. It is not modified.
    n_hours: int
        Number of rows, counted from the latest 'ds' of each series, marked as validation.

    Returns
    -------
    mask_df: pd.DataFrame
        Columns 'unique_id', 'ds' and 'mask', one row per row of Y_df, sorted by
        ('unique_id', 'ds'). 'mask' is 1 for train rows and 0 for validation rows.
    """
    keys = ['unique_id', 'ds']

    # Latest n_hours timestamps of each series, flagged as validation.
    # assign() builds a new frame, so no column is set on a groupby slice.
    validation_df = (Y_df[keys]
                     .sort_values(by=keys, ascending=False)
                     .reset_index(drop=True)
                     .groupby('unique_id')
                     .head(n_hours)
                     .assign(mask=0))

    # The left merge keeps every row of Y_df; only the key columns are merged so
    # that an unrelated 'mask' column in Y_df cannot clash with the new one.
    mask_df = Y_df[keys].merge(validation_df, on=keys, how='left')
    mask_df['mask'] = mask_df['mask'].fillna(1)    # The first len(Y)-n_hours used as train

    mask_df = mask_df[keys + ['mask']].sort_values(by=keys)

    # Duplicated (unique_id, ds) pairs would multiply rows in the merge
    assert len(mask_df)==len(Y_df), \
        f'The mask_df length {len(mask_df)} is not equal to Y_df length {len(Y_df)}'

    return mask_df

# Example / smoke test: load the EPF groups, hold out the last rows of every
# series as validation and build a TimeSeriesDataset from the result.

# Number of days held out; multiplied by 24 below to obtain the number of hourly rows
val_ds = 2 * 365
args = pd.Series({'dataset': ['NP', 'PJM', 'BE', 'FR']})

# NOTE(review): EPF.load_groups is project code not shown here; presumably it returns
# the target frame, the exogenous frame and the static frame for the groups - confirm
Y_df, Xt_df, S_df = EPF.load_groups(directory='data', groups=args.dataset)

# mask is 1 for train rows and 0 for the held-out rows of each series
mask_df = get_last_n_hours_mask_df(Y_df, n_hours=val_ds*24)
mask = mask_df['mask'].values

#print(f'Dataset: {args.dataset}')
#print("Xt_df.columns", Xt_df.columns)
print('X: time series features, of shape (#hours, #times,#features): \t' + str(Xt_df.shape))
if S_df is not None:
    print('S: static features, of shape (#series,#features+unique_id): \t' + str(S_df.shape))
print('Y: target series (in X), of shape (#hours, #times): \t \t' + str(Y_df.shape))
print("\n")

# Date range covered by each mask value per series, followed by the share of rows
# with mask 1 (sum(mask)) and with mask 0 (sum(1-mask))
print("Train Validation splits")
print(mask_df.groupby(['mask', 'unique_id']).ds.max())
print(mask_df.groupby(['mask', 'unique_id']).ds.min())
print(f'Train insample percentage {np.round(sum(mask)/len(Y_df),2)}, \
        {sum(mask)} hours = {np.round(sum(mask)/(24*365),2)} years')
print(f'Train outsample percentage {np.round(sum(1-mask)/len(Y_df),2)}, \
        {sum(1-mask)} hours = {np.round(sum(1-mask)/(24*365),2)} years')
#Y_df.head()
print('\n')

# X_df and mask_df are sliced by row position inside TimeSeriesDataset, so both
# frames must be row-aligned with Y_df
dataset = TimeSeriesDataset(Y_df=Y_df, S_df=S_df, X_df=Xt_df, mask_df=mask_df)

X: time series features, of shape (#hours, #times,#features): 	(139776, 12)
S: static features, of shape (#series,#features+unique_id): 	(4, 5)
Y: target series (in X), of shape (#hours, #times): 	 	(139776, 3)


Train Validation splits
mask  unique_id
0.0   BE          2015-01-03 23:00:00
      FR          2015-01-03 23:00:00
      NP          2016-12-26 23:00:00
      PJM         2016-12-26 23:00:00
1.0   BE          2013-01-03 23:00:00
      FR          2013-01-03 23:00:00
      NP          2014-12-27 23:00:00
      PJM         2014-12-27 23:00:00
Name: ds, dtype: datetime64[ns]
mask  unique_id
0.0   BE          2013-01-04
      FR          2013-01-04
      NP          2014-12-28
      PJM         2014-12-28
1.0   BE          2011-01-09
      FR          2011-01-09
      NP          2013-01-01
      PJM         2013-01-01
Name: ds, dtype: datetime64[ns]
Train insample percentage 0.5,         69696.0 hours = 7.96 years
Train outsample percentage 0.5,         70080.0 hours = 8.0 years

In [6]:
# Channel names of the second axis of ts_tensor: 'y', the X columns, then both masks
dataset.t_cols

['y',
 'Exogenous1',
 'Exogenous2',
 'week_day',
 'day_0',
 'day_1',
 'day_2',
 'day_3',
 'day_4',
 'day_5',
 'day_6',
 'insample_mask',
 'outsample_mask']

In [7]:
# outsample_mask channel of series 0, filled from the 'mask' column of mask_df
dataset.ts_tensor[0, dataset.t_cols.index('outsample_mask'), :]

array([1., 1., 1., ..., 0., 0., 0.])

In [8]:
# Checking the insample_mask of series 0: it is 1 on every observed time step,
# any left padding of a shorter series stays 0
dataset.ts_tensor[0, dataset.t_cols.index('insample_mask'), :]

array([1., 1., 1., ..., 1., 1., 1.])

In [9]:
# Target values ('y' channel) of series 0
dataset.ts_tensor[0, dataset.t_cols.index('y'), :]

array([32.54, 21.55, 15.71, ..., 42.44, 42.03, 40.91])

In [10]:
# Layout of the tensor is (n_series, n_channels, max_len)
print("dataset.ts_tensor.shape", dataset.ts_tensor.shape)

dataset.ts_tensor.shape (4, 13, 34944)


In [11]:
# Returns the (filtered tensor, right_padding) tuple. With output_size > offset the
# requested window ends output_size - offset = 2 steps past max_len, so the slice
# stops at the end of the tensor and right_padding is 2.
dataset.get_filtered_ts_tensor(offset=10, output_size=12, window_sampling_limit=36)

(array([[[3.3520000e+01, 2.7500000e+01, 2.2840000e+01, ...,
          4.2440000e+01, 4.2030000e+01, 4.0910000e+01],
         [7.1196000e+04, 6.8045000e+04, 6.6880000e+04, ...,
          7.2883000e+04, 7.2926000e+04, 7.3070000e+04],
         [6.6725000e+04, 6.3084000e+04, 6.1337000e+04, ...,
          6.4515000e+04, 6.2554000e+04, 6.7342000e+04],
         ...,
         [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
         [1.0000000e+00, 1.0000000e+00, 1.0000000e+00, ...,
          1.0000000e+00, 1.0000000e+00, 1.0000000e+00],
         [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
          0.0000000e+00, 0.0000000e+00, 0.0000000e+00]],
 
        [[3.3520000e+01, 2.7500000e+01, 2.2840000e+01, ...,
          4.2440000e+01, 4.2030000e+01, 4.0910000e+01],
         [7.1196000e+04, 6.8045000e+04, 6.6880000e+04, ...,
          7.2883000e+04, 7.2926000e+04, 7.3070000e+04],
         [6.6725000e+04, 6.3084000e+04, 6.1337000e+04, 