In [None]:
# default_exp data.tsdataset

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#export
import numpy as np
import pandas as pd
import random
import torch as t

from fastcore.foundation import patch
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [None]:
#export
# TODO: resolve the duplication between t_cols and X_cols: t_cols is used in the dataloader, X_cols to index with f_cols
#.      idea: keep only X_cols and adjust for 'y' and 'insample_mask' in the dataloader
# TODO: parallelize and improve _df_to_lists, probably with a multiprocessing Pool
#.      if the panel is balanced, np.reshape does the trick <- think about it
class TimeSeriesDataset(Dataset):
    """Panel of time series stored as dense, left-padded numpy arrays.

    The constructor turns long-format dataframes into:

    - ``ts_tensor``: array of shape (n_series, n_channels, max_len); the
      channels follow ``t_cols`` = ['y'] + X_cols + ['available_mask', 'sample_mask'].
    - ``s_matrix``: array of shape (n_series, n_s) with the static features.
    - ``len_series``: array with the original (unpadded) length of every series.
    - ``meta_data``: one dict per series with its 'unique_id' and 'last_ds'.

    Rows are located with ``searchsorted`` on ``Y_df['unique_id']`` and the same
    row positions are used to slice ``X_df`` and ``mask_df``. This assumes that
    ``Y_df`` is sorted by 'unique_id' and that the three frames are row-aligned;
    neither is checked here.
    """
    def __init__(self,
                 Y_df: pd.DataFrame,
                 X_df: pd.DataFrame=None,
                 S_df: pd.DataFrame=None,
                 mask_df: pd.DataFrame=None,
                 f_cols: list=None,
                 verbose: bool=True):
        """
        Parameters
        ----------
        Y_df: pd.DataFrame
            Target series with columns ['unique_id', 'ds', 'y'].
        X_df: pd.DataFrame, optional
            Temporal exogenous variables with columns ['unique_id', 'ds'] plus
            one column per variable. Must have as many rows as Y_df.
        S_df: pd.DataFrame, optional
            Static variables with column 'unique_id' plus one column per variable.
        mask_df: pd.DataFrame, optional
            Columns ['unique_id', 'ds', 'available_mask', 'sample_mask'], with as
            many rows as Y_df. When None, both masks default to ones.
            The frame is only read, never modified.
        f_cols: list, optional
            Names of the X_df columns that `get_f_idxs` is allowed to resolve.
        verbose: bool
            If True (default) prints the train/predict split summary.
        """
        assert isinstance(Y_df, pd.DataFrame)
        assert all([(col in Y_df) for col in ['unique_id', 'ds', 'y']])
        if X_df is not None:
            assert isinstance(X_df, pd.DataFrame)
            assert all([(col in X_df) for col in ['unique_id', 'ds']])
            assert len(Y_df)==len(X_df), f'The dimensions of Y_df and X_df are not the same'

        # The default mask has to exist BEFORE it is validated or summarized,
        # otherwise the documented default mask_df=None cannot work.
        if mask_df is None:
            mask_df = Y_df[['unique_id', 'ds']].copy()
            mask_df['available_mask'] = np.ones(len(Y_df))
            mask_df['sample_mask'] = np.ones(len(Y_df))
        assert all([(col in mask_df) for col in ['unique_id', 'ds', 'available_mask', 'sample_mask']])
        assert len(Y_df)==len(mask_df), f'The dimensions of Y_df and mask_df are not the same'

        # Split statistics. The train mask is kept as a local series so the
        # caller's mask_df does not grow a 'train_mask' column as a side effect.
        train_mask = mask_df['available_mask'] * mask_df['sample_mask']
        self.n_tstamps = len(mask_df)
        self.n_avl = mask_df['available_mask'].sum()
        self.n_trn = train_mask.sum()
        self.n_prd = len(mask_df)-mask_df['sample_mask'].sum()

        if verbose:
            avl_prc = np.round(self.n_avl/self.n_tstamps,5)
            trn_prc = np.round(self.n_trn/self.n_tstamps,5)
            prd_prc = np.round(self.n_prd/self.n_tstamps,5)
            print("Train Validation splits")
            print(mask_df.groupby(['unique_id', 'sample_mask']).agg({'ds': ['min', 'max']}))
            print(f'Total data \t\t\t{self.n_tstamps} time stamps')
            print(f'Available prc = {avl_prc}, \t{self.n_avl} time stamps')
            print(f'Train prc = {trn_prc}, \t\t{self.n_trn} time stamps')
            print(f'Predict prc = {prd_prc}, \t\t{self.n_prd} time stamps')
            print('\n')

        # Pandas dataframes to per-series lists
        ts_data, s_data, self.meta_data, self.t_cols, self.X_cols \
                         = self._df_to_lists(Y_df=Y_df, S_df=S_df, X_df=X_df, mask_df=mask_df)

        # Dataset attributes
        self.n_series   = len(ts_data)
        self.max_len    = max(len(ts['y']) for ts in ts_data)
        self.n_channels = len(self.t_cols) # y, X_cols, available_mask and sample_mask
        self.frequency  = pd.infer_freq(Y_df.head()['ds']) #TODO: improve, can die with head
        self.f_cols     = f_cols

        # Number of X and S features
        self.n_x = 0 if X_df is None else len(self.X_cols)
        self.n_s = 0 if S_df is None else S_df.shape[1]-1 # -1 for unique_id

        # Balances the panel and creates
        # numpy  s_matrix of shape (n_series, n_s)
        # numpy ts_tensor of shape (n_series, n_channels, max_len) n_channels = y + X_cols + masks
        self.ts_tensor, self.s_matrix, self.len_series = self._create_tensor(ts_data, s_data)

    def _df_to_lists(self, Y_df, S_df, X_df, mask_df):
        """Split the long-format frames into one record per series.

        Returns
        -------
        ts_data: list of dicts {channel name: 1-d array}, one per series, with
            keys inserted in the order 'y', X_cols..., 'available_mask', 'sample_mask'.
        s_data: list of dicts {static column: values of that column for the series}.
        meta_data: list of dicts with 'unique_id' and 'last_ds' (max of 'ds').
        t_cols: list with the channel names, in tensor order.
        X_cols: list with the X_df columns other than 'unique_id' and 'ds'.
        """
        id_col = Y_df['unique_id']
        unique_ids = id_col.unique()

        X_cols = [] if X_df is None else \
                 [col for col in X_df.columns if col not in ['unique_id','ds']]
        S_cols = [] if S_df is None else \
                 [col for col in S_df.columns if col not in ['unique_id']]

        # Column extraction does not depend on the series: do it once.
        y_values = Y_df['y'].values
        ds_col = Y_df['ds']
        x_values = {X_col: X_df[X_col].values for X_col in X_cols}
        available_values = mask_df['available_mask'].values
        sample_values = mask_df['sample_mask'].values

        ts_data = []
        s_data = []
        meta_data = []
        for u_id in unique_ids:
            # Row span of the series; relies on Y_df being sorted by unique_id.
            # int() replaces np.asscalar, which was removed from NumPy.
            top_row    = int(id_col.searchsorted(u_id, side='left'))
            bottom_row = int(id_col.searchsorted(u_id, side='right'))
            rows = slice(top_row, bottom_row)

            # Y, X and mask values (insertion order defines the channel order)
            ts_data_i = {'y': y_values[rows]}
            for X_col in X_cols:
                ts_data_i[X_col] = x_values[X_col][rows]
            ts_data_i['available_mask'] = available_values[rows]
            ts_data_i['sample_mask']  = sample_values[rows]
            ts_data.append(ts_data_i)

            # S values
            s_data_i = defaultdict(list)
            if S_cols:
                s_rows = S_df['unique_id']==u_id
                for S_col in S_cols:
                    s_data_i[S_col] = S_df.loc[s_rows, S_col].values
            s_data.append(s_data_i)

            # Metadata
            meta_data.append({'unique_id': u_id,
                              'last_ds': ds_col.iloc[rows].max()})

        t_cols = ['y'] + X_cols + ['available_mask', 'sample_mask']

        return ts_data, s_data, meta_data, t_cols, X_cols

    def _create_tensor(self, ts_data, s_data):
        """Stack the per-series records into dense arrays.

        Returns
        -------
        ts_tensor: array of shape (n_series, n_channels, max_len). Shorter
            series are left-padded with zeros (masks included).
        s_matrix: array of shape (n_series, n_s).
        len_series: array with the unpadded length of every series.
        """
        s_matrix  = np.zeros((self.n_series, self.n_s))
        ts_tensor = np.zeros((self.n_series, self.n_channels, self.max_len))

        len_series = []
        for idx in range(self.n_series):
            # Channels are stacked explicitly in t_cols order.
            # TODO: Maybe we can place according to ds
            ts_idx = np.array([ts_data[idx][col] for col in self.t_cols])

            # Right-align the series: zeros on the left act as padding.
            ts_tensor[idx, :, -ts_idx.shape[1]:] = ts_idx

            # Every static value arrives as a length-1 array; flatten so that
            # any number of static columns fits the (n_s,) row.
            s_matrix[idx, :] = np.ravel(list(s_data[idx].values()))
            len_series.append(ts_idx.shape[1])

        return ts_tensor, s_matrix, np.array(len_series)

    def get_meta_data_col(self, col):
        """Return the value stored under `col` in the meta data of every series."""
        return [series_meta[col] for series_meta in self.meta_data]

    def get_filtered_ts_tensor(self, offset, output_size, window_sampling_limit, ts_idxs=None):
        """Slice the time axis of `ts_tensor`.

        The slice covers [first_ds, last_outsample_ds) with
        last_outsample_ds = max_len - offset + output_size and
        first_ds = max(last_outsample_ds - window_sampling_limit - output_size, 0).
        Every channel is returned, the future included (future exogenous
        variables are used downstream); masking is applied later by the caller.

        Parameters
        ----------
        ts_idxs: optional index of the series to keep; all series when None.

        Returns
        -------
        filtered_ts_tensor: the sliced array.
        right_padding: number of time steps by which last_outsample_ds exceeds
            max_len, i.e. how many zeros are missing on the right.
        """
        last_outsample_ds = self.max_len - offset + output_size
        first_ds = max(last_outsample_ds - window_sampling_limit - output_size, 0)
        if ts_idxs is None:
            filtered_ts_tensor = self.ts_tensor[:, :, first_ds:last_outsample_ds]
        else:
            filtered_ts_tensor = self.ts_tensor[ts_idxs, :, first_ds:last_outsample_ds]
        right_padding = max(last_outsample_ds - self.max_len, 0) #To pad with zeros if there is "nothing" to the right

        return filtered_ts_tensor, right_padding

    def get_f_idxs(self, cols):
        """Return the positions in `X_cols` of the requested `cols`.

        Raises AssertionError when a column is not listed in `f_cols`
        (a missing `f_cols` is treated as an empty list).
        """
        available_f_cols = [] if self.f_cols is None else self.f_cols
        assert all(col in available_f_cols for col in cols), f'Some variables in {cols} are not available in f_cols.'
        f_idxs = [self.X_cols.index(col) for col in cols]
        return f_idxs


# MASK example and test

In [None]:
from nixtla.data.datasets.epf import EPF, EPFInfo

def get_last_n_hours_mask_df(Y_df, n_hours):
    """Flag the last `n_hours` rows of every series.

    Parameters
    ----------
    Y_df: pd.DataFrame
        Long-format frame with at least the columns 'unique_id' and 'ds'.
        It is not modified.
    n_hours: int
        Number of trailing rows (by 'ds') to flag in each 'unique_id'.

    Returns
    -------
    pd.DataFrame with columns ['unique_id', 'ds', 'mask'], sorted by
    ['unique_id', 'ds'], where 'mask' is 1 on the last `n_hours` rows of each
    series and 0 on the rest. Callers that need "1 = train" take the
    complement (1 - mask).

    Raises
    ------
    AssertionError if the result does not have as many rows as `Y_df`
    (e.g. duplicated ('unique_id', 'ds') pairs blow up the merge).
    """
    keys = ['unique_id', 'ds']

    # Newest rows first, so head(n_hours) keeps the tail of every series.
    # assign() builds a new frame instead of writing into a groupby slice.
    last_df = (Y_df[keys]
               .sort_values(by=keys, ascending=False)
               .reset_index(drop=True)
               .groupby('unique_id')
               .head(n_hours)
               .assign(mask=1))

    # Rows absent from last_df come back as NaN: those are the leading rows.
    mask_df = Y_df.merge(last_df, on=keys, how='left')
    mask_df['mask'] = mask_df['mask'].fillna(0)

    mask_df = mask_df[keys + ['mask']].sort_values(by=keys)

    assert len(mask_df)==len(Y_df), \
        f'The mask_df length {len(mask_df)} is not equal to Y_df length {len(Y_df)}'

    return mask_df

# Hold-out length; multiplied by 24 below to obtain the number of hourly rows.
val_ds = 2 * 365
# Series used as a small namespace so the groups can be read as args.dataset.
args = pd.Series({'dataset': ['NP', 'PJM', 'BE', 'FR']})

# NOTE(review): EPF.load_groups is project code not shown in this notebook;
# presumably returns the target, temporal-exogenous and static frames — confirm.
Y_df, Xt_df, S_df = EPF.load_groups(directory='data', groups=args.dataset)

# 'mask' is 1 on the last val_ds*24 rows of every series; sample_mask is its
# complement, so those rows are the ones excluded from sampling.
mask_df = get_last_n_hours_mask_df(Y_df, n_hours=val_ds*24)
mask_df['available_mask'] = np.ones(len(mask_df))
mask_df['sample_mask'] = (1-mask_df['mask'].values)
mask = mask_df['sample_mask'].values

#print(f'Dataset: {args.dataset}')
#print("Xt_df.columns", Xt_df.columns)
# Shapes of the loaded frames.
print('X: time series features, of shape (#hours, #times,#features): \t' + str(Xt_df.shape))
if S_df is not None:
    print('S: static features, of shape (#series,#features+unique_id): \t' + str(S_df.shape))
print('Y: target series (in X), of shape (#hours, #times): \t \t' + str(Y_df.shape))
print("\n")

# Date range covered by each side of the split, per series, and the share of
# rows with sample_mask == 1 (reported as "insample") versus == 0 ("outsample").
print("Train Validation splits")
print(mask_df.groupby(['mask', 'unique_id']).ds.max())
print(mask_df.groupby(['mask', 'unique_id']).ds.min())
print(f'Train insample percentage {np.round(sum(mask)/len(Y_df),2)}, \
        {sum(mask)} hours = {np.round(sum(mask)/(24*365),2)} years')
print(f'Train outsample percentage {np.round(sum(1-mask)/len(Y_df),2)}, \
        {sum(1-mask)} hours = {np.round(sum(1-mask)/(24*365),2)} years')
#Y_df.head()
print('\n')

# Build the dataset; the constructor prints its own split summary as well.
dataset = TimeSeriesDataset(Y_df=Y_df, S_df=S_df, X_df=Xt_df, mask_df=mask_df)

In [None]:
# Channel names: 'y', the X columns, then 'available_mask' and 'sample_mask'.
dataset.t_cols

In [None]:
# sample_mask channel of the first series, located through its name in t_cols.
dataset.ts_tensor[0, dataset.t_cols.index('sample_mask'), :]

In [None]:
# Checking the sample_mask of series 0
dataset.ts_tensor[0, dataset.t_cols.index('sample_mask'), :]

In [None]:
# Target values ('y' channel) of the first series.
dataset.ts_tensor[0, dataset.t_cols.index('y'), :]

In [None]:
# Expected layout: (n_series, n_channels, max_len).
print("dataset.ts_tensor.shape", dataset.ts_tensor.shape)

In [None]:
# With offset=10 and output_size=12 the requested window ends 2 steps past
# max_len, so the returned right_padding is 2.
dataset.get_filtered_ts_tensor(offset=10, output_size=12, window_sampling_limit=36)