In [59]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from pygrinder import mcar, masked_fill
from pypots.imputation import SAITS, CSDI
from pypots.data import load_specific_dataset
from pypots.utils.metrics import cal_mae
import pandas as pd

In [38]:
data = pd.read_csv('data/vazoes_CA_20_23.csv')

In [82]:
Y = data['Vazao1_CA_30d']

In [40]:
type(Y)

numpy.ndarray

In [47]:
Y

array([ 335.,  329.,  278., ..., 2036., 1004.,  593.])

In [48]:
missing_mask = [1 if y_ is np.nan else 0 for y_ in Y]

In [53]:
missing_mask = np.array(missing_mask)

In [54]:
y = masked_fill(Y, 1 - missing_mask, np.nan)

In [55]:
dataset = {"X": y}

In [56]:
dataset

{'X': array([nan, nan, nan, ..., nan, nan, nan])}

In [83]:
array_univariado = Y.values.reshape(1, len(Y), 1)

In [84]:
saits = SAITS(n_steps=48, n_features=0, n_layers=1, d_model=256, d_inner=128, n_heads=4, d_k=64, d_v=64, dropout=0.1, epochs=10)

2023-11-03 19:59:52 [INFO]: No given device, using default device: cpu
2023-11-03 19:59:52 [INFO]: Model initialized successfully with the number of trainable parameters: 658,688


In [86]:
array_univariado

array([[[ 335.],
        [ 329.],
        [ 278.],
        ...,
        [2036.],
        [1004.],
        [ 593.]]])

In [87]:
saits.fit({"X": array_univariado})

2023-11-03 20:00:36 [ERROR]: Exception: mat1 and mat2 shapes cannot be multiplied (8633x2 and 0x256)


RuntimeError: Training got interrupted. Model was not trained. Please investigate the error printed above.

In [32]:
data = load_specific_dataset('physionet_2012')  # PyPOTS will automatically download and extract it.
X = data['X']
num_samples = len(X['RecordID'].unique())
X = X.drop(['RecordID', 'Time'], axis = 1)
X = StandardScaler().fit_transform(X.to_numpy())

2023-11-03 19:08:23 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Database)...
2023-11-03 19:08:23 [INFO]: Starting preprocessing physionet_2012...
2023-11-03 19:08:23 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2023-11-03 19:08:23 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2023-11-03 19:08:23 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2023-11-03 19:08:24 [INFO]: Loaded successfully!


In [33]:
X = X.reshape(num_samples, 48, -1)
X_intact, X, missing_mask, indicating_mask = mcar(X, 0.1) # hold out 10% observed values as ground truth
X = masked_fill(X, 1 - missing_mask, np.nan)

In [36]:
1 - missing_mask,

(array([[[1., 1., 1., ..., 1., 0., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]],
 
        [[1., 1., 1., ..., 1., 0., 1.],
         [1., 1., 1., ..., 0., 1., 0.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 0., 0.],
         [1., 1., 1., ..., 1., 0., 1.],
         [1., 1., 1., ..., 1., 0., 1.]],
 
        [[1., 1., 1., ..., 1., 0., 1.],
         [0., 0., 0., ..., 0., 1., 1.],
         [1., 1., 1., ..., 1., 0., 1.],
         ...,
         [1., 1., 1., ..., 1., 0., 1.],
         [1., 1., 1., ..., 1., 0., 1.],
         [1., 1., 1., ..., 1., 0., 1.]],
 
        ...,
 
        [[1., 1., 1., ..., 1., 0., 0.],
         [1., 1., 1., ..., 0., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 0., 1.],
         [1., 1., 1., ..., 1., 1., 1.],


In [46]:
len(X)

11988

In [61]:
data.columns

Index(['Unnamed: 0', 'Data', 'Vazao_CA', 'Vazao1_CA_1d', 'Vazao2_CA_1d',
       'Vazao1_CA_7d', 'Vazao2_CA_7d', 'Vazao1_CA_15d', 'Vazao2_CA_15d',
       'Vazao1_CA_30d', 'Vazao2_CA_30d'],
      dtype='object')

In [65]:
V = data[['Vazao_CA', 'Vazao1_CA_1d']]

In [66]:
len(V)

8633

In [64]:
from pypots.optim import Adam
from pypots.imputation import MRNN
from pypots.utils.metrics import cal_mae

# initialize the model
# initialize the model
mrnn = MRNN(
    n_steps=len(V),
    n_features=1,
    rnn_hidden_size=128,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # Set it to None to use the default device (will use CPU if you don't have CUDA devices).
    # You can also set it to 'cpu' or 'cuda' explicitly, or ['cuda:0', 'cuda:1'] if you have multiple CUDA devices.
    device=None,
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/mrnn",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)

2023-11-03 19:29:50 [INFO]: No given device, using default device: cpu
2023-11-03 19:29:50 [INFO]: Model files will be saved to tutorial_results/imputation/mrnn/20231103_T192950
2023-11-03 19:29:50 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/mrnn/20231103_T192950/tensorboard
2023-11-03 19:29:50 [INFO]: Model initialized successfully with the number of trainable parameters: 168,199


In [68]:
dataset_for_training = V[:7000]

In [69]:
dataset_for_validating = V[7000:]

In [71]:
dataset_for_training

Unnamed: 0,Vazao_CA,Vazao1_CA_1d
0,335,335.0
1,329,329.0
2,278,278.0
3,250,250.0
4,183,183.0
...,...,...
6995,67,67.0
6996,347,347.0
6997,390,390.0
6998,333,333.0


In [70]:
mrnn.fit(train_set=dataset_for_training, val_set=dataset_for_validating)

KeyError: 'X'

In [73]:
from pypots.data.generating import gene_physionet2012
physionet2012_dataset = gene_physionet2012(artificially_missing_rate=0.1)

2023-11-03 19:33:27 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Database)...
2023-11-03 19:33:27 [INFO]: Starting preprocessing physionet_2012...
2023-11-03 19:33:27 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2023-11-03 19:33:27 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2023-11-03 19:33:27 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2023-11-03 19:33:27 [INFO]: Loaded successfully!


In [75]:
len(physionet2012_dataset['train_X'])

7672

In [79]:
physionet2012_dataset['val_X_intact'].shape

(1918, 48, 37)

In [80]:
physionet2012_dataset['train_X'].shape

(7672, 48, 37)