# 📑 Tutorials for PyPOTS Clustering Models

## 📀 Preparing the **PhysioNet-2012** dataset for this tutorial

In [1]:
from pypots.data.generating import gene_physionet2012

# Load the PhysioNet-2012 dataset. artificially_missing_rate=0 disables the artificially-missing
# values: the clustering results in this tutorial are scored against the class labels instead.
physionet2012_dataset = gene_physionet2012(artificially_missing_rate=0)

# Take a look at the generated PhysioNet-2012 dataset: the printed keys show that the
# train/val/test splits, their labels, and a scaler have already been prepared for you.
print(physionet2012_dataset.keys())


  from .autonotebook import tqdm as notebook_tqdm
2023-05-17 00:01:10 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Database)...
2023-05-17 00:01:10 [INFO]: Starting preprocessing physionet_2012...


Dataset physionet_2012 has already been downloaded. Processing directly...
Dataset physionet_2012 has already been cached. Loading from cache directly...
Loaded successfully!
dict_keys(['n_classes', 'n_steps', 'n_features', 'train_X', 'train_y', 'val_X', 'val_y', 'test_X', 'test_y', 'scaler'])


In [2]:
# Assemble the datasets for training and testing.
import numpy as np

# No validation set is passed to the clustering models in this tutorial,
# so the validation split is merged into the training data.
train_and_val_X = [physionet2012_dataset['train_X'], physionet2012_dataset['val_X']]
train_and_val_y = [physionet2012_dataset['train_y'], physionet2012_dataset['val_y']]

dataset_for_training = {
    "X": np.concatenate(train_and_val_X, axis=0),
    "y": np.concatenate(train_and_val_y, axis=0),
}

# The test split is used as-is.
dataset_for_testing = {
    "X": physionet2012_dataset['test_X'],
    "y": physionet2012_dataset['test_y'],
}


## 🚀 An example of **CRLI** for clustering

In [3]:
from pypots.optim import Adam
from pypots.clustering import CRLI

# initialize the model
crli = CRLI(
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    # use the number of ground-truth classes as the number of clusters
    n_clusters=physionet2012_dataset["n_classes"],
    n_generator_layers=2,
    rnn_hidden_size=256,
    rnn_cell_type="GRU",
    decoder_fcn_output_dims=[256, 128],  # the output dimensions of the layers in the decoder FCN.
    # Here it means there are 3 layers. Leaving it as the default None results in
    # the FCN having only one layer.
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluating loss doesn't decrease for 3 epochs.
    # You can leave it as the default None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it as the default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it as the default 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # If you leave device as the default, PyPOTS will automatically assign the best device for you.
    # It is explicitly set to 'cpu' here, which is the choice for machines without CUDA devices.
    # You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices.
    device='cpu',
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/clustering/crli",
    # only save the best model after training finished.
    # You can also set it to "better" to save each model that performs better than before during training.
    model_saving_strategy="best",
)


2023-05-17 00:01:22 [INFO]: the trained model will be saved to tutorial_results/clustering/crli/20230517_T000122
2023-05-17 00:01:22 [INFO]: the tensorboard file will be saved to tutorial_results/clustering/crli/20230517_T000122/tensorboard
2023-05-17 00:01:22 [INFO]: Model initialized successfully with the number of trainable parameters: 1,546,820


In [4]:
# Train the model on the training set. No validation set is passed here: the original
# validation split was merged into dataset_for_training when the datasets were assembled.
crli.fit(train_set=dataset_for_training)


2023-05-17 00:03:22 [INFO]: epoch 0: training loss_generator 3.7100, train loss_discriminator 0.3918
2023-05-17 00:05:23 [INFO]: epoch 1: training loss_generator 4.8573, train loss_discriminator 0.3700
2023-05-17 00:07:26 [INFO]: epoch 2: training loss_generator 4.6205, train loss_discriminator 0.3625
2023-05-17 00:09:16 [INFO]: epoch 3: training loss_generator 4.4745, train loss_discriminator 0.3566
2023-05-17 00:09:16 [INFO]: Exceeded the training patience. Terminating the training procedure...
2023-05-17 00:09:16 [INFO]: Finished training.
2023-05-17 00:09:16 [INFO]: Saved the model to tutorial_results/clustering/crli/20230517_T000122/CRLI.pypots.


In [5]:
# The testing stage: cluster the samples in the test set. The returned predictions are
# compared with the ground-truth labels in the next cell.
crli_prediction = crli.cluster(dataset_for_testing)


In [6]:
from pypots.utils.metrics import cal_rand_index, cal_cluster_purity

# Score the CRLI clustering result against the ground-truth labels of the test set,
# using the Rand index (RI) and the cluster purity (CP).
test_labels = dataset_for_testing["y"]
RI = cal_rand_index(crli_prediction, test_labels)
CP = cal_cluster_purity(crli_prediction, test_labels)

# Build the report first, then print it in one go.
metrics_report = (
    "Testing clustering metrics: \n"
    f'RI: {RI}, \n'
    f'CP: {CP}\n'
)
print(metrics_report)


Testing clustering metrics: 
RI: 0.7470594150388848, 
CP: 0.8519599666388658



## 🚀 An example of **VaDER** for clustering

In [7]:
from pypots.optim import Adam
from pypots.clustering import VaDER

# initialize the model
vader = VaDER(
    n_steps=physionet2012_dataset["n_steps"],
    n_features=physionet2012_dataset["n_features"],
    # use the number of ground-truth classes as the number of clusters
    n_clusters=physionet2012_dataset["n_classes"],
    rnn_hidden_size=128,
    d_mu_stddev=2,
    pretrain_epochs=20,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluating loss doesn't decrease for 3 epochs.
    # You can leave it as the default None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify the model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it as the default. It will initialize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.DataLoader. It's the number of subprocesses to use for data loading.
    # Leaving it as the default 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your data loading is a bottleneck to your model training speed
    num_workers=0,
    # If you leave device as the default, PyPOTS will automatically assign the best device for you.
    # It is explicitly set to 'cpu' here, which is the choice for machines without CUDA devices.
    # You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices.
    device='cpu',
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/clustering/vader",
    # only save the best model after training finished.
    # You can also set it to "better" to save each model that performs better than before during training.
    model_saving_strategy="best",
)


2023-05-17 00:09:21 [INFO]: the trained model will be saved to tutorial_results/clustering/vader/20230517_T000921
2023-05-17 00:09:21 [INFO]: the tensorboard file will be saved to tutorial_results/clustering/vader/20230517_T000921/tensorboard
2023-05-17 00:09:21 [INFO]: Model initialized successfully with the number of trainable parameters: 293,642


In [8]:
# Train the model on the training set. No validation set is passed here: the original
# validation split was merged into dataset_for_training when the datasets were assembled.
# NOTE(review): the run recorded below logs `training loss nan` at epoch 6, right before early
# stopping ends the run. Which weights end up in the saved model is not visible from here —
# TODO confirm, and investigate the nan (e.g. by trying a smaller learning rate).
vader.fit(train_set=dataset_for_training)


2023-05-17 00:16:40 [INFO]: epoch 0: training loss 1.1245
2023-05-17 00:16:56 [INFO]: epoch 1: training loss 0.7835
2023-05-17 00:17:12 [INFO]: epoch 2: training loss 0.7712
2023-05-17 00:17:28 [INFO]: epoch 3: training loss 0.7709
2023-05-17 00:17:43 [INFO]: epoch 4: training loss 0.7951
2023-05-17 00:17:58 [INFO]: epoch 5: training loss 0.7826
2023-05-17 00:18:14 [INFO]: epoch 6: training loss nan
2023-05-17 00:18:14 [INFO]: Exceeded the training patience. Terminating the training procedure...
2023-05-17 00:18:14 [INFO]: Finished training.
2023-05-17 00:18:14 [INFO]: Saved the model to tutorial_results/clustering/vader/20230517_T000921/VaDER.pypots.


In [9]:
# The testing stage: cluster the samples in the test set. The returned predictions are
# compared with the ground-truth labels in the next cell.
vader_prediction = vader.cluster(dataset_for_testing)


In [10]:
from pypots.utils.metrics import cal_rand_index, cal_cluster_purity

# Score the VaDER clustering result against the ground-truth labels of the test set,
# using the Rand index (RI) and the cluster purity (CP).
test_labels = dataset_for_testing["y"]
RI = cal_rand_index(vader_prediction, test_labels)
CP = cal_cluster_purity(vader_prediction, test_labels)

# Build the report first, then print it in one go.
metrics_report = (
    "Testing clustering metrics: \n"
    f'RI: {RI}, \n'
    f'CP: {CP},\n'
)
print(metrics_report)


Testing clustering metrics: 
RI: 0.7476464012041741, 
CP: 0.8519599666388658,

