<a href="https://colab.research.google.com/github/Squarkk/ECG-classification-solution/blob/main/time_series_clsf_autoencoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install hdbscan

Collecting hdbscan
  Downloading hdbscan-0.8.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cython<3,>=0.27 (from hdbscan)
  Downloading Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cython, hdbscan
  Attempting uninstall: cython
    Found existing installation: Cython 3.0.10
    Uninstalling Cython-3.0.10:
      Successfully uninstalled Cython-3.0.10
Successfully installed cython-0.29.37 hdbscan-0.8.36


# Utils

In [2]:
import requests
import zipfile
from io import BytesIO
import pandas as pd

import numpy as np
import torch
import random
import json

#### UTILS.PY

def download_and_unzip(url="https://www.cs.ucr.edu/~eamonn/time_series_data/UCR_TS_Archive_2015.zip", password="attempttoclassify"):
    # Create the INPUT directory if it does not exist
    input_dir = 'INPUT'
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    # Download the zip file
    response = requests.get(url)
    if response.status_code == 200:
        # Unzip the file into the INPUT directory
        with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
            if password:
                zip_ref.setpassword(password.encode('utf-8'))
            zip_ref.extractall(input_dir)
        print(f"Files have been successfully unzipped into the '{input_dir}' folder.")
    else:
        print(f"Failed to download the file. Status code: {response.status_code}")


def open_data(direc, ratio_train=0.8, dataset="ECG5000"):
    """Input:
    direc: location of the UCR archive
    ratio_train: ratio to split training and testset
    dataset: name of the dataset in the UCR archive"""
    datadir = direc + '/' + dataset + '/' + dataset
    data_train = np.loadtxt(datadir + '_TRAIN', delimiter=',')
    data_test_val = np.loadtxt(datadir + '_TEST', delimiter=',')[:-1]
    data = np.concatenate((data_train, data_test_val), axis=0)
    data = np.expand_dims(data, -1)

    N, D, _ = data.shape

    ind_cut = int(ratio_train * N)
    ind = np.random.permutation(N)
    return data[ind[:ind_cut], 1:, :], data[ind[ind_cut:], 1:, :], data[ind[:ind_cut], 0, :], data[ind[ind_cut:], 0, :]

# Set random seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False




# Architecture

In [3]:
import os
from sklearn.base import BaseEstimator as SklearnBaseEstimator
import numpy as np
import torch
from torch import nn, optim
from torch import distributions
from torch.utils.data import DataLoader,TensorDataset
from torch.autograd import Variable


###### VRAE ARCHITECTURE
class BaseEstimator(SklearnBaseEstimator):
    # http://msmbuilder.org/development/apipatterns.html

    def summarize(self):
        return 'NotImplemented'


class Encoder(nn.Module):
    def __init__(self, number_of_features, hidden_size, hidden_layer_depth, latent_length, dropout, block='LSTM'):
        super(Encoder, self).__init__()

        self.number_of_features = number_of_features
        self.hidden_size = hidden_size
        self.hidden_layer_depth = hidden_layer_depth
        self.latent_length = latent_length

        if block == 'LSTM':
            self.model = nn.LSTM(self.number_of_features, self.hidden_size, self.hidden_layer_depth, dropout=dropout)
        elif block == 'GRU':
            self.model = nn.GRU(self.number_of_features, self.hidden_size, self.hidden_layer_depth, dropout=dropout)
        else:
            raise NotImplementedError

        self.block = block

    def forward(self, x):
        if self.block == 'LSTM':
            _, (h_end, _) = self.model(x)
        elif self.block == 'GRU':
            _, h_end = self.model(x)

        h_end = h_end[-1, :, :]
        return h_end


class Lambda(nn.Module):
    """Lambda module converts output of encoder to latent vector

    :param hidden_size: hidden size of the encoder
    :param latent_length: latent vector length
    """
    def __init__(self, hidden_size, latent_length):
        super(Lambda, self).__init__()

        self.hidden_size = hidden_size
        self.latent_length = latent_length

        self.hidden_to_mean = nn.Linear(self.hidden_size, self.latent_length)
        self.hidden_to_logvar = nn.Linear(self.hidden_size, self.latent_length)

        nn.init.xavier_uniform_(self.hidden_to_mean.weight)
        nn.init.xavier_uniform_(self.hidden_to_logvar.weight)

    def forward(self, cell_output):
        """Given last hidden state of encoder, passes through a linear layer, and finds the mean and variance

        :param cell_output: last hidden state of encoder
        :return: latent vector
        """

        self.latent_mean = self.hidden_to_mean(cell_output)
        self.latent_logvar = self.hidden_to_logvar(cell_output)

        if self.training:
            std = torch.exp(0.5 * self.latent_logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(self.latent_mean)
        else:
            return self.latent_mean

class Decoder(nn.Module):
    """Converts latent vector into output

    :param sequence_length: length of the input sequence
    :param batch_size: batch size of the input sequence
    :param hidden_size: hidden size of the RNN
    :param hidden_layer_depth: number of layers in RNN
    :param latent_length: latent vector length
    :param output_size: 2, one representing the mean, other log std dev of the output
    :param block: GRU/LSTM - use the same which you've used in the encoder
    :param dtype: Depending on cuda enabled/disabled, create the tensor
    """
    def __init__(self, sequence_length, batch_size, hidden_size, hidden_layer_depth, latent_length, output_size, dtype, block='LSTM'):

        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.hidden_layer_depth = hidden_layer_depth
        self.latent_length = latent_length
        self.output_size = output_size
        self.dtype = dtype

        if block == 'LSTM':
            self.model = nn.LSTM(1, self.hidden_size, self.hidden_layer_depth)
        elif block == 'GRU':
            self.model = nn.GRU(1, self.hidden_size, self.hidden_layer_depth)
        else:
            raise NotImplementedError

        self.latent_to_hidden = nn.Linear(self.latent_length, self.hidden_size)
        self.hidden_to_output = nn.Linear(self.hidden_size, self.output_size)

        self.decoder_inputs = torch.zeros(self.sequence_length, self.batch_size, 1, requires_grad=True).type(self.dtype)
        self.c_0 = torch.zeros(self.hidden_layer_depth, self.batch_size, self.hidden_size, requires_grad=True).type(self.dtype)

        nn.init.xavier_uniform_(self.latent_to_hidden.weight)
        nn.init.xavier_uniform_(self.hidden_to_output.weight)

    def forward(self, latent):
        """Converts latent to hidden to output

        :param latent: latent vector
        :return: outputs consisting of mean and std dev of vector
        """
        h_state = self.latent_to_hidden(latent)

        if isinstance(self.model, nn.LSTM):
            h_0 = torch.stack([h_state for _ in range(self.hidden_layer_depth)])
            decoder_output, _ = self.model(self.decoder_inputs, (h_0, self.c_0))
        elif isinstance(self.model, nn.GRU):
            h_0 = torch.stack([h_state for _ in range(self.hidden_layer_depth)])
            decoder_output, _ = self.model(self.decoder_inputs, h_0)
        else:
            raise NotImplementedError

        out = self.hidden_to_output(decoder_output)
        return out

def _assert_no_grad(tensor):
    assert not tensor.requires_grad, \
        "nn criterions don't compute the gradient w.r.t. targets - please " \
        "mark these tensors as not requiring gradients"

class VRAE(BaseEstimator, nn.Module):
    """Variational recurrent auto-encoder. This module is used for dimensionality reduction of timeseries

    :param sequence_length: length of the input sequence
    :param number_of_features: number of input features
    :param hidden_size:  hidden size of the RNN
    :param hidden_layer_depth: number of layers in RNN
    :param latent_length: latent vector length
    :param batch_size: number of timeseries in a single batch
    :param learning_rate: the learning rate of the module
    :param block: GRU/LSTM to be used as a basic building block
    :param n_epochs: Number of iterations/epochs
    :param dropout_rate: The probability of a node being dropped-out
    :param optimizer: ADAM/ SGD optimizer to reduce the loss function
    :param loss: SmoothL1Loss / MSELoss / ReconLoss / any custom loss which inherits from `_Loss` class
    :param boolean cuda: to be run on GPU or not
    :param print_every: The number of iterations after which loss should be printed
    :param boolean clip: Gradient clipping to overcome explosion
    :param max_grad_norm: The grad-norm to be clipped
    :param dload: Download directory where models are to be dumped
    """
    def __init__(self, sequence_length, number_of_features, hidden_size=90, hidden_layer_depth=2, latent_length=20,
                 batch_size=32, learning_rate=0.005, block='LSTM',
                 n_epochs=5, dropout_rate=0., optimizer='Adam', loss='MSELoss',
                 cuda=False, print_every=100, clip=True, max_grad_norm=5, dload='.'):

        super(VRAE, self).__init__()


        self.dtype = torch.FloatTensor
        self.use_cuda = cuda

        if not torch.cuda.is_available() and self.use_cuda:
            self.use_cuda = False


        if self.use_cuda:
            self.dtype = torch.cuda.FloatTensor


        self.encoder = Encoder(number_of_features = number_of_features,
                               hidden_size=hidden_size,
                               hidden_layer_depth=hidden_layer_depth,
                               latent_length=latent_length,
                               dropout=dropout_rate,
                               block=block)

        self.lmbd = Lambda(hidden_size=hidden_size,
                           latent_length=latent_length)

        self.decoder = Decoder(sequence_length=sequence_length,
                               batch_size = batch_size,
                               hidden_size=hidden_size,
                               hidden_layer_depth=hidden_layer_depth,
                               latent_length=latent_length,
                               output_size=number_of_features,
                               block=block,
                               dtype=self.dtype)

        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.hidden_layer_depth = hidden_layer_depth
        self.latent_length = latent_length
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs

        self.print_every = print_every
        self.clip = clip
        self.max_grad_norm = max_grad_norm
        self.is_fitted = False
        self.dload = dload

        if self.use_cuda:
            self.cuda()

        if optimizer == 'Adam':
            self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        elif optimizer == 'SGD':
            self.optimizer = optim.SGD(self.parameters(), lr=learning_rate)
        else:
            raise ValueError('Not a recognized optimizer')

        if loss == 'SmoothL1Loss':
            self.loss_fn = nn.SmoothL1Loss(size_average=False)
        elif loss == 'MSELoss':
            self.loss_fn = nn.MSELoss(size_average=False)

    def __repr__(self):
        return """VRAE(n_epochs={n_epochs},batch_size={batch_size},cuda={cuda})""".format(
                n_epochs=self.n_epochs,
                batch_size=self.batch_size,
                cuda=self.use_cuda)

    def forward(self, x):
        """
        Forward propagation which involves one pass from inputs to encoder to lambda to decoder

        :param x:input tensor
        :return: the decoded output, latent vector
        """
        cell_output = self.encoder(x)
        latent = self.lmbd(cell_output)
        x_decoded = self.decoder(latent)

        return x_decoded, latent

    def _rec(self, x_decoded, x, loss_fn):
        """
        Compute the loss given output x decoded, input x and the specified loss function

        :param x_decoded: output of the decoder
        :param x: input to the encoder
        :param loss_fn: loss function specified
        :return: joint loss, reconstruction loss and kl-divergence loss
        """
        latent_mean, latent_logvar = self.lmbd.latent_mean, self.lmbd.latent_logvar

        kl_loss = -0.5 * torch.mean(1 + latent_logvar - latent_mean.pow(2) - latent_logvar.exp())
        recon_loss = loss_fn(x_decoded, x)

        return kl_loss + recon_loss, recon_loss, kl_loss

    def compute_loss(self, X):
        """
        Given input tensor, forward propagate, compute the loss, and backward propagate.
        Represents the lifecycle of a single iteration

        :param X: Input tensor
        :return: total loss, reconstruction loss, kl-divergence loss and original input
        """
        x = Variable(X[:,:,:].type(self.dtype), requires_grad = True)

        x_decoded, _ = self(x)
        loss, recon_loss, kl_loss = self._rec(x_decoded, x.detach(), self.loss_fn)

        return loss, recon_loss, kl_loss, x


    def _train(self, train_loader):
        """
        For each epoch, given the batch_size, run this function batch_size * num_of_batches number of times

        :param train_loader:input train loader with shuffle
        :return:
        """
        self.train()

        epoch_loss = 0
        t = 0

        for t, X in enumerate(train_loader):

            # Index first element of array to return tensor
            X = X[0]

            # required to swap axes, since dataloader gives output in (batch_size x seq_len x num_of_features)
            X = X.permute(1,0,2)

            self.optimizer.zero_grad()
            loss, recon_loss, kl_loss, _ = self.compute_loss(X)
            loss.backward()

            if self.clip:
                torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm = self.max_grad_norm)

            # accumulator
            epoch_loss += loss.item()

            self.optimizer.step()

            if (t + 1) % self.print_every == 0:
                print('Batch %d, loss = %.4f, recon_loss = %.4f, kl_loss = %.4f' % (t + 1, loss.item(),
                                                                                    recon_loss.item(), kl_loss.item()))

        print('Average loss: {:.4f}'.format(epoch_loss / t))


    def fit(self, dataset, save = False):
        """
        Calls `_train` function over a fixed number of epochs, specified by `n_epochs`

        :param dataset: `Dataset` object
        :param bool save: If true, dumps the trained model parameters as pickle file at `dload` directory
        :return:
        """

        train_loader = DataLoader(dataset = dataset,
                                  batch_size = self.batch_size,
                                  shuffle = True,
                                  drop_last=True,
                                  worker_init_fn=np.random.seed(12))

        for i in range(self.n_epochs):
            print('Epoch: %s' % i)

            self._train(train_loader)

        self.is_fitted = True
        if save:
            self.save('model.pth')


    def _batch_transform(self, x):
        """
        Passes the given input tensor into encoder and lambda function

        :param x: input batch tensor
        :return: intermediate latent vector
        """
        return self.lmbd(
                    self.encoder(
                        Variable(x.type(self.dtype), requires_grad = False)
                    )
        ).cpu().data.numpy()

    def _batch_reconstruct(self, x):
        """
        Passes the given input tensor into encoder, lambda and decoder function

        :param x: input batch tensor
        :return: reconstructed output tensor
        """

        x = Variable(x.type(self.dtype), requires_grad = False)
        x_decoded, _ = self(x)

        return x_decoded.cpu().data.numpy()

    def reconstruct(self, dataset, save = False):
        """
        Given input dataset, creates dataloader, runs dataloader on `_batch_reconstruct`
        Prerequisite is that model has to be fit

        :param dataset: input dataset who's output vectors are to be obtained
        :param bool save: If true, dumps the output vector dataframe as a pickle file
        :return:
        """

        self.eval()

        test_loader = DataLoader(dataset = dataset,
                                 batch_size = self.batch_size,
                                 shuffle = False,
                                 drop_last=True) # Don't shuffle for test_loader

        if self.is_fitted:
            with torch.no_grad():
                x_decoded = []

                for t, x in enumerate(test_loader):
                    x = x[0]
                    x = x.permute(1, 0, 2)

                    x_decoded_each = self._batch_reconstruct(x)
                    x_decoded.append(x_decoded_each)

                x_decoded = np.concatenate(x_decoded, axis=1)

                if save:
                    if os.path.exists(self.dload):
                        pass
                    else:
                        os.mkdir(self.dload)
                    x_decoded.dump(self.dload + '/z_run.pkl')
                return x_decoded

        raise RuntimeError('Model needs to be fit')


    def transform(self, dataset, save = False):
        """
        Given input dataset, creates dataloader, runs dataloader on `_batch_transform`
        Prerequisite is that model has to be fit

        :param dataset: input dataset who's latent vectors are to be obtained
        :param bool save: If true, dumps the latent vector dataframe as a pickle file
        :return:
        """
        self.eval()

        test_loader = DataLoader(dataset = dataset,
                                 batch_size = self.batch_size,
                                 shuffle = False,
                                 drop_last=True) # Don't shuffle for test_loader
        if self.is_fitted:
            with torch.no_grad():
                z_run = []

                for t, x in enumerate(test_loader):
                    x = x[0]
                    x = x.permute(1, 0, 2)

                    z_run_each = self._batch_transform(x)
                    z_run.append(z_run_each)

                z_run = np.concatenate(z_run, axis=0)
                if save:
                    if os.path.exists(self.dload):
                        pass
                    else:
                        os.mkdir(self.dload)
                    z_run.dump(self.dload + '/z_run.pkl')
                return z_run

        raise RuntimeError('Model needs to be fit')

    def fit_transform(self, dataset, save = False):
        """
        Combines the `fit` and `transform` functions above

        :param dataset: Dataset on which fit and transform have to be performed
        :param bool save: If true, dumps the model and latent vectors as pickle file
        :return: latent vectors for input dataset
        """
        self.fit(dataset, save = save)
        return self.transform(dataset, save = save)

    def compute_dataset_loss(self, dataset):
        """
        Computes the average reconstruction loss and KL divergence loss for the given dataset.

        :param dataset: input dataset
        :return: average reconstruction loss and average KL divergence loss
        """
        self.eval()
        test_loader = DataLoader(dataset=dataset,
                                 batch_size=self.batch_size,
                                 shuffle=False,
                                 drop_last=True)
        total_recon_loss = 0
        total_kl_loss = 0
        total_batches = 0

        with torch.no_grad():
            for t, x in enumerate(test_loader):
                x = x[0]
                x = x.permute(1, 0, 2)
                _, recon_loss, kl_loss, _ = self.compute_loss(x)
                total_recon_loss += recon_loss.item()
                total_kl_loss += kl_loss.item()
                total_batches += 1

        avg_recon_loss = total_recon_loss / total_batches
        avg_kl_loss = total_kl_loss / total_batches

        return avg_recon_loss, avg_kl_loss

    def save(self, file_name):
        """
        Pickles the model parameters to be retrieved later

        :param file_name: the filename to be saved as,`dload` serves as the download directory
        :return: None
        """
        PATH = self.dload + '/' + file_name
        if os.path.exists(self.dload):
            pass
        else:
            os.mkdir(self.dload)
        torch.save(self.state_dict(), PATH)

    def load(self, PATH):
        """
        Loads the model's parameters from the path mentioned

        :param PATH: Should contain pickle file
        :return: None
        """
        self.is_fitted = True
        self.load_state_dict(torch.load(PATH))

    def predict(self, df):
        """
        Predict the latent vector for a single row pandas DataFrame.

        :param df: Pandas DataFrame containing a single row of data
        :return: latent vector as a numpy array
        """
        if df.shape[0] != 1:
            raise ValueError("DataFrame must contain exactly one row.")

        # Drop the first column and convert the DataFrame row to a numpy array
        row_array = df.drop(columns=[0]).to_numpy()

        # Convert the numpy array to a PyTorch tensor
        row_tensor = torch.from_numpy(row_array).float().unsqueeze(2)  # Add feature dimension

        # Add batch dimension
        row_tensor = row_tensor.permute(1, 0, 2)  # Change to (sequence_length, batch_size, number_of_features)

        # If CUDA is enabled, move the tensor to GPU
        if self.use_cuda:
            row_tensor = row_tensor.cuda()

        # Forward pass through the VRAE model
        self.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            latent_vector = self._batch_transform(row_tensor)

        return latent_vector

    def predict_json(self, json_input):
        """
        Predict the latent vector for a single row JSON input.

        :param json_input: JSON string containing a single row of data
        :return: latent vector as a numpy array
        """
        # Parse JSON string to DataFrame
        df = pd.DataFrame.from_records(json.loads(json_input))

        # Ensure the DataFrame contains exactly one row
        if df.shape[0] != 1:
            raise ValueError("Input JSON must contain exactly one row.")

        # Drop the first column and convert the DataFrame row to a numpy array
        row_array = df.drop(columns='0').to_numpy()

        # Convert the numpy array to a PyTorch tensor
        row_tensor = torch.from_numpy(row_array).float().unsqueeze(2)  # Add feature dimension

        # Add batch dimension
        row_tensor = row_tensor.permute(1, 0, 2)  # Change to (sequence_length, batch_size, number_of_features)

        # If CUDA is enabled, move the tensor to GPU
        if self.use_cuda:
            row_tensor = row_tensor.cuda()

        # Forward pass through the VRAE model
        self.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            latent_vector = self._batch_transform(row_tensor)

        return latent_vector


# Train.py

In [4]:
import os
import uuid

import itertools
import numpy as np
import torch
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
import hdbscan

##### -------------------------- USAGES
os.environ['UUID'] = str(uuid.uuid4()).replace("-","")
os.environ['SEED'] = '777'
os.environ['EXPERIMENT_LOOP_FOLDER'] = os.path.join('/content/EXPERIMENTS',os.environ['UUID'])


if os.path.exists('/content/EXPERIMENTS'):
    pass
else:
    os.mkdir('/content/EXPERIMENTS')

os.mkdir(os.environ['EXPERIMENT_LOOP_FOLDER'])


# Download and unzip data (take about 3 minutes)
download_and_unzip()


set_seed(int(os.environ['SEED']))

# Split data into training and validation sets
X_train, X_val, y_train, y_val = open_data('/content/INPUT/UCR_TS_Archive_2015', ratio_train=0.8)

# Define hyperparameter grid
param_grid = {
    'sequence_length':[X_train.shape[1]],
    'number_of_features':[X_train.shape[2]],
    'hidden_size': [90],
    'hidden_layer_depth': [1],
    'latent_length': [20],
    'batch_size': [32],
    'learning_rate': [0.0005],
    'n_epochs': [30,60],
    'dropout_rate': [0.2],
    'optimizer': ['Adam'], #or 'SGD'
    'cuda' : [True],
    'print_every' : [30],
    'clip' : [True],
    'max_grad_norm': [5],
    'loss': ['MSELoss'], #'SmoothL1Loss'
    'block': ['LSTM'] #or 'GRU'
}

# Generate all combinations of hyperparameters
all_combinations = list(itertools.product(*param_grid.values()))
best_noise_ratio = float('-inf')
best_params = None

# Function to train VRAE, apply PCA, and DBSCAN, and evaluate clustering
def train_vrae(params):
    sequence_length,number_of_features,\
    hidden_size, hidden_layer_depth,\
    latent_length, batch_size,\
    learning_rate, n_epochs, dropout_rate,\
    optimizer,cuda,print_every,clip,max_grad_norm,\
    loss, block = params

    print(params)
    # Train VRAE model
    vrae = VRAE(sequence_length=sequence_length,
                number_of_features=number_of_features,
                hidden_size=hidden_size,
                hidden_layer_depth=hidden_layer_depth,
                latent_length=latent_length,
                batch_size=batch_size,
                learning_rate=learning_rate,
                n_epochs=n_epochs,
                dropout_rate=dropout_rate,
                optimizer=optimizer,
                cuda=cuda,
                print_every=print_every,
                clip=clip,
                max_grad_norm=max_grad_norm,
                loss=loss,
                block=block,
                dload=os.environ['EXPERIMENT_LOOP_FOLDER'])

    return vrae

def train_evaluate_clustering(vrae):

    vrae.fit(TensorDataset(torch.from_numpy(X_train[y_train.flatten() == 1])))

    recon_loss, kl_loss = vrae.compute_dataset_loss(TensorDataset(torch.from_numpy(X_val[y_val.flatten() == 1])))

    # FURTHER DIM REDUCTION
    latent_healty = vrae.transform(TensorDataset(torch.from_numpy(X_train[y_train.flatten()==1])))
    svd = TruncatedSVD(n_components=2,random_state=123).fit(latent_healty)
    latent_healty_svd = svd.transform(latent_healty)

    # Apply DBSCAN
    clusterer = hdbscan.HDBSCAN(prediction_data=True, min_samples=10)
    clusters = clusterer.fit_predict(latent_healty_svd)
    labels = clusterer.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    noise_ratio = n_noise / len(labels)

    return noise_ratio, recon_loss, kl_loss

# Grid search
for params in all_combinations:
    noise_ratio, recon_loss, kl_loss = train_evaluate_clustering(train_vrae(params))
    if noise_ratio > best_noise_ratio:
        best_noise_ratio = noise_ratio
        best_params = params

print(f'Best Combined Score: {best_noise_ratio}')
print(f'Best Hyperparameters: {best_params}')

vrae = train_vrae(best_params)
vrae.fit(TensorDataset(torch.from_numpy(X_train[y_train.flatten() == 1])))


# FURTHER DIM REDUCTION
latent_healty = vrae.transform(TensorDataset(torch.from_numpy(X_train[y_train.flatten() == 1])))

# TRAIN SVD ONLY ON TRAIN DATA
svd = TruncatedSVD(n_components=3, random_state=123).fit(latent_healty)
latent_healty_svd = svd.transform(latent_healty)

# ADD TRAIN UNHEALTHY
latent_train_unhealty = vrae.transform(TensorDataset(torch.from_numpy(X_train[y_train.flatten() != 1])))
latent_train_unhealty_svd = svd.transform(latent_train_unhealty)

# ADD HEALTHY FROM VALIDATION SET
latent_valid_healty = vrae.transform(TensorDataset(torch.from_numpy(X_val[y_val.flatten() == 1])))
latent_valid_healty_svd = svd.transform(latent_valid_healty)

# ADD UNHEALTHY FROM VALIDATION SET
latent_valid_unhealty = vrae.transform(TensorDataset(torch.from_numpy(X_val[y_val.flatten() != 1])))
latent_valid_unhealty_svd = svd.transform(latent_valid_unhealty)

# Apply DBSCAN
clusterer = hdbscan.HDBSCAN(prediction_data=True, min_samples=10)
clusters = clusterer.fit_predict(latent_healty_svd)
labels = clusterer.labels_


# CLUSTER EVALUATION
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
#KPI
noise_ratio = n_noise / len(labels)

print(f"Number of clusters: {n_clusters}")
print(f"Noise ratio: {noise_ratio}")

print(os.environ['UUID'])

Files have been successfully unzipped into the 'INPUT' folder.
(140, 1, 90, 1, 20, 32, 0.0005, 30, 0.2, 'Adam', True, 30, True, 5, 'MSELoss', 'LSTM')




Epoch: 0
Batch 30, loss = 4055.7051, recon_loss = 4055.6582, kl_loss = 0.0468
Batch 60, loss = 2038.4369, recon_loss = 2033.8771, kl_loss = 4.5598
Average loss: 3401.3130
Epoch: 1
Batch 30, loss = 2083.1902, recon_loss = 2077.6370, kl_loss = 5.5533
Batch 60, loss = 1746.0364, recon_loss = 1740.4573, kl_loss = 5.5790
Average loss: 1931.9436
Epoch: 2
Batch 30, loss = 1559.7002, recon_loss = 1554.4331, kl_loss = 5.2671
Batch 60, loss = 1403.4897, recon_loss = 1398.6985, kl_loss = 4.7913
Average loss: 1707.1731
Epoch: 3
Batch 30, loss = 1656.1226, recon_loss = 1651.5847, kl_loss = 4.5378
Batch 60, loss = 1877.6520, recon_loss = 1873.3578, kl_loss = 4.2942
Average loss: 1629.7482
Epoch: 4
Batch 30, loss = 1718.9705, recon_loss = 1714.9951, kl_loss = 3.9754
Batch 60, loss = 1482.0906, recon_loss = 1478.4287, kl_loss = 3.6619
Average loss: 1601.1946
Epoch: 5
Batch 30, loss = 1515.2368, recon_loss = 1511.9988, kl_loss = 3.2381
Batch 60, loss = 1594.7300, recon_loss = 1591.8906, kl_loss = 2.839



Batch 60, loss = 2033.2936, recon_loss = 2029.3923, kl_loss = 3.9013
Average loss: 3448.2799
Epoch: 1
Batch 30, loss = 2010.0781, recon_loss = 2004.8118, kl_loss = 5.2663
Batch 60, loss = 1760.4585, recon_loss = 1755.5424, kl_loss = 4.9161
Average loss: 1846.9415
Epoch: 2
Batch 30, loss = 1560.2384, recon_loss = 1555.9917, kl_loss = 4.2467
Batch 60, loss = 1331.6829, recon_loss = 1327.7070, kl_loss = 3.9759
Average loss: 1624.0957
Epoch: 3
Batch 30, loss = 1459.2152, recon_loss = 1455.7908, kl_loss = 3.4244
Batch 60, loss = 1486.1941, recon_loss = 1483.1606, kl_loss = 3.0334
Average loss: 1596.2249
Epoch: 4
Batch 30, loss = 1476.4209, recon_loss = 1473.8748, kl_loss = 2.5462
Batch 60, loss = 1486.2229, recon_loss = 1483.9346, kl_loss = 2.2883
Average loss: 1582.4164
Epoch: 5
Batch 30, loss = 1745.2595, recon_loss = 1743.1750, kl_loss = 2.0845
Batch 60, loss = 1642.7997, recon_loss = 1640.7635, kl_loss = 2.0362
Average loss: 1553.0227
Epoch: 6
Batch 30, loss = 1243.9642, recon_loss = 12



Batch 30, loss = 4165.3457, recon_loss = 4165.2773, kl_loss = 0.0685
Batch 60, loss = 2153.1968, recon_loss = 2148.8196, kl_loss = 4.3773
Average loss: 3490.9983
Epoch: 1
Batch 30, loss = 1851.2626, recon_loss = 1845.8481, kl_loss = 5.4144
Batch 60, loss = 1694.2561, recon_loss = 1689.0214, kl_loss = 5.2347
Average loss: 1835.1660
Epoch: 2
Batch 30, loss = 1586.2043, recon_loss = 1581.6733, kl_loss = 4.5310
Batch 60, loss = 1504.4850, recon_loss = 1500.1086, kl_loss = 4.3763
Average loss: 1642.3132
Epoch: 3
Batch 30, loss = 1449.2900, recon_loss = 1445.1907, kl_loss = 4.0993
Batch 60, loss = 1395.8617, recon_loss = 1391.9448, kl_loss = 3.9169
Average loss: 1630.3020
Epoch: 4
Batch 30, loss = 1425.4846, recon_loss = 1421.9858, kl_loss = 3.4987
Batch 60, loss = 1750.2561, recon_loss = 1746.9788, kl_loss = 3.2774
Average loss: 1600.8189
Epoch: 5
Batch 30, loss = 1964.8654, recon_loss = 1962.0940, kl_loss = 2.7714
Batch 60, loss = 1597.9631, recon_loss = 1595.5082, kl_loss = 2.4550
Average

In [5]:
import pickle
import joblib
vrae.save('vrae.pth')
joblib.dump(svd, os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], 'svd.joblib'))
joblib.dump(clusterer, os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], 'clusterer.joblib'))

# Save best_params
best_param_grid = {
    'sequence_length': best_params[0],
    'number_of_features': best_params[1],
    'hidden_size': best_params[2],
    'hidden_layer_depth': best_params[3],
    'latent_length': best_params[4],
    'batch_size': best_params[5],
    'learning_rate': best_params[6],
    'n_epochs': best_params[7],
    'dropout_rate': best_params[8],
    'optimizer': best_params[9],
    'cuda': best_params[10],
    'print_every': best_params[11],
    'clip': best_params[12],
    'max_grad_norm': best_params[13],
    'loss': best_params[14],
    'block': best_params[15]
}

with open(os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], 'best_params.pkl'), 'wb') as f:
    pickle.dump(best_param_grid, f)

with open(os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], 'seq_length.pkl'), 'wb') as f:
    pickle.dump(X_train.shape[2], f)


# Save latent_healty_svd
with open(os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], 'latent_healty_svd.pkl'), 'wb') as f:
    pickle.dump(latent_healty_svd, f)

# Save latent_train_unhealty_svd
with open(os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], 'latent_train_unhealty_svd.pkl'), 'wb') as f:
    pickle.dump(latent_train_unhealty_svd, f)

# Save latent_valid_unhealty_svd
with open(os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], 'latent_valid_unhealty_svd.pkl'), 'wb') as f:
    pickle.dump(latent_valid_unhealty_svd, f)

# Save labels
with open(os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], 'labels.pkl'), 'wb') as f:
    pickle.dump(labels, f)

# Write the UUID value to the text file
with open(os.path.join(os.environ['EXPERIMENT_LOOP_FOLDER'], os.environ['UUID'] + '.txt'), 'w') as f:
    f.write(os.environ['UUID'])

# Evaluate & Finalize

In [6]:
import os
import pickle
import plotly.graph_objs as go
import plotly.offline as pyo
import plotly.express as px
import numpy as np
import shutil

def evaluate(model_uuid,plots=True, threshold=0.2):
    experiment_repos = os.path.join('/content/EXPERIMENTS',model_uuid)

    # Define the base folder
    base_folder = os.environ['EXPERIMENT_LOOP_FOLDER']

    # Load best_params
    with open(os.path.join(base_folder, 'best_params.pkl'), 'rb') as f:
        best_params = pickle.load(f)

    # Load clusterer
    clusterer = joblib.load(os.path.join(base_folder, 'clusterer.joblib'))
    # Load clusterer
    svd = joblib.load(os.path.join(base_folder, 'svd.joblib'))

    # Load latent_healty_svd
    with open(os.path.join(base_folder, 'latent_healty_svd.pkl'), 'rb') as f:
        latent_healty_svd = pickle.load(f)

    # Load latent_train_unhealty_svd
    with open(os.path.join(base_folder, 'latent_train_unhealty_svd.pkl'), 'rb') as f:
        latent_train_unhealty_svd = pickle.load(f)

    # Load latent_valid_unhealty_svd
    with open(os.path.join(base_folder, 'latent_valid_unhealty_svd.pkl'), 'rb') as f:
        latent_valid_unhealty_svd = pickle.load(f)

    # Load labels
    with open(os.path.join(base_folder, 'labels.pkl'), 'rb') as f:
        labels = pickle.load(f)

    # Load the UUID value from the text file
    uuid_file = os.path.join(base_folder, os.environ['UUID'] + '.txt')
    with open(uuid_file, 'r') as f:
        uuid_value = f.read()

    # Print the loaded UUID value
    print("UUID Value:", uuid_value)

    if plots:
        #EVALUATING LATENT REPRESENTATION
        trace1 = go.Scatter(
            x=latent_healty_svd[:, 0],
            y=latent_healty_svd[:, 1],
            mode='markers',
            name='TRAIN Healthy'
        )

        trace2 = go.Scatter(
            x=latent_train_unhealty_svd[:, 0],
            y=latent_train_unhealty_svd[:, 1],
            mode='markers',
            name='TRAIN Unhealthy'
        )

        trace3 = go.Scatter(
            x=latent_valid_healty_svd[:, 0],
            y=latent_valid_healty_svd[:, 1],
            mode='markers',
            name='VALID Healthy'
        )

        trace4 = go.Scatter(
            x=latent_valid_unhealty_svd[:, 0],
            y=latent_valid_unhealty_svd[:, 1],
            mode='markers',
            name='VALID Unhealthy'
        )

        # Creare il layout
        layout = go.Layout(
            title='SVD',
            xaxis=dict(title='X-axis'),
            yaxis=dict(title='Y-axis')
        )

        # Creare la figura
        fig = go.Figure(data=[trace1, trace2, trace3, trace4], layout=layout)

        # Tracciare la figura
        pyo.iplot(fig)

        ### USING HDBSCAN

        # Creiamo i DataFrame con i risultati
        df1 = pd.DataFrame({
            'PCA1': latent_healty_svd[:, 0],
            'PCA2': latent_healty_svd[:, 1],
            'Labels': clusterer.labels_,
            'OutlierScores': clusterer.outlier_scores_,
            'Dataset': 'Dataset 1'
        })

        df2 = pd.DataFrame({
            'PCA1': latent_train_unhealty_svd[:, 0],
            'PCA2': latent_train_unhealty_svd[:, 1],
            'Labels': hdbscan.approximate_predict(clusterer, latent_train_unhealty_svd)[0],
            'OutlierScores': hdbscan.approximate_predict(clusterer, latent_train_unhealty_svd)[1],
            'Dataset': 'Dataset 2'
        })

        df3 = pd.DataFrame({
            'PCA1': latent_valid_unhealty_svd[:, 0],
            'PCA2': latent_valid_unhealty_svd[:, 1],
            'Labels': hdbscan.approximate_predict(clusterer, latent_valid_unhealty_svd)[0],
            'OutlierScores': hdbscan.approximate_predict(clusterer, latent_valid_unhealty_svd)[1],
            'Dataset': 'Dataset 2'
        })

        # Combiniamo i due DataFrame
        combined_df = pd.concat([df1, df2,df3], ignore_index=True)

        # Visualizziamo il grafico degli outlier scores per entrambi i dataset sullo stesso asse
        fig = px.scatter(
            combined_df, x='PCA1', y='PCA2', color='OutlierScores', color_continuous_scale='Viridis',
            title='Scatter plot of PCA components colored by outlier scores',
            labels={'PCA1': 'PCA Component 1', 'PCA2': 'PCA Component 2', 'OutlierScores': 'Outlier Scores'}
        )
        fig.show()

    # CONFUSION MATRIX
    df1 = pd.DataFrame({
    'PCA1': latent_healty_svd[:, 0],
    'PCA2': latent_healty_svd[:, 1],
    'Labels': clusterer.labels_,
    'OutlierScores': clusterer.outlier_scores_,
    'Dataset': 'Dataset 1'
    })

    df2 = pd.DataFrame({
        'PCA1': latent_train_unhealty_svd[:, 0],
        'PCA2': latent_train_unhealty_svd[:, 1],
        'Labels': hdbscan.approximate_predict(clusterer, latent_train_unhealty_svd)[0],
        'OutlierScores': hdbscan.approximate_predict(clusterer, latent_train_unhealty_svd)[1],
        'Dataset': 'Dataset 2'
    })

    df3 = pd.DataFrame({
        'PCA1': latent_valid_unhealty_svd[:, 0],
        'PCA2': latent_valid_unhealty_svd[:, 1],
        'Labels': hdbscan.approximate_predict(clusterer, latent_valid_unhealty_svd)[0],
        'OutlierScores': hdbscan.approximate_predict(clusterer, latent_valid_unhealty_svd)[1],
        'Dataset': 'Dataset 2'
    })

    # Combine the DataFrames
    combined_df = pd.concat([df1, df2, df3], ignore_index=True)

    from sklearn.metrics import classification_report
    # Add the 'is_healthy' column
    combined_df['is_healthy'] = combined_df['Dataset'].apply(lambda x: 1 if x == 'Dataset 1' else 0)

    # Generate the labels based on the 10% outlier score threshold
    combined_df['chosen_treshold'] = combined_df['OutlierScores'] < threshold

    # Generate the classification report
    y_true = combined_df['is_healthy']
    y_pred = combined_df['chosen_treshold']

    report = classification_report(y_true, y_pred, target_names=['Unhealthy', 'Healthy'])
    print(f'Classification treshold for {threshold} as outlier score treshold')
    print(report)


def finalize_model(model_uuid,threshold):
    """
    Move all contents from src_folder to dest_folder.

    :param src_folder: Source folder path
    :param dest_folder: Destination folder path
    """
    if not os.path.exists('/content/MODEL_REGISTRY'):
        os.mkdir('/content/MODEL_REGISTRY')

    # Ensure the destination folder exists
    if not os.path.exists(os.path.join('/content/MODEL_REGISTRY',model_uuid)):
        os.makedirs(os.path.join('/content/MODEL_REGISTRY',model_uuid))

    # Move each item from the source folder to the destination folder
    for item in os.listdir(os.path.join('/content/EXPERIMENTS',model_uuid)):
        src_item = os.path.join(os.path.join('/content/EXPERIMENTS',model_uuid), item)
        dest_item = os.path.join(os.path.join('/content/MODEL_REGISTRY',model_uuid), item)
        shutil.copy(src_item, dest_item)

        # If the item is a .txt file, also copy it to the base destination folder
        if item.endswith('.txt'):
            shutil.copy(dest_item, '/content/MODEL_REGISTRY')

    # Ensure there is only one .txt file in the base destination folder
    for item in os.listdir('/content/MODEL_REGISTRY'):
        if item.endswith('.txt') and item != model_uuid + '.txt':
            os.remove(os.path.join('/content/MODEL_REGISTRY', item))

    with open(os.path.join('/content/MODEL_REGISTRY',model_uuid,'threshold.pkl'), 'wb') as f:
        pickle.dump(threshold, f)





In [7]:
evaluate('1f57efff0b0c4ac5b3b7f1771c86aef4',threshold=.2)

UUID Value: 1f57efff0b0c4ac5b3b7f1771c86aef4


Classification treshold for 0.2 as outlier score treshold
              precision    recall  f1-score   support

   Unhealthy       0.84      0.96      0.89      2048
     Healthy       0.96      0.83      0.89      2304

    accuracy                           0.89      4352
   macro avg       0.90      0.90      0.89      4352
weighted avg       0.90      0.89      0.89      4352



In [8]:
finalize_model('1f57efff0b0c4ac5b3b7f1771c86aef4',threshold=.2)

# PREDICT

In [9]:
def predict_function(input_json):
    """
    Search for a .txt file in the specified folder and return its name.

    """
    #SEARCH INTO MODEL REGISTRY TO GET WHICH ONE HAS BEEN FINALIZED
    for item in os.listdir('/content/MODEL_REGISTRY'):
        if item.endswith('.txt'):
            actual_uuid = item.split('.')[0]

    model_folder = os.path.join('/content/MODEL_REGISTRY',actual_uuid)

    with open(os.path.join(model_folder,'best_params.pkl'), 'rb') as f:
        best_params = pickle.load(f)

    vrae = VRAE(**best_params,dload=os.path.join('/content/MODEL_REGISTRY',actual_uuid))

    vrae.load(os.path.join('/content/MODEL_REGISTRY',actual_uuid,'vrae.pth'))

    latent_vector = vrae.predict_json(input_json)

    svd = joblib.load(os.path.join(model_folder,'svd.joblib'))

    latent_vector_svd = svd.transform(latent_vector)

    # load the model
    clusterer = joblib.load(os.path.join(model_folder,'clusterer.joblib'))
    # make predictions
    label, strengths = hdbscan.approximate_predict(clusterer, latent_vector_svd)

    with open(os.path.join('/content/MODEL_REGISTRY',actual_uuid,'threshold.pkl'), 'rb') as f:
        threshold = pickle.load(f)

    final_prediction = ['Healthy' if strengths<threshold else 'Unhealthy']

    return final_prediction

In [10]:
input_json = pd.read_csv('/content/INPUT/UCR_TS_Archive_2015/ECG5000/ECG5000_TEST',sep=",",header=None)[:1].to_json()
predict_function(input_json)


dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1


size_average and reduce args will be deprecated, please use reduction='sum' instead.



['Healthy']