## This notebook contains Variational Recurrent Neural Network model for learning feature representation that helps to find changes points on time series data.

In [1]:
import os
import numpy as np
import scipy.io as sio
import math
import torch
import torch.nn as nn
import torch.utils
import torch.utils.data
from torchvision import datasets, transforms
from torch.autograd import Variable

import matplotlib.pyplot as plt 
from model import VRNN

## Preparing Data

In [2]:
window_size = 25 ## length of the sliding window to create samples/sequences
trn_ratio = 0.7
        

In [3]:
dataset = sio.loadmat('./beedance/beedance-6.mat')
        
Y = dataset['Y']                # Y: time series data, time length (T) x number of variables (D) => T x D
L = dataset['L']                # L: label of change-point, time length (T) x 1 (labeled by 0 or 1)
T, D = Y.shape                  # T: time length; D: number of variables
    
n_trn = int(np.ceil(T * trn_ratio))  # splitting point
print('Length of dataset:', T, 'Number of variables:', D, 'Train-Test splitting ratio:', n_trn)
    
train_set_idx = range(window_size, n_trn)
test_set_idx = range(n_trn, T)

print('number_of_training_samples:', len(train_set_idx), 'number_of_testing_samples:', len(test_set_idx)) ## number of train samples, number of test samples

Length of dataset: 608 Number of variables: 3 Train-Test splitting ratio: 426
number_of_training_samples: 401 number_of_testing_samples: 182


In [4]:
def train_test(idx, window_size, Y, L): 
    n = len(idx)
    A = torch.zeros((n, D))       
    B = torch.zeros((n, 1))
    X_p = torch.zeros((n, window_size, D)) ## past samples sequence set created by sliding window
    X_f = torch.zeros((n, window_size, D)) ## future samples sequence set created by sliding window

    for i in range(n):
        l = idx[i] - window_size
        m = idx[i]
        n = idx[i] - window_size + 1

        X_p[i, :, :] = torch.from_numpy(Y[l:m, :])
        X_f[i, :, :] = torch.from_numpy(Y[n:m+1, :])
            
        A[i, :] = torch.from_numpy(Y[m, :])
        
        B[i, :] = torch.from_numpy(L[m])
            

    data, future_data, true_data, labels = Variable(X_p), Variable(X_f), Variable(A), Variable(B)
            
    return data, future_data, true_data, labels

In [5]:
train_dataset, future_train, true_train, labels_train = train_test(train_set_idx, window_size, Y, L)
test_dataset, future_test, true_test, labels_test = train_test(test_set_idx, window_size, Y, L)

print('Shape of Sequenced Samples for Training:', train_dataset.size())
print('Shape of Sequenced Samples for Testing:', test_dataset.size())
print('Shape of Original Train Data:', true_train.size())
print('Shape of Original Test Data:', true_test.size())
print('Shape of Training Labels:', labels_train.size())
print('Shape of Testing Labels:', labels_test.size())

Shape of Sequenced Samples for Training: torch.Size([401, 25, 3])
Shape of Sequenced Samples for Testing: torch.Size([182, 25, 3])
Shape of Original Train Data: torch.Size([401, 3])
Shape of Original Test Data: torch.Size([182, 3])
Shape of Training Labels: torch.Size([401, 1])
Shape of Testing Labels: torch.Size([182, 1])


In [6]:
## For flattening observations in a single window

def flatten(data, idx, window_size, dim):
    return data.reshape((len(idx), window_size*D))

train_flat = flatten(train_dataset, train_set_idx, window_size, D)
test_flat = flatten(test_dataset, test_set_idx, window_size, D)
print(train_flat.size())
print(test_flat.size())


torch.Size([401, 75])
torch.Size([182, 75])


## Training

In [7]:
#hyperparameters
x_dim = 3
h_dim = 100
z_dim = 1
n_layers =  1
n_epochs = 100
clip = 10
learning_rate = 1e-3
batch_size = 64
seed = 128
print_every = 100
save_every = 10

#manual seed
torch.manual_seed(seed)
plt.ion()

In [8]:
## Neural Network
class phi_x(nn.Module):
    def __init__(self, x_dim, h_dim):
        super(phi_x, self).__init__()
        self.model=nn.Sequential(
            nn.Linear(x_dim, h_dim),
            nn.ReLU(),
            nn.Linear(h_dim, h_dim),
            nn.ReLU())
    
    def forward(self, x):
        for layer in self.model:
            x = layer(x)
        return x
    
class phi_z(nn.Module):
    def __init__(self, z_dim, h_dim):
        super(phi_z, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(z_dim, h_dim),
            nn.ReLU())

    def forward(self, x):
        for layer in self.model:
            x = layer(x)
        return x
    
class encoder(nn.Module):
    def __init__(self, h_dim, z_dim):
        super(encoder, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(h_dim + h_dim, h_dim),
            nn.ReLU(),
            nn.Linear(h_dim, z_dim),
            nn.ReLU())
        
    def forward(self, x):
        for layer in self.model:
            x = layer(x)
        return x

class decoder(nn.Module):
    def __init__(self, h_dim, x_dim):
        super(decoder, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(h_dim + h_dim, h_dim),
            nn.ReLU(),
            nn.Linear(h_dim, x_dim),
            nn.ReLU())
    
    def forward(self, x):
        for layer in self.model:
            x = layer(x)
        return x

In [9]:
class Model(nn.Module):
    def __init__(self, x_dim, h_dim, z_dim, n_layers, bias = False):
        super(Model, self).__init__()
        self.Phi_x = phi_x(x_dim, h_dim)
        self.Phi_z = phi_z(z_dim, h_dim)
        
        self.Encoder = encoder(h_dim, z_dim)
        self.Enc_mean = nn.Linear(z_dim, z_dim)
        self.Enc_std = nn.Sequential(
            nn.Linear(z_dim, z_dim),
            nn.Softplus())
        
        self.prior = nn.Sequential(
            nn.Linear(h_dim, z_dim),
            nn.ReLU())
        self.prior_mean = nn.Linear(z_dim, z_dim)
        self.prior_std = nn.Sequential(
            nn.Linear(z_dim, z_dim),
            nn.Softplus())

        self.Decoder = decoder(h_dim, x_dim)
        self.dec_std = nn.Sequential(
            nn.Linear(x_dim, x_dim),
            nn.Softplus())
        #self.dec_mean = nn.Linear(h_dim, x_dim)
        self.dec_mean = nn.Sequential(
            nn.Linear(x_dim, x_dim),
            nn.Sigmoid())
        
        self.rnn = nn.GRU(h_dim + h_dim, h_dim, n_layers, bias)
        
    def forward(self, x):

        all_enc_mean, all_enc_std, all_enc_t = [], [], []
        all_dec_mean, all_dec_std, all_dec_t = [], [], []
        kld_loss = 0
        nll_loss = 0

        h = Variable(torch.zeros(n_layers, x.size(1), h_dim))

        for t in range(x.size(0)):

            phi_x_t = self.Phi_x(x[t])

            #encoder
            enc_t = self.Encoder(torch.cat([phi_x_t, h[-1]], 1))
            enc_mean_t = self.Enc_mean(enc_t)
            enc_std_t = self.Enc_std(enc_t)

            #prior
            prior_t = self.prior(h[-1])
            prior_mean_t = self.prior_mean(prior_t)
            prior_std_t = self.prior_std(prior_t)

            #sampling and reparameterization
            z_t = self._reparameterized_sample(enc_mean_t, enc_std_t)
            phi_z_t = self.Phi_z(z_t)

            #decoder
            dec_t = self.Decoder(torch.cat([phi_z_t, h[-1]], 1))
            dec_mean_t = self.dec_mean(dec_t)
            dec_std_t = self.dec_std(dec_t)

            #recurrence
            _, h = self.rnn(torch.cat([phi_x_t, phi_z_t], 1).unsqueeze(0), h)

            #computing losses
            kld_loss += self._kld_gauss(enc_mean_t, enc_std_t, prior_mean_t, prior_std_t)
            #nll_loss += self._nll_gauss(dec_mean_t, dec_std_t, x[t])
            nll_loss += self._L2_norm(dec_mean_t, x[t])
            
            
            all_enc_t.append(enc_t)
            all_enc_std.append(enc_std_t)
            all_enc_mean.append(enc_mean_t)
            all_dec_t.append(dec_t)
            all_dec_mean.append(dec_mean_t)
            all_dec_std.append(dec_std_t)
        
        #print(all_enc_t)
        encoded = self._helper_dim(all_enc_t)
        #print("Encoded:", encoded)
        mmd_loss = self._mmd_loss(encoded)
        #print("MMD:", mmd_loss)
        #print("Enc:", encoded.size())
        
        return kld_loss, nll_loss, mmd_loss, \
            (all_enc_t, all_enc_mean, all_enc_std), \
            (all_dec_t, all_dec_mean, all_dec_std)
    
    ## To generate encoded or reconstructed sequence within training
    def sample(self, window_size):
        
        sample = torch.zeros(x.size(0), self.x_dim)
        
        h = Variable(torch.zeros(self.n_layers, x.size(1), self.h_dim))
        
        for t in range(x.size(0)):
            
            phi_x_t = self.Phi_x(x[t]) 
            
            #encoder
            enc_t = self.Encoder(torch.cat([phi_x_t, h[-1]], 1))
            enc_mean_t = self.Enc_mean(enc_t)
            enc_std_t = self.enc_std(enc_t)

            phi_z_t = self.Phi_z(enc_mean_t)
            
            #recurrence
            _, h = self.rnn(torch.cat([phi_x_t, phi_z_t], 1).unsqueeze(0), h)
            
            sample[t] = enc_mean_t.data
            
        return sample
            
    def _reparameterized_sample(self, mean, std):
        """using std to sample"""
        eps = torch.FloatTensor(std.size()).normal_()
        eps = Variable(eps)
        return eps.mul(std).add_(mean)


    def _kld_gauss(self, mean_1, std_1, mean_2, std_2):
        """Using std to compute KLD"""

        kld_element =  (2 * torch.log(std_2) - 2 * torch.log(std_1) + 
            (std_1.pow(2) + (mean_1 - mean_2).pow(2)) /
            std_2.pow(2) - 1)
        return 0.5 * torch.sum(kld_element)
    
    def _L2_norm(self, mean, x):    ##using L2 norm distance between decoder mean and x
        return torch.sum(torch.dist(x, mean, 2))
    
    ## Create past and future samples encoding 
    def _past_future(self, x):
        
        x_size = x.size(0)
        #print(x_size)
        dim = x.size(1)
        #print(dim)
        
        X_p = torch.zeros(x.size(0), x.size(1), x.size(2)) ## past samples sequence set created by sliding window
        X_f = torch.zeros(x.size(0), x.size(1), x.size(2)) ## future samples sequence set created by sliding window
        
        for i in range((x_size-1)):
            X_p[i, :, :] = x[i, :, :]
        
        for i in range(1, x_size):
            X_f[i, :, :] = x[i, :, :]
        
        #print("Past:", X_p)
        #print("Future:", X_f)
        
        return X_p, X_f
    
    ##helper to compute kernel for mmd
    def compute_kernel(self, x, y):
        dim = x.size(1)
                                          
        kernel_input = (x - y).pow(2).mean(2)/float(dim)
        return torch.exp(-kernel_input)
    
    ## computes mmd loss between current and future sample
    def _mmd_loss(self, x):
        
        X_p, X_f = self._past_future(x)
        p_kernel = self.compute_kernel(X_p,X_p)
        f_kernel = self.compute_kernel(X_f,X_f)
        pf_kernel = self.compute_kernel(X_p,X_f)
        mmd = p_kernel.mean() + f_kernel.mean() - 2*pf_kernel.mean()
        
        return torch.sum(mmd)
    
     ## computes kmeans loss between encoded samples and cluster centroids
    def _kmeans_loss(self, x):
        kmeans = KMeans(n_cluster = 8, random_state = 0).fit(x)
        loss = kmeans.inertia_
        return torch.sum(loss) 
    
    
    def _helper_dim(self, x):
        x = [t.detach().numpy() for t in x]
        x = torch.FloatTensor(x)
        x = Variable(x.transpose(1, 0))
        #print("Enc:", x.size())
        
        return x
        
    

In [10]:
def train(epoch):
    train_loss, total_kld_loss, total_nll_loss, total_mmd_loss = 0, 0, 0, 0
    for batch_idx, data in enumerate(train_loader):
        
        #print(data.size())
        data = Variable(data.squeeze().transpose(0, 1))
        #print(data.size())
        #data = (data - data.min().data) / (data.max().data - data.min().data)

        #forward + backward + optimize
        optimizer.zero_grad()
        kld_loss, nll_loss, mmd_loss, _, _ = model(data)
        loss = kld_loss + nll_loss - mmd_loss
        loss.backward()
        optimizer.step()

        #grad norm clipping, only in pytorch version >= 1.10
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        train_loss += loss.data
        total_kld_loss += kld_loss.data
        total_nll_loss += nll_loss.data
        total_mmd_loss +=mmd_loss.data
    
        #printing
        
        print('Train Epoch: {} Batch: {} KLD Loss: {:.6f} \t NLL Loss: {:.6f} \t MMD Loss: {:.6f}'.format(
        epoch, batch_idx, kld_loss.data / batch_size, nll_loss.data / batch_size, mmd_loss.data / batch_size))

        #sample = model.sample(28)
        #plt.imshow(sample.numpy())
        #plt.pause(1e-6)
    
    print('====> Epoch: {} Train set loss: KLD Loss: {:.6f} \t NLL Loss: {:.6f} \t MMD Loss: {:.6f} \t Average loss: {:.4f}'.format(
        epoch, total_kld_loss/len(train_loader.dataset), total_nll_loss/len(train_loader.dataset), total_mmd_loss/len(train_loader.dataset), train_loss / len(train_loader.dataset)))

In [11]:
def test(epoch):

    mean_kld_loss, mean_nll_loss, mean_mmd_loss = 0, 0, 0
    for batch_idx, data in enumerate(test_loader):                                            

        #data = Variable(data)
        data = Variable(data.squeeze().transpose(0, 1))
        #data = (data - data.min().data) / (data.max().data - data.min().data)

        kld_loss, nll_loss, mmd_loss, _, _ = model(data)
        
        mean_kld_loss += kld_loss.data
        mean_nll_loss += nll_loss.data
        mean_mmd_loss += mmd_loss.data

    mean_kld_loss /= len(test_loader.dataset)
    mean_nll_loss /= len(test_loader.dataset)
    mean_mmd_loss /= len(test_loader.dataset)

    print('====> Test set loss: KLD Loss = {:.4f} \t NLL Loss = {:.4f} \t MMD Loss = {:.4f}'.format(
    mean_kld_loss, mean_nll_loss, mean_mmd_loss))


In [12]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, drop_last = True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last = True)

In [13]:
model = Model(x_dim, h_dim, z_dim, n_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
for epoch in range(1, n_epochs + 1):

    #training + testing
    train(epoch)
    test(epoch)

    #saving model
    if epoch % save_every == 1:
        fn = 'saves/vrnn_state_dict_'+ str(epoch)+'.pth'
        checkpoint = { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer' : optimizer.state_dict()}
        torch.save(checkpoint, fn)
        print('Saved model to '+ fn)

Train Epoch: 1 Batch: 0 KLD Loss: 56.953384 	 NLL Loss: 1.530206 	 MMD Loss: 0.000000
Train Epoch: 1 Batch: 1 KLD Loss: 41.092838 	 NLL Loss: 1.397996 	 MMD Loss: 0.000000
Train Epoch: 1 Batch: 2 KLD Loss: 40.800335 	 NLL Loss: 1.284521 	 MMD Loss: 0.000001
Train Epoch: 1 Batch: 3 KLD Loss: 40.542126 	 NLL Loss: 1.192774 	 MMD Loss: 0.000001
Train Epoch: 1 Batch: 4 KLD Loss: 40.230148 	 NLL Loss: 1.090812 	 MMD Loss: 0.000001
Train Epoch: 1 Batch: 5 KLD Loss: 39.937630 	 NLL Loss: 1.102498 	 MMD Loss: 0.000000
====> Epoch: 1 Train set loss: KLD Loss: 41.425468 	 NLL Loss: 1.212777 	 MMD Loss: 0.000000 	 Average loss: 42.6382
====> Test set loss: KLD Loss = 27.9124 	 NLL Loss = 0.8199 	 MMD Loss = 0.0000
Saved model to saves/vrnn_state_dict_1.pth
Train Epoch: 2 Batch: 0 KLD Loss: 39.664265 	 NLL Loss: 1.443227 	 MMD Loss: 0.000000
Train Epoch: 2 Batch: 1 KLD Loss: 39.448792 	 NLL Loss: 1.301998 	 MMD Loss: 0.000000
Train Epoch: 2 Batch: 2 KLD Loss: 39.158455 	 NLL Loss: 1.221601 	 MMD L

Train Epoch: 12 Batch: 3 KLD Loss: 24.281490 	 NLL Loss: 1.061072 	 MMD Loss: 0.000005
Train Epoch: 12 Batch: 4 KLD Loss: 24.060392 	 NLL Loss: 1.000438 	 MMD Loss: 0.000005
Train Epoch: 12 Batch: 5 KLD Loss: 23.843250 	 NLL Loss: 1.038216 	 MMD Loss: 0.000006
====> Epoch: 12 Train set loss: KLD Loss: 23.354502 	 NLL Loss: 1.118296 	 MMD Loss: 0.000005 	 Average loss: 24.4728
====> Test set loss: KLD Loss = 16.6185 	 NLL Loss = 0.7949 	 MMD Loss = 0.0000
Train Epoch: 13 Batch: 0 KLD Loss: 23.636600 	 NLL Loss: 1.426310 	 MMD Loss: 0.000007
Train Epoch: 13 Batch: 1 KLD Loss: 23.415993 	 NLL Loss: 1.275309 	 MMD Loss: 0.000007
Train Epoch: 13 Batch: 2 KLD Loss: 23.200533 	 NLL Loss: 1.185202 	 MMD Loss: 0.000006
Train Epoch: 13 Batch: 3 KLD Loss: 22.991215 	 NLL Loss: 1.059527 	 MMD Loss: 0.000006
Train Epoch: 13 Batch: 4 KLD Loss: 22.780973 	 NLL Loss: 1.003762 	 MMD Loss: 0.000006
Train Epoch: 13 Batch: 5 KLD Loss: 22.568144 	 NLL Loss: 1.040017 	 MMD Loss: 0.000007
====> Epoch: 13 Tra

Train Epoch: 24 Batch: 0 KLD Loss: 11.719301 	 NLL Loss: 1.325171 	 MMD Loss: 0.000021
Train Epoch: 24 Batch: 1 KLD Loss: 11.534922 	 NLL Loss: 1.149005 	 MMD Loss: 0.000022
Train Epoch: 24 Batch: 2 KLD Loss: 11.332535 	 NLL Loss: 1.098651 	 MMD Loss: 0.000020
Train Epoch: 24 Batch: 3 KLD Loss: 11.191261 	 NLL Loss: 1.011583 	 MMD Loss: 0.000018
Train Epoch: 24 Batch: 4 KLD Loss: 11.074192 	 NLL Loss: 0.985517 	 MMD Loss: 0.000017
Train Epoch: 24 Batch: 5 KLD Loss: 10.925132 	 NLL Loss: 1.007060 	 MMD Loss: 0.000017
====> Epoch: 24 Train set loss: KLD Loss: 10.817331 	 NLL Loss: 1.049694 	 MMD Loss: 0.000018 	 Average loss: 11.8670
====> Test set loss: KLD Loss = 7.5661 	 NLL Loss = 0.7583 	 MMD Loss = 0.0000
Train Epoch: 25 Batch: 0 KLD Loss: 10.769022 	 NLL Loss: 1.297104 	 MMD Loss: 0.000018
Train Epoch: 25 Batch: 1 KLD Loss: 10.614747 	 NLL Loss: 1.120022 	 MMD Loss: 0.000019
Train Epoch: 25 Batch: 2 KLD Loss: 10.496508 	 NLL Loss: 1.080453 	 MMD Loss: 0.000022
Train Epoch: 25 Batc

Train Epoch: 35 Batch: 4 KLD Loss: 4.000736 	 NLL Loss: 0.846259 	 MMD Loss: 0.000039
Train Epoch: 35 Batch: 5 KLD Loss: 3.915587 	 NLL Loss: 0.900674 	 MMD Loss: 0.000037
====> Epoch: 35 Train set loss: KLD Loss: 3.924433 	 NLL Loss: 0.840987 	 MMD Loss: 0.000034 	 Average loss: 4.7654
====> Test set loss: KLD Loss = 2.6959 	 NLL Loss = 0.6676 	 MMD Loss = 0.0000
Train Epoch: 36 Batch: 0 KLD Loss: 3.833172 	 NLL Loss: 0.963902 	 MMD Loss: 0.000035
Train Epoch: 36 Batch: 1 KLD Loss: 3.772415 	 NLL Loss: 0.753271 	 MMD Loss: 0.000033
Train Epoch: 36 Batch: 2 KLD Loss: 3.694412 	 NLL Loss: 0.897574 	 MMD Loss: 0.000034
Train Epoch: 36 Batch: 3 KLD Loss: 3.619857 	 NLL Loss: 0.888873 	 MMD Loss: 0.000036
Train Epoch: 36 Batch: 4 KLD Loss: 3.551162 	 NLL Loss: 0.845275 	 MMD Loss: 0.000036
Train Epoch: 36 Batch: 5 KLD Loss: 3.484432 	 NLL Loss: 0.899034 	 MMD Loss: 0.000037
====> Epoch: 36 Train set loss: KLD Loss: 3.504112 	 NLL Loss: 0.837575 	 MMD Loss: 0.000034 	 Average loss: 4.3417
=

Train Epoch: 47 Batch: 0 KLD Loss: 0.888908 	 NLL Loss: 0.941252 	 MMD Loss: 0.000049
Train Epoch: 47 Batch: 1 KLD Loss: 0.866101 	 NLL Loss: 0.727667 	 MMD Loss: 0.000049
Train Epoch: 47 Batch: 2 KLD Loss: 0.843526 	 NLL Loss: 0.868863 	 MMD Loss: 0.000049
Train Epoch: 47 Batch: 3 KLD Loss: 0.821734 	 NLL Loss: 0.861617 	 MMD Loss: 0.000050
Train Epoch: 47 Batch: 4 KLD Loss: 0.800369 	 NLL Loss: 0.829856 	 MMD Loss: 0.000050
Train Epoch: 47 Batch: 5 KLD Loss: 0.779397 	 NLL Loss: 0.884625 	 MMD Loss: 0.000050
====> Epoch: 47 Train set loss: KLD Loss: 0.798011 	 NLL Loss: 0.816180 	 MMD Loss: 0.000047 	 Average loss: 1.6141
====> Test set loss: KLD Loss = 0.5342 	 NLL Loss = 0.6572 	 MMD Loss = 0.0000
Train Epoch: 48 Batch: 0 KLD Loss: 0.759011 	 NLL Loss: 0.940294 	 MMD Loss: 0.000050
Train Epoch: 48 Batch: 1 KLD Loss: 0.738802 	 NLL Loss: 0.726675 	 MMD Loss: 0.000050
Train Epoch: 48 Batch: 2 KLD Loss: 0.719215 	 NLL Loss: 0.867579 	 MMD Loss: 0.000050
Train Epoch: 48 Batch: 3 KLD Lo

Train Epoch: 58 Batch: 4 KLD Loss: 0.106506 	 NLL Loss: 0.821373 	 MMD Loss: 0.000057
Train Epoch: 58 Batch: 5 KLD Loss: 0.103187 	 NLL Loss: 0.872757 	 MMD Loss: 0.000057
====> Epoch: 58 Train set loss: KLD Loss: 0.108214 	 NLL Loss: 0.806487 	 MMD Loss: 0.000054 	 Average loss: 0.9146
====> Test set loss: KLD Loss = 0.0705 	 NLL Loss = 0.6499 	 MMD Loss = 0.0000
Train Epoch: 59 Batch: 0 KLD Loss: 0.104142 	 NLL Loss: 0.939722 	 MMD Loss: 0.000059
Train Epoch: 59 Batch: 1 KLD Loss: 0.098063 	 NLL Loss: 0.718555 	 MMD Loss: 0.000059
Train Epoch: 59 Batch: 2 KLD Loss: 0.093085 	 NLL Loss: 0.855033 	 MMD Loss: 0.000057
Train Epoch: 59 Batch: 3 KLD Loss: 0.093495 	 NLL Loss: 0.843224 	 MMD Loss: 0.000056
Train Epoch: 59 Batch: 4 KLD Loss: 0.091079 	 NLL Loss: 0.820722 	 MMD Loss: 0.000055
Train Epoch: 59 Batch: 5 KLD Loss: 0.084528 	 NLL Loss: 0.871667 	 MMD Loss: 0.000056
====> Epoch: 59 Train set loss: KLD Loss: 0.090077 	 NLL Loss: 0.805813 	 MMD Loss: 0.000055 	 Average loss: 0.8958
=

Train Epoch: 70 Batch: 0 KLD Loss: 0.022416 	 NLL Loss: 0.945748 	 MMD Loss: 0.000063
Train Epoch: 70 Batch: 1 KLD Loss: 0.019717 	 NLL Loss: 0.713291 	 MMD Loss: 0.000064
Train Epoch: 70 Batch: 2 KLD Loss: 0.009653 	 NLL Loss: 0.846787 	 MMD Loss: 0.000061
Train Epoch: 70 Batch: 3 KLD Loss: 0.009097 	 NLL Loss: 0.829514 	 MMD Loss: 0.000059
Train Epoch: 70 Batch: 4 KLD Loss: 0.012992 	 NLL Loss: 0.814832 	 MMD Loss: 0.000057
Train Epoch: 70 Batch: 5 KLD Loss: 0.011830 	 NLL Loss: 0.860675 	 MMD Loss: 0.000058
====> Epoch: 70 Train set loss: KLD Loss: 0.013679 	 NLL Loss: 0.799736 	 MMD Loss: 0.000058 	 Average loss: 0.8134
====> Test set loss: KLD Loss = 0.0051 	 NLL Loss = 0.6426 	 MMD Loss = 0.0000
Train Epoch: 71 Batch: 0 KLD Loss: 0.008385 	 NLL Loss: 0.944848 	 MMD Loss: 0.000059
Train Epoch: 71 Batch: 1 KLD Loss: 0.006022 	 NLL Loss: 0.712709 	 MMD Loss: 0.000060
Train Epoch: 71 Batch: 2 KLD Loss: 0.008511 	 NLL Loss: 0.846378 	 MMD Loss: 0.000061
Train Epoch: 71 Batch: 3 KLD Lo

Train Epoch: 81 Batch: 5 KLD Loss: 0.002607 	 NLL Loss: 0.850212 	 MMD Loss: 0.000059
====> Epoch: 81 Train set loss: KLD Loss: 0.003259 	 NLL Loss: 0.794560 	 MMD Loss: 0.000058 	 Average loss: 0.7978
====> Test set loss: KLD Loss = 0.0008 	 NLL Loss = 0.6361 	 MMD Loss = 0.0000
Saved model to saves/vrnn_state_dict_81.pth
Train Epoch: 82 Batch: 0 KLD Loss: 0.002323 	 NLL Loss: 0.951113 	 MMD Loss: 0.000060
Train Epoch: 82 Batch: 1 KLD Loss: 0.000548 	 NLL Loss: 0.708559 	 MMD Loss: 0.000060
Train Epoch: 82 Batch: 2 KLD Loss: 0.002197 	 NLL Loss: 0.840438 	 MMD Loss: 0.000062
Train Epoch: 82 Batch: 3 KLD Loss: 0.005367 	 NLL Loss: 0.816325 	 MMD Loss: 0.000063
Train Epoch: 82 Batch: 4 KLD Loss: 0.004191 	 NLL Loss: 0.809638 	 MMD Loss: 0.000063
Train Epoch: 82 Batch: 5 KLD Loss: 0.000702 	 NLL Loss: 0.849240 	 MMD Loss: 0.000061
====> Epoch: 82 Train set loss: KLD Loss: 0.002446 	 NLL Loss: 0.794065 	 MMD Loss: 0.000059 	 Average loss: 0.7965
====> Test set loss: KLD Loss = 0.0011 	 NL

Train Epoch: 93 Batch: 2 KLD Loss: 0.013575 	 NLL Loss: 0.835929 	 MMD Loss: 0.000064
Train Epoch: 93 Batch: 3 KLD Loss: 0.019387 	 NLL Loss: 0.805880 	 MMD Loss: 0.000065
Train Epoch: 93 Batch: 4 KLD Loss: 0.008313 	 NLL Loss: 0.805622 	 MMD Loss: 0.000064
Train Epoch: 93 Batch: 5 KLD Loss: 0.001281 	 NLL Loss: 0.839776 	 MMD Loss: 0.000061
====> Epoch: 93 Train set loss: KLD Loss: 0.007398 	 NLL Loss: 0.789860 	 MMD Loss: 0.000060 	 Average loss: 0.7972
====> Test set loss: KLD Loss = 0.0047 	 NLL Loss = 0.6293 	 MMD Loss = 0.0000
Train Epoch: 94 Batch: 0 KLD Loss: 0.048024 	 NLL Loss: 0.956904 	 MMD Loss: 0.000056
Train Epoch: 94 Batch: 1 KLD Loss: 0.034891 	 NLL Loss: 0.705314 	 MMD Loss: 0.000055
Train Epoch: 94 Batch: 2 KLD Loss: 0.003105 	 NLL Loss: 0.835421 	 MMD Loss: 0.000059
Train Epoch: 94 Batch: 3 KLD Loss: 0.009287 	 NLL Loss: 0.805013 	 MMD Loss: 0.000064
Train Epoch: 94 Batch: 4 KLD Loss: 0.023708 	 NLL Loss: 0.805266 	 MMD Loss: 0.000066
Train Epoch: 94 Batch: 5 KLD Lo