In [1]:
from __future__ import print_function, division

import numpy as np
import pandas as pd
import sys

from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.utils.data
import torchvision

import spatial_vae.models as models
#### Data loading and coordinate grid ####
# Read the test images from disk, convert them to a float tensor and divide
# the stored values by 255.
mnist_test = torch.from_numpy(
    np.load('D:/project/CMU/toytask/mnist_rotated/images_test.npy')
).float() / 255

# The grid has n rows and m columns, i.e. n*m points in total.
n = m = 28

# m evenly spaced x values running from -1 to 1 ...
xgrid = np.linspace(-1, 1, m)
# ... and n evenly spaced y values running from 1 down to -1.
ygrid = np.linspace(1, -1, n)

# x0 holds the x value and x1 the y value of every grid point; both are (n, m).
x0, x1 = np.meshgrid(xgrid, ygrid)

# Flatten both grids and pair them column-wise: row k is the (x, y) position
# of the k-th grid point, giving an (n*m, 2) float tensor.
x_coord = torch.from_numpy(
    np.stack([x0.ravel(), x1.ravel()], axis=1)
).float()
print(x_coord.shape)

torch.Size([784, 2])


Two lists of evenly spaced numbers are created, and `meshgrid` pairs every number in one list with every number in the other.
This creates a coordinate frame of 784 points, one for each of the 784 pixel values. Each point has 2 coordinate values.

In [2]:
# Flatten every image into a single row of n*m pixel values (one row per image).
y_test = mnist_test.view(-1, n * m)
print(y_test.shape)

torch.Size([10000, 784])


In [3]:
# Detect a GPU; when one is present, move the flattened test images and the
# coordinate grid onto it together.
use_cuda = torch.cuda.is_available()
if use_cuda:
    y_test, x_coord = y_test.cuda(), x_coord.cuda()

`x_coord` is our Cartesian frame: 2 coordinates for each of the 784 points of an image.
`y_test` is our image data: 784 pixel values per image.

In [4]:
# Wrap the flattened test images in a TensorDataset
# (the alternative would be a customized Dataset class).
data_test = torch.utils.data.TensorDataset(y_test)

# `sys` and `nn` come from the import cell at the top of the notebook; they
# are intentionally not re-imported here.

# Number of latent dimensions.
# NOTE(review): presumably this counts only the unstructured latent
# dimensions (rotation/translation dimensions are extra) - confirm against
# the training script.
z_dim = 2
print('# training with z-dim:', z_dim, file=sys.stderr)

# Architecture settings. None of the cells shown here read these values (the
# networks are loaded from checkpoints); presumably they mirror the training
# configuration - TODO confirm.
num_layers = 2
hidden_dim = 500
activation = nn.Tanh
print('# using the spatial generator architecture', file=sys.stderr)


# training with z-dim: 2
# using the spatial generator architecture


Inside the model architecture, z_dim is increased to 5 if rotation and translation inference are to be done.
Translation has 2 latent dimensions associated with it, possibly to account for it in the x0 and x1 directions.
Rotation has only 1, because the rotation can happen only in the x0-x1 plane and is therefore described by a single angle.

In [5]:
ckpt_path1 = 'D:\\project\\CMU\\toytask\\spatial-VAE-master\\saved_models_generator_epoch2.sav'
ckpt_path2 = 'D:\\project\\CMU\\toytask\\spatial-VAE-master\\saved_models_inference_epoch2.sav'

# Load the models: p_net is the spatial VAE (generator), q_net is the
# inference network.
#
# The checkpoints were written with torch.save on the whole model object, so
# loading them unpickles the model classes as well as the weights. The more
# flexible practice is to save and load only the state dicts.
#
# SECURITY: torch.load unpickles arbitrary objects and can therefore execute
# code contained in the file - only load checkpoints from a trusted source.
#
# map_location='cpu' makes loading work on machines without a GPU even when
# the checkpoint tensors were saved from a CUDA device; the models are moved
# to the GPU right below when one is available.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects fully pickled models - pass
# weights_only=False there if loading fails (confirm the installed version).
p_net = torch.load(ckpt_path1, map_location='cpu')
q_net = torch.load(ckpt_path2, map_location='cpu')
if use_cuda:
    p_net.cuda()
    q_net.cuda()

# Prior assumptions about the amount of translation and rotation.
dx_scale = 0.1
theta_prior = np.pi/4
print('# using priors: theta={}, dx={}'.format(theta_prior, dx_scale), file=sys.stderr)


# using priors: theta=0.7853981633974483, dx=0.1


Priors are our assumptions about how much rotation or translation is expected.

In [6]:
# Number of test images.
N = len(mnist_test)

# Batch size and learning rate.
minibatch_size = 100
lr = 1e-4

# One Adam optimizer over the parameters of both networks (p_net first,
# then q_net).
params = list(p_net.parameters())
params = params + list(q_net.parameters())
optim = torch.optim.Adam(params, lr=lr)

# Serve the test set batch by batch.
test_iterator = torch.utils.data.DataLoader(data_test, batch_size=minibatch_size)


To be read in this order: 1. eval_model, 2. eval_minibatch. In eval_minibatch, we first use the inference network to get the latent distribution of the image from its higher-level representations. We then rotate and translate our fixed Cartesian coordinate system, using samples from the inferred latent distribution as the other operand of each operation. This makes the transformed Cartesian coordinates depend on the latent distribution, which, in my opinion, means that changes in one are explicitly reflected in the other. Using the transformed coordinates, we reconstruct the rotated and translated image with the spatial VAE model. The spatial VAE model has learned a function that directly takes coordinate values and latent values and forms the desired image.


In [7]:
def eval_minibatch(x, y, p_net, q_net, rotate=True, translate=True, dx_scale=0.1, theta_prior=np.pi, use_cuda=False):
    """Compute the ELBO and its two terms for one minibatch.

    The inference network produces a Gaussian posterior over the latent
    vector, one sample is drawn from it, the coordinate grid is rotated
    and/or translated according to that sample, and the generator
    reconstructs the images from the transformed coordinates and the
    remaining latent values.

    Latent layout (columns of the q_net outputs):
      * ``rotate=True``: column 0 is the rotation angle (1 dimension).
      * ``translate=True``: the next 2 columns are the translation.
      * every remaining column is passed to ``p_net`` as unstructured latents.

    Args:
        x: 2-D tensor of coordinates, one row per pixel. It is shared across
            the batch; rotation requires exactly 2 columns.
        y: 2-D tensor of flattened images, one row per image. Used as the
            binary-cross-entropy target, so values are presumably expected
            in [0, 1].
        p_net: callable ``(coords, z) -> logits``; its output is reshaped to
            ``(batch, -1)`` and must match ``y`` elementwise.
        q_net: callable ``y -> (z_mu, z_logstd)``, both ``(batch, z_dim)``.
        rotate: infer a rotation from latent column 0 and apply it to ``x``.
        translate: infer a 2-D translation and add it to ``x``.
        dx_scale: factor applied to the sampled translation before it is
            added to the coordinates.
        theta_prior: standard deviation of the zero-mean Gaussian prior on
            the rotation angle.
        use_cuda: move ``x`` and ``y`` to the GPU before computing.

    Returns:
        Tuple ``(elbo, log_p_x_g_z, kl_div)`` of scalar tensors, where
        ``elbo = log_p_x_g_z - kl_div``.
    """
    batch_size = y.size(0)
    if use_cuda:
        # Move both inputs so every later operation sees a single device
        # (a no-op for tensors that already live on the GPU).
        y = y.cuda()
        x = x.cuda()

    # Share the one coordinate grid across the batch: (P, 2) -> (b, P, 2).
    # expand() returns a view; the grid itself is not copied here.
    x = x.expand(batch_size, x.size(0), x.size(1))

    # Inference: mean and log standard deviation of the latent posterior.
    z_mu, z_logstd = q_net(y)
    z_std = torch.exp(z_logstd)

    # Draw one sample from the variational posterior (reparameterization:
    # standard normal noise scaled and shifted by the posterior parameters)
    # to estimate E[p(x|z)].
    noise = torch.randn_like(z_mu)
    z = z_std*noise + z_mu

    kl_div = 0
    if rotate:
        # Column 0 of the latent vector is the rotation angle.
        theta_mu = z_mu[:, 0]
        theta_std = z_std[:, 0]
        theta_logstd = z_logstd[:, 0]
        theta = z[:, 0]
        # Strip the angle so the remaining columns line up for what follows.
        z = z[:, 1:]
        z_mu = z_mu[:, 1:]
        z_std = z_std[:, 1:]
        z_logstd = z_logstd[:, 1:]

        # Per-sample 2x2 matrix [[cos, sin], [-sin, cos]], shape (b, 2, 2).
        cos_t = torch.cos(theta)
        sin_t = torch.sin(theta)
        rot = torch.stack([torch.stack([cos_t, sin_t], 1),
                           torch.stack([-sin_t, cos_t], 1)], 1)
        x = torch.bmm(x, rot)  # rotate coordinates by theta

        # KL divergence between the angle posterior N(theta_mu, theta_std^2)
        # and the prior N(0, sigma^2).
        sigma = theta_prior
        kl_div = -theta_logstd + np.log(sigma) + (theta_std**2 + theta_mu**2)/2/sigma**2 - 0.5

    if translate:
        # The next two latent columns are the translation; the sample is
        # scaled by dx_scale and broadcast over all pixels of each image.
        dx = (z[:, :2]*dx_scale).unsqueeze(1)
        z = z[:, 2:]
        x = x + dx  # translate coordinates
        # z_mu / z_std / z_logstd deliberately keep the translation columns:
        # their KL term is computed below together with the unstructured
        # latents, against a standard normal prior.

    # Reconstruct the images from the transformed coordinates and the
    # remaining (unstructured) latent values.
    y_hat = p_net(x.contiguous(), z)
    y_hat = y_hat.view(batch_size, -1)

    # binary_cross_entropy_with_logits averages over every element, so
    # multiplying by the number of pixels gives the batch-mean of the
    # per-image summed log-likelihood.
    size = y.size(1)
    log_p_x_g_z = -F.binary_cross_entropy_with_logits(y_hat, y)*size

    # KL divergence of the remaining latents against N(0, 1), summed per
    # image, added to the rotation term and averaged over the batch.
    z_kl = -z_logstd + 0.5*z_std**2 + 0.5*z_mu**2 - 0.5
    kl_div = kl_div + torch.sum(z_kl, 1)
    kl_div = kl_div.mean()

    elbo = log_p_x_g_z - kl_div

    return elbo, log_p_x_g_z, kl_div

In [8]:
def eval_model(iterator, x_coord, p_net, q_net, rotate=True, translate=True
              , dx_scale=0.1, theta_prior=np.pi, use_cuda=False):
    """Evaluate both networks over a whole dataset.

    Puts ``p_net`` and ``q_net`` into eval mode, runs ``eval_minibatch`` on
    every batch with gradient tracking disabled, and keeps running averages
    of the three returned quantities, weighted by batch size.

    Args:
        iterator: iterable yielding 1-tuples ``(y,)`` where ``y`` is a batch
            of flattened images.
        x_coord: coordinate grid passed unchanged to ``eval_minibatch`` for
            every batch.
        p_net: generator network.
        q_net: inference network.
        rotate, translate, dx_scale, theta_prior, use_cuda: forwarded to
            ``eval_minibatch``.

    Returns:
        Tuple ``(elbo, gen_loss, kl_loss)`` of Python floats: the
        sample-weighted means of the ELBO, the negated reconstruction
        log-likelihood and the KL term. All three are 0 when ``iterator``
        yields nothing.
    """
    p_net.eval()
    q_net.eval()

    c = 0  # number of images seen so far
    gen_loss_accum = 0
    kl_loss_accum = 0
    elbo_accum = 0

    # Evaluation never backpropagates, so skip building the autograd graph.
    with torch.no_grad():
        for y, in iterator:
            b = y.size(0)
            elbo, log_p_x_g_z, kl_div = eval_minibatch(x_coord, y, p_net, q_net, rotate=rotate, translate=translate
                                                      , dx_scale=dx_scale, theta_prior=theta_prior
                                                      , use_cuda=use_cuda)

            # .item() turns the scalar tensors into plain Python floats.
            elbo = elbo.item()
            gen_loss = -log_p_x_g_z.item()
            kl_loss = kl_div.item()

            # Incremental weighted mean: mean += b*(value - mean)/c, so a
            # smaller final batch contributes in proportion to its size.
            c += b
            delta = b*(gen_loss - gen_loss_accum)
            gen_loss_accum += delta/c

            delta = b*(elbo - elbo_accum)
            elbo_accum += delta/c

            delta = b*(kl_loss - kl_loss_accum)
            kl_loss_accum += delta/c

    return elbo_accum, gen_loss_accum, kl_loss_accum


In [9]:
# Evaluate the loaded networks on the test set and print a tab-separated
# report. The header lists exactly the four fields written in the row below;
# this single evaluation has no epoch number, so there is no epoch column.
output = sys.stdout
print('\t'.join(['Split', 'ELBO', 'Error', 'KL']), file=output)

elbo_accum, gen_loss_accum, kl_loss_accum = eval_model(
    test_iterator, x_coord, p_net, q_net,
    rotate=True, translate=True,
    dx_scale=dx_scale, theta_prior=theta_prior,
    use_cuda=use_cuda,
)
line = '\t'.join(['test', str(elbo_accum), str(gen_loss_accum), str(kl_loss_accum)])
print(line, file=output)
output.flush()

Epoch	Split	ELBO	Error	KL
test	-215.1652920532226	206.2328868103028	8.932405633926392
