## Model: Regular Autoencoder with contractive loss (3 versions)

In [1]:
import torch
from torch import nn
from torch import optim
from torch.autograd import grad, Variable,functional
import torchvision
import torchvision.transforms as transforms


# ---------------------------------Model----------------------------------------------
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)
class Reshape(nn.Module):
    def forward(self, input):
        return input.view(-1,512,8,8)

class Regular_AE(nn.Module):
    def __init__(self,laten_dims=64):
        super(Regular_AE, self).__init__()
        self.laten_dims = laten_dims
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 128, 4, stride=2, padding=1), 
            nn.BatchNorm2d(128),           
            nn.ReLU(),
            nn.Conv2d(128, 256, 4, stride=2, padding=1),     
            nn.BatchNorm2d(256),     
            nn.ReLU(),
			nn.Conv2d(256, 512, 4, stride=2, padding=1),  
            nn.BatchNorm2d(512),         
            nn.ReLU(),
            Flatten(),
            nn.Linear(4*4*512,self.laten_dims)
        )
        self.decoder = nn.Sequential(
            nn.Linear(laten_dims,8*8*512),
            Reshape(),
			nn.ConvTranspose2d(512, 256, 4, stride=2, padding=1),  # [batch, 24, 8, 8]
            nn.BatchNorm2d(256),
            nn.ReLU(),
			nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1),  # [batch, 12, 16, 16]
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 3, 1, stride=1, padding=0),   # [batch, 3, 32, 32]
            nn.Sigmoid(),
        )

        self.classifer = nn.Sequential(
            nn.Linear(self.laten_dims,64),
            nn.Linear(64,32),
            nn.Linear(32,10)
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        prediction = self.classifer(encoded)
        return encoded, decoded, prediction
    



# Contractive loss -- three versions 
### The first one calculates the loss element by element
### The second one calculates the loss by $torch.autograd.grad(output,inputs,grad_output = torch.ones(output.size()))$ to make it calculate derivatives from a tensor to a tensor
### The third one calculates the loss by $torch.autograd.functional.jacobian$


In [14]:
def ctr_lossv1(x, encoding):
    contractive_loss = 0.0
    for encoding_i in encoding:
        grads = grad(encoding_i,x,create_graph=True)    
        print(grads[0].shape)
        list_grad.append(grads[0]) 
        contractive_loss  += sum([grd.norm()**2 for grd in grads[0]])
    return contractive_loss

def ctr_sum(x_batch,encoding_batch):
    sum_ctr = 0.0
    for i in range(len(x_batch)):
        sum_ctr += ctr_lossv1(x_batch,encoding_batch[i])
    return sum_ctr

def ctr_lossv2(x, encoding):
    contractive_loss = 0.0
    grads = grad(encoding,x,torch.ones(encoding.size()),create_graph=True)
    contractive_loss  += sum([grd.norm()**2 for grd in grads[0]])

    return contractive_loss

def ctr_lossv3(x,function):
    matrix = functional.jacobian(function,x,create_graph=True)
    return sum([grd.norm()**2 for grd in matrix])



### $z = [z_1, z_2, ...z_{latent\ dims}]$ --> $z = z_1+z_2+...+z_{latent\ dims}$

### Test the loss functions


In [15]:
net = Regular_AE()
list_grad = []
# batchsize = 4

# transform = transforms.Compose([
#     transforms.ToTensor(),
#     # transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
#     ])

# trainset = torchvision.datasets.CIFAR10(root='./data÷sets',train=True, download=False,transform=transform)
# trainloader = torch.utils.data.DataLoader(trainset, batch_size=batchsize, shuffle=True) #working in Windows, no need for num_workers = 2

criterion = nn.BCELoss()
criterion2 = nn.CrossEntropyLoss()
num_epochs = 100
wd = 5e-04
optimizer = optim.Adam(net.parameters(),weight_decay=wd, lr=0.001)


# for i, data in enumerate(trainloader,0):
x = Variable(torch.rand((2,3,32,32)), requires_grad=True)            
optimizer.zero_grad()
encoding, decoding, prediction = net(x)


fc = ctr_sum(x,encoding)
fc2 = ctr_lossv2(x, encoding)
fc3 = ctr_lossv3(x, net.encoder)
fc3.backward()

recons_loss = criterion(decoding,x.detach())
# loss = criterion2(prediction,torch.ones(2)) 
loss = recons_loss
# print(x.grad)
loss.backward()
# print(x.grad)
x.grad = None
# Before the optimizer, x.grad needs to be None
optimizer.step()




print(fc)
print(fc2)
print(fc3)



torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
torch.Size([2, 3, 32, 32])
t

In [13]:
print(list_grad[1])

tensor([[[[-2.6569e-04, -2.5591e-04, -9.0400e-03,  ...,  1.3941e-03,
            1.6072e-03, -4.3095e-03],
          [-7.9071e-03, -1.4946e-03, -1.1981e-02,  ..., -1.5065e-03,
           -1.0530e-02,  1.7524e-02],
          [-2.4777e-03,  1.8861e-03, -8.7394e-03,  ...,  1.9678e-02,
            7.5352e-03, -1.8885e-02],
          ...,
          [ 1.6650e-02, -1.7192e-02,  2.0289e-02,  ...,  1.7918e-02,
           -5.4475e-03, -7.1331e-03],
          [-2.6177e-03,  2.8667e-02, -3.4633e-02,  ..., -2.5768e-02,
           -4.5416e-03, -3.6562e-03],
          [-3.4567e-03,  4.4974e-03, -2.1725e-03,  ..., -6.8672e-03,
            6.8729e-03, -4.0374e-03]],

         [[ 3.0583e-03,  9.0307e-03,  1.3011e-02,  ..., -9.4028e-03,
           -2.4438e-03, -1.7759e-03],
          [-2.6144e-03,  2.7208e-02,  4.3871e-03,  ..., -1.2303e-03,
           -1.4556e-02,  1.6034e-03],
          [-1.0238e-02,  4.1496e-03, -4.6915e-02,  ...,  1.3503e-02,
            1.8471e-02, -9.5419e-03],
          ...,
     

### 1.The batch size cannot be large, otherwise the GPU is out of memory (Heavy computation)
### 2. Version 2 is not correct; it calcultes $\partial (z_1+z_2+...+z_{dim}) /\partial x_i$, where $z$ is a scalar not a vector. The size of the calculated gradients is the same as the input's size
### 3. The difference between the losses calculated by version 1 and version 3 is small when the batch size is small.
#### What to notice is: the gradients of inputs need to be cleaned before $optimizer.step()$, otherwise the inputs will be updated, just like adding adversarial perturbations.

In [23]:
batchsize = 16

transform = transforms.Compose([
    transforms.ToTensor(),
    # transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
    ])

# trainset = torchvision.datasets.CIFAR10(root='./data÷sets',train=True, download=False,transform=transform)
# trainloader = torch.utils.data.DataLoader(trainset, batch_size=batchsize, shuffle=True) #working in Windows, no need for num_workers = 2

criterion = nn.BCELoss()
criterion2 = nn.CrossEntropyLoss()
num_epochs = 100
wd = 5e-04
optimizer = optim.Adam(net.parameters(),weight_decay=wd, lr=0.001)


# for i, data in enumerate(trainloader,0):
x = Variable(torch.rand((batchsize,3,32,32)), requires_grad=True)            
optimizer.zero_grad()
encoding, decoding, prediction = net(x)


fc = ctr_sum(x,encoding)
fc2 = ctr_lossv2(x, encoding)
fc3 = ctr_lossv3(x, net.encoder)
fc3.backward()

recons_loss = criterion(decoding,x.detach())
# loss = criterion2(prediction,torch.ones(2)) 
loss = recons_loss
# print(x.grad)
loss.backward()
# print(x.grad)
x.grad = None
# Before the optimizer, x.grad needs to be None
optimizer.step()



In [25]:
print(fc)
print(fc2)
print(fc3)

tensor(149.8658, grad_fn=<AddBackward0>)
tensor(131.9954, grad_fn=<AddBackward0>)
tensor(149.5394, grad_fn=<AddBackward0>)


## Another Observation: 
### The contractive loss is calulated from $\frac{\partial encoding_{x_i}}{\partial x_i}$ 
### And $\frac{\partial encoding_{x_i}}{\partial x_j} = 0$, where $i \neq j$
### But it is observed that the derivative of the encoding of image $x_i$ with respect to other images in the same batch $x_j$  is not zero. 
### Therefore, it raises a problem: whether to force images in the same batch belonging to the same category or just similar to contrastive loss, to make images with different augmentations in the same batch?