### Ladder Network on CIFAR10 ###
1.Iswariya Manivannan<br>
2.Sathiya Ramesh

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import copy

import torchvision
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import torchvision.models as models
import torch.utils.data as utils_data
from torch.utils.data.sampler import SubsetRandomSampler

%matplotlib inline
import matplotlib.pyplot as plt 
import numpy as np

In [2]:
cuda0 = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(cuda0)
print(torch.cuda.is_available())

cuda:1
True


In [3]:
transform = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = dsets.CIFAR10('/opt/datasets/cifar10', train=True, download=True, transform=transform)
testset = dsets.CIFAR10('/opt/datasets/cifar10', train=False, download=True, transform=transform)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
EPOCHS = 10
BATCH_SIZE = 32

In [5]:
trainloader = utils_data.DataLoader(trainset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)
testloader = utils_data.DataLoader(testset, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)

Combinator function as per https://arxiv.org/pdf/1507.02672.pdf page 7.

In [6]:
class Combinator(nn.Module):
    
    def __init__(self, C):
        
        super(Combinator, self).__init__()
        
        self.b_0 = nn.Parameter(torch.zeros(1, C, 1, 1))
        self.b_1 = nn.Parameter(torch.zeros(1, C, 1, 1))
        self.w_0z = nn.Parameter(torch.ones(1, C, 1, 1))
        self.w_1z = nn.Parameter(torch.ones(1, C, 1, 1))
        self.w_0u = nn.Parameter(torch.zeros(1, C, 1, 1))
        self.w_1u = nn.Parameter(torch.zeros(1, C, 1, 1))
        self.w_0zu = nn.Parameter(torch.zeros(1, C, 1, 1))
        self.w_1zu = nn.Parameter(torch.zeros(1, C, 1, 1))
        self.w_sig = nn.Parameter(torch.ones(1, C, 1, 1))

    def forward(self, zl, ul):
        
        
        tem = self.b_1.repeat(zl.size(0), 1, 1, 1) + self.w_1z.repeat(zl.size(0), 1,1,1) * zl \
              + self.w_1u.repeat(zl.size(0), 1,1,1) * ul + self.w_1zu.repeat(zl.size(0), 1,1,1) * zl * ul
        out = self.b_0.repeat(zl.size(0), 1,1,1) + self.w_0z.repeat(zl.size(0), 1,1,1) * zl + \
              self.w_0u.repeat(zl.size(0), 1,1,1) * ul + self.w_0zu.repeat(zl.size(0), 1,1,1) * zl * ul + tem.sigmoid_()
        return out
        



Ladder Network architecture for CIFAR1 as per "Conv-large model" in https://arxiv.org/pdf/1507.02672.pdf page 17

In [7]:
class LadderNetwork(nn.Module):
    
    def __init__(self):
        
        super(LadderNetwork, self).__init__()
        
        self._noise_factor = 0.3
        
    #Encoder layers common for both noisy encoder and clean encoder
    
        self._bnorm0 = nn.BatchNorm2d(3)
        self._conv1 = nn.Conv2d(3, 96, 3, 1, 1)
        self._bnorm1 = nn.BatchNorm2d(96)
        self._bnorm_corr_1 = nn.BatchNorm2d(96)
        self._lrelu1 = nn.LeakyReLU()
        
        self._conv2 = nn.Conv2d(96, 96, 3, 1, 1)
        self._bnorm2 = nn.BatchNorm2d(96)
        self._bnorm_corr_2 = nn.BatchNorm2d(96)
        self._lrelu2 = nn.LeakyReLU()
        
        self._conv3 = nn.Conv2d(96, 192, 3, 1, 1)
        self._bnorm3 = nn.BatchNorm2d(192)
        self._bnorm_corr_3 = nn.BatchNorm2d(192)
        self._lrelu3 = nn.LeakyReLU()
        self._mpool3 = nn.MaxPool2d(2)
        
        
        self._conv4 = nn.Conv2d(192, 192, 3, 1, 1)
        self._bnorm4 = nn.BatchNorm2d(192)
        self._bnorm_corr_4 = nn.BatchNorm2d(192)
        self._lrelu4 = nn.LeakyReLU()
        
        self._conv5 = nn.Conv2d(192, 192, 3, 1, 1)
        self._bnorm5 = nn.BatchNorm2d(192)
        self._bnorm_corr_5 = nn.BatchNorm2d(192)
        self._lrelu5 = nn.LeakyReLU()
        
        self._conv6 = nn.Conv2d(192, 192, 3, 1, 1)
        self._bnorm6 = nn.BatchNorm2d(192)
        self._bnorm_corr_6 = nn.BatchNorm2d(192)
        self._lrelu6 = nn.LeakyReLU()
        self._mpool6 = nn.MaxPool2d(2)
        
        
        self._conv7 = nn.Conv2d(192, 192, 3, 1, 1)
        self._bnorm7 = nn.BatchNorm2d(192)
        self._bnorm_corr_7 = nn.BatchNorm2d(192)
        self._lrelu7 = nn.LeakyReLU()
        
        self._conv8 = nn.Conv2d(192, 192, 1, 1)
        self._bnorm8 = nn.BatchNorm2d(192)
        self._bnorm_corr_8 = nn.BatchNorm2d(192)
        self._lrelu8 = nn.LeakyReLU()
        
        self._conv9 = nn.Conv2d(192, 10, 1, 1)
        self._bnorm9 = nn.BatchNorm2d(10)
        self._bnorm_corr_9 = nn.BatchNorm2d(10)
        self._lrelu9 = nn.LeakyReLU()
        
        
        
        # Decoder layers
        
        self._tconv9 = nn.ConvTranspose2d(10, 192, 1, 1)
        self._tlrelu9 = nn.LeakyReLU()
        
        self._tconv8 = nn.ConvTranspose2d(192, 192, 1, 1)
        self._tlrelu8 = nn.LeakyReLU()
        
        self._tconv7 = nn.ConvTranspose2d(192, 192, 3, 1, 1)
        self._tlrelu7 = nn.LeakyReLU()
        
        self._tconv6 = nn.ConvTranspose2d(192, 192, 3, 1, 1)
        self._tlrelu6 = nn.LeakyReLU()

        self._tconv5 = nn.ConvTranspose2d(192, 192, 3, 2, 1, 1)
        self._tlrelu5 = nn.LeakyReLU()

        self._tconv4 = nn.ConvTranspose2d(192, 192, 3, 2, 1, 1)
        self._tlrelu4 = nn.LeakyReLU()

        self._tconv3 = nn.ConvTranspose2d(192, 96, 3, 1, 1)
        self._tlrelu3 = nn.LeakyReLU()
        
        self._tconv2 = nn.ConvTranspose2d(96, 96, 3, 1, 1)
        self._tlrelu2 = nn.LeakyReLU()
        
        self._tconv1 = nn.ConvTranspose2d(96, 3, 3, 1, 1)
        self._tlrelu1 = nn.LeakyReLU()
        
        self.comb0 = Combinator(3)
        self.comb1 = Combinator(96)
        self.comb2 = Combinator(96)
        self.comb3 = Combinator(192)
        self.comb4 = Combinator(192)
        self.comb5 = Combinator(192)
        self.comb6 = Combinator(192)
        self.comb7 = Combinator(192)
        self.comb8 = Combinator(192)
        self.comb9 = Combinator(10)
        
    
    
    def init_buffers(self):
        
        self.clean_encoder_output = []
        self.noisy_encoder_output = []
        self.decoder_output = []
        self.encoder_mean = []
        self.encoder_std = []
        
        return
    
    
    # Function to normalize decoder outputs
    def normalize_decoder_output(self, x, mean, std):
        
        
        x = x - mean.repeat(x.size(0), x.size(2), x.size(3), 1).permute(0,3,1,2)
        x = x / (std.repeat(x.size(0),x.size(2), x.size(3), 1).permute(0,3,1,2) + 1e-5)
        
        return x  
    
       
    def clean_encoder(self, x):
        
        
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm0(x)
        self.clean_encoder_output.append(x.clone().detach())
    
        
        x = self._conv1(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm1(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_1(x)
        x = self._lrelu1(x)
        
        x = self._conv2(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm2(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_2(x)
        x = self._lrelu2(x)
        
        x = self._conv3(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm3(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_3(x)
        x = self._lrelu3(x)
        x = self._mpool3(x)
        
        x = self._conv4(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm4(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_4(x)
        x = self._lrelu4(x)
        
        
        x = self._conv5(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm5(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_5(x)
        x = self._lrelu5(x)
        
        x = self._conv6(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm6(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_6(x)
        x = self._lrelu6(x)
        x = self._mpool6(x)
        
        x = self._conv7(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm7(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_7(x)
        x = self._lrelu7(x)
        
        x = self._conv8(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm8(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_8(x)
        x = self._lrelu8(x)
        
        x = self._conv9(x)
        self.encoder_mean.append(x.mean(dim=0).mean(dim=1).mean(dim=1))
        self.encoder_std.append(x.std(dim=0).mean(dim=1).mean(dim=1))
        x = self._bnorm9(x)
        self.clean_encoder_output.append(x.clone().detach())
        x = self._bnorm_corr_9(x)
        x = self._lrelu9(x)

        
        return x
    
    
    
    def noisy_encoder(self, x):
        
        
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        
        x = self._conv1(x)
        x = self._bnorm1(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_1(x)
        x = self._lrelu1(x)
        
        
        x = self._conv2(x)
        x = self._bnorm2(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_2(x)
        x = self._lrelu2(x)
        
        
        x = self._conv3(x)
        x = self._bnorm3(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_3(x)
        x = self._lrelu3(x)
        x = self._mpool3(x)
        
        
        x = self._conv4(x)
        x = self._bnorm4(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_4(x)
        x = self._lrelu4(x)
        x = self._mpool6(x)
        
        x = self._conv5(x)
        x = self._bnorm5(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_5(x)
        x = self._lrelu5(x)
        
        x = self._conv6(x)
        x = self._bnorm6(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_6(x)
        x = self._lrelu6(x)
        
        x = self._conv7(x)
        x = self._bnorm7(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_7(x)
        x = self._lrelu7(x)
        
        x = self._conv8(x)
        x = self._bnorm8(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_8(x)
        x = self._lrelu8(x)
        
        
        x = self._conv9(x)
        x = self._bnorm9(x)
        x = x + self._noise_factor * torch.randn(x.size()).to(cuda0)
        self.noisy_encoder_output.append(x.clone())
        x = self._bnorm_corr_9(x)
        x = self._lrelu9(x)
        
        return x
    
    
    
    def decoder(self, x):
        
        
        x = self._bnorm9(x)
        
        x = self.comb9.forward(self.noisy_encoder_output[9], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(),
                                    self.encoder_mean[-1], self.encoder_std[-1]))
        
        
        x = self._tconv9(x)
        
        x = self._tlrelu9(x)
        x = self._bnorm8(x)
        
        x = self.comb8.forward(self.noisy_encoder_output[8], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-2], self.encoder_std[-2]))
        
        
        x = self._tconv8(x)
        x = self._tlrelu8(x)
        x = self._bnorm7(x)
        
        x = self.comb7.forward(self.noisy_encoder_output[7], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-3], self.encoder_std[-3]))
        
        
        x = self._tconv7(x)
        x = self._tlrelu7(x)
        x = self._bnorm6(x)
        x = self.comb6.forward(self.noisy_encoder_output[6], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-4], self.encoder_std[-4]))
        
        
        x = self._tconv6(x)
        x = self._tlrelu6(x)
        x = self._bnorm5(x)
        x = self.comb5.forward(self.noisy_encoder_output[5], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-5], self.encoder_std[-5]))
        
        
        x = self._tconv5(x)
        x = self._tlrelu5(x)
        x = self._bnorm4(x)
        x = self.comb4.forward(self.noisy_encoder_output[4], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-6], self.encoder_std[-6]))
        
        
        x = self._tconv4(x)
        x = self._tlrelu4(x)
        x = self._bnorm3(x)
        x = self.comb3.forward(self.noisy_encoder_output[3], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-7], self.encoder_std[-7]))
        
        
        x = self._tconv3(x)
        x = self._tlrelu3(x)
        x = self._bnorm2(x)
        
        x = self.comb2.forward(self.noisy_encoder_output[2], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-8], self.encoder_std[-8]))
        
        
        x = self._tconv2(x)
        x = self._tlrelu2(x)
        x = self._bnorm1(x)
        
        x = self.comb1.forward(self.noisy_encoder_output[1], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-9], self.encoder_std[-9]))
        
        
        x = self._tconv1(x)
        x = self._tlrelu1(x)
        
        x = self.comb0.forward(self.noisy_encoder_output[0], x)
        self.decoder_output.append(self.normalize_decoder_output(x.clone(), 
                                   self.encoder_mean[-10], self.encoder_std[-10]))
        
        
        return x
    
    
    def forward(self, x, phase="Training"):
        
        self.init_buffers()
        
        if phase=="Test":
            
            x = self.clean_encoder(orig_x)
            y_hat = F.log_softmax(x)
            
            return y_hat
        
        orig_x = x.clone()
        _ = self.clean_encoder(orig_x)
        x = self.noisy_encoder(x)

        y_hat = F.log_softmax(x)
        x = self.decoder(x)
        
        
        
        return y_hat
        
        
    def get_layer_values(self):
        
        return self.noisy_encoder_output, self.clean_encoder_output, self.decoder_output
    

Function to train unlabelled data using the MSE loss. This is the reconstruction loss between the decoder and the clean encoder outputs.

In [8]:
def train_unlabelled(model, optimizer, denoising_cost, **kwargs):
    
    
    criterion = nn.MSELoss().to(cuda0)
    
    running_loss = 0
    loss = 0.
    
    for i, (images, labels) in enumerate(kwargs['dataloader']):
        
        
        # Loading data to GPU

        Images = images.to(cuda0)
        Labels = labels.to(cuda0)
        
        y_pred = model(Images)
        noisy_encoder, clean_encoder, decoder = model.get_layer_values()

        for cost, encoder, decoder in zip(denoising_cost, clean_encoder, reversed(decoder)):
            
            loss += cost * criterion(decoder, encoder)

        running_loss += loss
            
        
        if kwargs['phase'] == 'Training':
            #Backward
            optimizer.zero_grad()
            print(f"Iterations: {i}")
            

            loss.backward(retain_graph = True) # Using retain_graph because of multiple forward passes
            

            #Update weights
            optimizer.step()
          
    return running_loss/len(kwargs['dataloader'])

In [9]:
model = LadderNetwork().to(cuda0)
learning_rate = 0.0001    

optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [10]:
train_loss_values = []
train_acc_values = []

least_loss = 100


denoising_cost = [1000, 10, 0.1, 0.1, 0.1]

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience =1)

for epoch in range(25):
    
    train_loss, _ = train_unlabelled(model, optimizer, denoising_cost, dataloader = trainloader,  phase = 'Training', lr_range_val = [], lr_finder = False)  # Training set accuracy
    train_loss_values.append(train_loss)
    
    if train_loss < least_loss:
        least_loss = train_loss
        encoder_checkpoint = copy.deepcopy(encoder.state_dict())
        decoder_checkpoint = copy.deepcopy(decoder.state_dict())
        
    #if epoch % 5 ==0:
    print(f'Epoch: {epoch}   Train Loss: {train_loss.cpu().numpy():.5f} ')
        
    scheduler.step(train_loss)



Iterations: 0
Iterations: 1
Iterations: 2
Iterations: 3
Iterations: 4


RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1524586445097/work/aten/src/THC/generic/THCStorage.cu:58

Training with labelled and data uses a loss function which is a combination of both the reconstruction MSE loss and the cross-entropy loss. The output from the noisy encoder is compared with the true label in the cross-entropy loss and the MSE loss involves the output from the decoder and the clean encoder.

In [None]:
def train(model, optimizer, denoising_cost, **kwargs):
    
    
    criterion_1 = nn.NLLLoss().to(cuda0) # Cost function for labelled data
    criterion_2 = nn.MSELoss().to(cuda0)
    
    running_loss = 0

    
    for i, (images, labels) in enumerate(kwargs['dataloader']):
        
        
        # Loading data to GPU

        Images = images.to(cuda0)
        Labels = labels.to(cuda0)
        
        y_pred = model(Images)
        noisy_encoder, clean_encoder, decoder = model.get_layer_values()
        
        for cost, encoder, decoder in zip(denoising_cost, clean_encoder, reversed(decoder)):
            
            loss_1 += cost * criterion_2(decoder, encoder)
            
        loss_2 = criterion_1(y_pred, Labels)
        
        total_loss = loss_1 + loss_2
        running_loss += loss.data[0]
            
        
        if kwargs['phase'] == 'Training':
            #Backward
            optimizer.zero_grad()
            total_loss.backward()

            #Update weights
            optimizer.step()
          
    return running_loss/len(kwargs['dataloader'])