In [1]:
import numpy as np
import os
import urllib
import random
import numpy as np
import gzip
from collections import defaultdict
import pickle
import argparse

import torch
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch.autograd import Variable
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

from __future__ import print_function

import numpy as np
import argparse
import pickle
import os


In [2]:
#!pip install easydict
import easydict
import urllib.request
    
args = easydict.EasyDict({
        "batch": 100,
        "epochs": 5,
        "noise_std": 0.2,
        "data_dir" : "data",
        "seed" : 42,
        "u_costs" : "0.1, 0.1, 0.1, 0.1, 0.1, 10., 1000.",
        "cuda": 1,
        "decay_epoch": 15,
        "num_labelled":100

})

In [3]:
def get_data(filename, directory,
             data_url="http://yann.lecun.com/exdb/mnist/",
             verbose=True):
    if not os.path.exists(directory):
        os.mkdir(directory)
    filepath = os.path.join(directory, filename)
    if not os.path.exists(filepath):
        filepath, _ = urllib.request.urlretrieve(data_url + filename, filepath)
        statinfo = os.stat(filepath)
        if verbose:
            print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
    return filepath


def _read32(bytestream):
    dt = np.dtype(np.uint32).newbyteorder('>')
    return np.frombuffer(bytestream.read(4), dtype=dt)[0]


def extract_images(filename, verbose=True):
    """Extract the images into a 4D uint8 numpy array [index, y, x, depth]."""
    if verbose:
        print('Extracting', filename)
    with gzip.open(filename) as bytestream:
        magic = _read32(bytestream)
        if magic != 2051:
            raise ValueError(
              'Invalid magic number %d in MNIST image file: %s' %
              (magic, filename))
        num_images = _read32(bytestream)
        rows = _read32(bytestream)
        cols = _read32(bytestream)
        buf = bytestream.read(rows * cols * num_images)
        data = np.frombuffer(buf, dtype=np.uint8)
        data = data.reshape(num_images, rows, cols, 1)
        return data


def extract_labels(filename, verbose=True):
    """Extract the labels into a 1D uint8 numpy array [index]."""
    if verbose:
        print('Extracting', filename)
    with gzip.open(filename) as bytestream:
        magic = _read32(bytestream)
        if magic != 2049:
            raise ValueError(
              'Invalid magic number %d in MNIST label file: %s' %
              (magic, filename))
        num_items = _read32(bytestream)
        buf = bytestream.read(num_items)
        labels = np.frombuffer(buf, dtype=np.uint8)
        return labels


def shuffle_images_labels(images, labels):
    assert images.shape[0] == labels.shape[0]
    randomize = np.arange(images.shape[0])
    np.random.shuffle(randomize)
    return images[randomize], labels[randomize]


def dump_pickle(filepath, d):
    with open(filepath, "wb") as f:
        pickle.dump(d, f)


def main():

    n_labelled = args.num_labelled
    random.seed(42)
    np.random.seed(42)
    data_dir = "data/"
    mnist_train_images_gz = 'train-images-idx3-ubyte.gz'
    mnist_train_labels_gz = 'train-labels-idx1-ubyte.gz'
    mnist_test_images_gz = 't10k-images-idx3-ubyte.gz'
    mnist_test_labels_gz = 't10k-labels-idx1-ubyte.gz'

    mnist_train_images = get_data(mnist_train_images_gz, data_dir)
    mnist_train_images = extract_images(mnist_train_images)
    mnist_train_labels = get_data(mnist_train_labels_gz, data_dir)
    mnist_train_labels = extract_labels(mnist_train_labels)
    mnist_test_images = get_data(mnist_test_images_gz, data_dir)
    mnist_test_images = extract_images(mnist_test_images)
    mnist_test_labels = get_data(mnist_test_labels_gz, data_dir)
    mnist_test_labels = extract_labels(mnist_test_labels)

    train_data_shuffle = [(x, y) for x, y in zip(mnist_train_images, mnist_train_labels)]
    random.shuffle(train_data_shuffle)
    mnist_shuffled_train_images = np.array([x[0] for x in train_data_shuffle])
    mnist_shuffled_train_labels = np.array([x[1] for x in train_data_shuffle])

    validation_size = 10000
    train_size = mnist_train_images.shape[0] - validation_size

    train_images = mnist_shuffled_train_images[:train_size].copy()
    train_labels = mnist_shuffled_train_labels[:train_size].copy()

    validation_images = mnist_shuffled_train_images[train_size:].copy()
    validation_labels = mnist_shuffled_train_labels[train_size:].copy()

    test_images = mnist_test_images
    test_labels = mnist_test_labels

    train_data_label_buckets = defaultdict(list)

    for image, label in zip(train_images, train_labels):
        train_data_label_buckets[label].append((image, label))

    num_labels = len(train_data_label_buckets)

    train_labelled_data_images = []
    train_labelled_data_labels = []
    train_unlabelled_data_images = []
    train_unlabelled_data_labels = []

    for label, label_data in train_data_label_buckets.items():
        count = int(n_labelled / num_labels)
        for v in label_data[:count]:
            train_labelled_data_images.append(v[0])
            train_labelled_data_labels.append(v[1])
        for v in label_data[count:]:
            train_unlabelled_data_images.append(v[0])
            # dummy label
            train_unlabelled_data_labels.append(-1)

    train_labelled_images = np.array(train_labelled_data_images)
    train_labelled_labels = np.array(train_labelled_data_labels)

    train_unlabelled_images = np.array(train_unlabelled_data_images)
    train_unlabelled_labels = np.array(train_unlabelled_data_labels)

    train_labelled_images = train_labelled_images[:, :, :, 0]
    train_unlabelled_images = train_unlabelled_images[:, :, :, 0]
    validation_images = validation_images[:, :, :, 0]
    test_images = test_images[:, :, :, 0]

    train_labelled_images, train_labelled_labels = shuffle_images_labels(train_labelled_images, train_labelled_labels)

    # normalizing
    train_labelled_images = np.multiply(train_labelled_images, 1./255.)
    train_unlabelled_images = np.multiply(train_unlabelled_images, 1./255.)
    validation_images = np.multiply(validation_images, 1./255.)
    test_images = np.multiply(test_images, 1./255,)

    print("=" * 50)
    print("train_labelled_images shape:", train_labelled_images.shape)
    print("train_labelled_labels shape:", train_labelled_labels.shape)
    print()
    print("train_unlabelled_images shape:", train_unlabelled_images.shape)
    print("train_unlabelled_labels shape:", train_unlabelled_labels.shape)
    print()
    print("validation_images shape:", validation_images.shape)
    print("validation_labels shape:", validation_labels.shape)
    print()
    print("test_images shape:", test_images.shape)
    print("test_labels shape:", test_labels.shape)
    print("=" * 50)

    print("Dumping pickles")

    dump_pickle(data_dir + "train_labelled_images.p", train_labelled_images)
    dump_pickle(data_dir + "train_labelled_labels.p", train_labelled_labels)
    dump_pickle(data_dir + "train_unlabelled_images.p", train_unlabelled_images)
    dump_pickle(data_dir + "train_unlabelled_labels.p", train_unlabelled_labels)
    dump_pickle(data_dir + "validation_images.p", validation_images)
    dump_pickle(data_dir + "validation_labels.p", validation_labels)
    dump_pickle(data_dir + "test_images.p", test_images)
    dump_pickle(data_dir + "test_labels.p", test_labels)

    print("MNIST dataset successfully created")


if __name__ == "__main__":
    main()

Succesfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting data/train-images-idx3-ubyte.gz
Succesfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting data/train-labels-idx1-ubyte.gz
Succesfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting data/t10k-images-idx3-ubyte.gz
Succesfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting data/t10k-labels-idx1-ubyte.gz
train_labelled_images shape: (100, 28, 28)
train_labelled_labels shape: (100,)

train_unlabelled_images shape: (49900, 28, 28)
train_unlabelled_labels shape: (49900,)

validation_images shape: (10000, 28, 28)
validation_labels shape: (10000,)

test_images shape: (10000, 28, 28)
test_labels shape: (10000,)
Dumping pickles
MNIST dataset successfully created


In [4]:
class Encoder(torch.nn.Module):
    def __init__(self, d_in, d_out, activation_type,
                 train_bn_scaling, noise_level, use_cuda):
        super(Encoder, self).__init__()
        self.d_in = d_in
        self.d_out = d_out
        self.activation_type = activation_type
        self.train_bn_scaling = train_bn_scaling
        self.noise_level = noise_level
        self.use_cuda = use_cuda

        # Encoder
        self.linear = torch.nn.Linear(d_in, d_out, bias=False)
        self.linear.weight.data = torch.randn(self.linear.weight.data.size()) / np.sqrt(d_in)


        self.bn_normalize_clean = torch.nn.BatchNorm1d(d_out, affine=False)
        self.bn_normalize = torch.nn.BatchNorm1d(d_out, affine=False)
        if self.use_cuda:
            self.bn_beta = Parameter(torch.cuda.FloatTensor(1, d_out))
        else:
            self.bn_beta = Parameter(torch.FloatTensor(1, d_out))
        self.bn_beta.data.zero_()
        if self.train_bn_scaling:
            # batch-normalization scaling
            if self.use_cuda:
                self.bn_gamma = Parameter(torch.cuda.FloatTensor(1, d_out))
                self.bn_gamma.data = torch.ones(self.bn_gamma.size()).cuda()
            else:
                self.bn_gamma = Parameter(torch.FloatTensor(1, d_out))
                self.bn_gamma.data = torch.ones(self.bn_gamma.size())

        # Activation
        if activation_type == 'relu':
            self.activation = torch.nn.ReLU()
        elif activation_type == 'softmax':
            self.activation = torch.nn.Softmax()
        else:
            raise ValueError("invalid Acitvation type")

        # buffer for z_pre, z which will be used in decoder cost
        self.buffer_z_pre = None
        self.buffer_z = None
        # buffer for tilde_z which will be used by decoder for reconstruction
        self.buffer_tilde_z = None

    def bn_gamma_beta(self, x):
        if self.use_cuda:
            ones = Parameter(torch.ones(x.size()[0], 1).cuda())
        else:
            ones = Parameter(torch.ones(x.size()[0], 1))
        t = x + ones.mm(self.bn_beta)
        if self.train_bn_scaling:
            t = torch.mul(t, ones.mm(self.bn_gamma))
        return t

    def forward_clean(self, h):
        z_pre = self.linear(h)
  
        self.buffer_z_pre = z_pre.detach().clone()
        z = self.bn_normalize_clean(z_pre)
        self.buffer_z = z.detach().clone()
        z_gb = self.bn_gamma_beta(z)
        h = self.activation(z_gb)
        return h

    def forward_noise(self, tilde_h):
     
        z_pre = self.linear(tilde_h)
        z_pre_norm = self.bn_normalize(z_pre)
        # Add noise
        noise = np.random.normal(loc=0.0, scale=self.noise_level, size=z_pre_norm.size())
        if self.use_cuda:
            noise = Variable(torch.cuda.FloatTensor(noise))
        else:
            noise = Variable(torch.FloatTensor(noise))
   
        tilde_z = z_pre_norm + noise

        self.buffer_tilde_z = tilde_z
        z = self.bn_gamma_beta(tilde_z)
        h = self.activation(z)
        return h

In [111]:
class StackedEncoders(torch.nn.Module):
    def __init__(self, d_in, d_encoders, activation_types,
                 train_batch_norms, noise_std, use_cuda):
        super(StackedEncoders, self).__init__()
        self.buffer_tilde_z_bottom = None
        self.encoders_ref = []
        self.encoders = torch.nn.Sequential()
        self.noise_level = noise_std
        self.use_cuda = use_cuda
        n_encoders = len(d_encoders)
        for i in range(n_encoders):
            if i == 0:
                d_input = d_in
            else:
                d_input = d_encoders[i - 1]
            d_output = d_encoders[i]
            activation = activation_types[i]
            train_batch_norm = train_batch_norms[i]
            encoder_ref = "encoder_" + str(i)
            encoder = Encoder(d_input, d_output, activation, train_batch_norm, noise_std, use_cuda)
            self.encoders_ref.append(encoder_ref)
            self.encoders.add_module(encoder_ref, encoder)

    def forward_clean(self, x):
        h = x
        for e_ref in self.encoders_ref:
            encoder = getattr(self.encoders, e_ref)
            h = encoder.forward_clean(h)
        return h

    def forward_noise(self, x):
        noise = np.random.normal(loc=0.0, scale=self.noise_level, size=x.size())
        if self.use_cuda:
            noise = Variable(torch.cuda.FloatTensor(noise))
        else:
            noise = Variable(torch.FloatTensor(noise))
        h = x + noise
        self.buffer_tilde_z_bottom = h.clone()
        # pass through encoders
        for e_ref in self.encoders_ref:
            encoder = getattr(self.encoders, e_ref)
            h = encoder.forward_noise(h)
        return h

    def get_encoders_tilde_z(self, reverse=True):
        tilde_z_layers = []
        for e_ref in self.encoders_ref:
            encoder = getattr(self.encoders, e_ref)
            tilde_z = encoder.buffer_tilde_z.clone()
            tilde_z_layers.append(tilde_z)
        if reverse:
            tilde_z_layers.reverse()
        return tilde_z_layers

    def get_encoders_z_pre(self, reverse=True):
        z_pre_layers = []
        for e_ref in self.encoders_ref:
            encoder = getattr(self.encoders, e_ref)
            z_pre = encoder.buffer_z_pre.clone()
            z_pre_layers.append(z_pre)
        if reverse:
            z_pre_layers.reverse()
        return z_pre_layers

    def get_encoders_z(self, reverse=True):
        z_layers = []
        for e_ref in self.encoders_ref:
            encoder = getattr(self.encoders, e_ref)
            z = encoder.buffer_z.clone()
            z_layers.append(z)
        if reverse:
            z_layers.reverse()
        return z_layers

In [5]:
class Decoder(torch.nn.Module):
    def __init__(self, d_in, d_out, use_cuda):
        super(Decoder, self).__init__()

        self.d_in = d_in
        self.d_out = d_out
        self.use_cuda = use_cuda

        if self.use_cuda:
            self.a1 = Parameter(0. * torch.ones(1, d_in).cuda())
            self.a2 = Parameter(1. * torch.ones(1, d_in).cuda())
            self.a3 = Parameter(0. * torch.ones(1, d_in).cuda())
            self.a4 = Parameter(0. * torch.ones(1, d_in).cuda())
            self.a5 = Parameter(0. * torch.ones(1, d_in).cuda())

            self.a6 = Parameter(0. * torch.ones(1, d_in).cuda())
            self.a7 = Parameter(1. * torch.ones(1, d_in).cuda())
            self.a8 = Parameter(0. * torch.ones(1, d_in).cuda())
            self.a9 = Parameter(0. * torch.ones(1, d_in).cuda())
            self.a10 = Parameter(0. * torch.ones(1, d_in).cuda())
        else:
            self.a1 = Parameter(0. * torch.ones(1, d_in))
            self.a2 = Parameter(1. * torch.ones(1, d_in))
            self.a3 = Parameter(0. * torch.ones(1, d_in))
            self.a4 = Parameter(0. * torch.ones(1, d_in))
            self.a5 = Parameter(0. * torch.ones(1, d_in))

            self.a6 = Parameter(0. * torch.ones(1, d_in))
            self.a7 = Parameter(1. * torch.ones(1, d_in))
            self.a8 = Parameter(0. * torch.ones(1, d_in))
            self.a9 = Parameter(0. * torch.ones(1, d_in))
            self.a10 = Parameter(0. * torch.ones(1, d_in))


        if self.d_out is not None:
            self.V = torch.nn.Linear(d_in, d_out, bias=False)
            self.V.weight.data = torch.randn(self.V.weight.data.size()) / np.sqrt(d_in)
            # batch-normalization for u
            self.bn_normalize = torch.nn.BatchNorm1d(d_out, affine=False)

        # buffer for hat_z_l to be used for cost calculation
        self.buffer_hat_z_l = None

    def g(self, tilde_z_l, u_l):
        if self.use_cuda:
            ones = Parameter(torch.ones(tilde_z_l.size()[0], 1).cuda())
        else:
            ones = Parameter(torch.ones(tilde_z_l.size()[0], 1))

        b_a1 = ones.mm(self.a1)
        b_a2 = ones.mm(self.a2)
        b_a3 = ones.mm(self.a3)
        b_a4 = ones.mm(self.a4)
        b_a5 = ones.mm(self.a5)

        b_a6 = ones.mm(self.a6)
        b_a7 = ones.mm(self.a7)
        b_a8 = ones.mm(self.a8)
        b_a9 = ones.mm(self.a9)
        b_a10 = ones.mm(self.a10)

        mu_l = torch.mul(b_a1, torch.sigmoid(torch.mul(b_a2, u_l) + b_a3)) + \
               torch.mul(b_a4, u_l) + \
               b_a5

        v_l = torch.mul(b_a6, torch.sigmoid(torch.mul(b_a7, u_l) + b_a8)) + \
              torch.mul(b_a9, u_l) + \
              b_a10

        hat_z_l = torch.mul(tilde_z_l - mu_l, v_l) + mu_l

        return hat_z_l

    def forward(self, tilde_z_l, u_l):

        hat_z_l = self.g(tilde_z_l, u_l)

        self.buffer_hat_z_l = hat_z_l

        if self.d_out is not None:
            t = self.V.forward(hat_z_l)
            u_l_below = self.bn_normalize(t)
            return u_l_below
        else:
            return None


class StackedDecoders(torch.nn.Module):
    def __init__(self, d_in, d_decoders, image_size, use_cuda):
        super(StackedDecoders, self).__init__()
        self.bn_u_top = torch.nn.BatchNorm1d(d_in, affine=False)
        self.decoders_ref = []
        self.decoders = torch.nn.Sequential()
        self.use_cuda = use_cuda
        n_decoders = len(d_decoders)
        for i in range(n_decoders):
            if i == 0:
                d_input = d_in
            else:
                d_input = d_decoders[i - 1]
            d_output = d_decoders[i]
            decoder_ref = "decoder_" + str(i)
            decoder = Decoder(d_input, d_output, use_cuda)
            self.decoders_ref.append(decoder_ref)
            self.decoders.add_module(decoder_ref, decoder)

        self.bottom_decoder = Decoder(image_size, None, use_cuda)

    def forward(self, tilde_z_layers, u_top, tilde_z_bottom):
        # Note that tilde_z_layers should be in reversed order of encoders
        hat_z = []
        u = self.bn_u_top(u_top)
        for i in range(len(self.decoders_ref)):
            d_ref = self.decoders_ref[i]
            decoder = getattr(self.decoders, d_ref)
            tilde_z = tilde_z_layers[i]
            u = decoder.forward(tilde_z, u)
            hat_z.append(decoder.buffer_hat_z_l)
        self.bottom_decoder.forward(tilde_z_bottom, u)
        hat_z_bottom = self.bottom_decoder.buffer_hat_z_l.clone()
        hat_z.append(hat_z_bottom)
        return hat_z

    def bn_hat_z_layers(self, hat_z_layers, z_pre_layers):
        # TODO: Calculate batchnorm using GPU Tensors.
        assert len(hat_z_layers) == len(z_pre_layers)
        hat_z_layers_normalized = []
        for i, (hat_z, z_pre) in enumerate(zip(hat_z_layers, z_pre_layers)):
            if self.use_cuda:
                ones = Variable(torch.ones(z_pre.size()[0], 1).cuda())
            else:
                ones = Variable(torch.ones(z_pre.size()[0], 1))
                #print('onoononononononononoononononononononoononononononononoononononononononoononononononon')
                #print(ones.shape)
            mean = torch.mean(z_pre, 0)
            noise_var = np.random.normal(loc=0.0, scale=1 - 1e-10, size=z_pre.size())
            if self.use_cuda:
                var = np.var(z_pre.data.cpu().numpy() + noise_var, axis=0).reshape(1, z_pre.size()[1])
            else:
                var = np.var(z_pre.data.numpy() + noise_var, axis=0).reshape(1, z_pre.size()[1])
            var = Variable(torch.FloatTensor(var))
            #print('onononononon222222222222222222222222222222222222')
            if self.use_cuda:
                hat_z = hat_z.cpu()
                ones = ones.cpu()
                mean = mean.cpu()
            #print('hat_zzzzz.shpae',hat_z.shape)
            #print('mean_shape',mean.shape)
            #print('torch.sqrt(var).shape',torch.sqrt(var + 1e-10).shape)
            #print(mean)
            #print(var)
            hat_z_normalized = torch.div(hat_z - ones.mm(mean.unsqueeze(0)), ones.mm(torch.sqrt(var + 1e-10)))
            #print('onononononon2222222222222222222222222222222222223333333333333333333333333333333')
            if self.use_cuda:
                hat_z_normalized = hat_z_normalized.cuda()
            hat_z_layers_normalized.append(hat_z_normalized)
        return hat_z_layers_normalized

In [6]:
class Ladder(torch.nn.Module):
    def __init__(self, encoder_sizes, decoder_sizes, encoder_activations,
                 encoder_train_bn_scaling, noise_std, use_cuda):
        super(Ladder, self).__init__()
        self.use_cuda = use_cuda
        decoder_in = encoder_sizes[-1]
        encoder_in = decoder_sizes[-1]
        self.se = StackedEncoders(encoder_in, encoder_sizes, encoder_activations,
                                  encoder_train_bn_scaling, noise_std, use_cuda)
        self.de = StackedDecoders(decoder_in, decoder_sizes, encoder_in, use_cuda)
        self.bn_image = torch.nn.BatchNorm1d(encoder_in, affine=False)

    def forward_encoders_clean(self, data):
        return self.se.forward_clean(data)

    def forward_encoders_noise(self, data):
        return self.se.forward_noise(data)

    def forward_decoders(self, tilde_z_layers, encoder_output, tilde_z_bottom):
        return self.de.forward(tilde_z_layers, encoder_output, tilde_z_bottom)

    def get_encoders_tilde_z(self, reverse=True):
        return self.se.get_encoders_tilde_z(reverse)

    def get_encoders_z_pre(self, reverse=True):
        return self.se.get_encoders_z_pre(reverse)

    def get_encoder_tilde_z_bottom(self):
        return self.se.buffer_tilde_z_bottom.clone()

    def get_encoders_z(self, reverse=True):
        return self.se.get_encoders_z(reverse)

    def decoder_bn_hat_z_layers(self, hat_z_layers, z_pre_layers):
        return self.de.bn_hat_z_layers(hat_z_layers, z_pre_layers)

In [7]:
def evaluate_performance(ladder, valid_loader, e, agg_cost_scaled, agg_supervised_cost_scaled,
                         agg_unsupervised_cost_scaled, args):
    correct = 0.
    total = 0.
    for batch_idx, (data, target) in enumerate(valid_loader):
        if args.cuda:
            data = data.cuda()
        data, target = Variable(data), Variable(target)
        output = ladder.forward_encoders_clean(data)
        # TODO: Do away with the below hack for GPU tensors.
        if args.cuda:
            output = output.cpu()
            target = target.cpu()
        output = output.data.numpy()
        preds = np.argmax(output, axis=1)
        target = target.data.numpy()
        correct += np.sum(target == preds)
        total += target.shape[0]

    print("Epoch:", e + 1, "\t",
          "Total Cost:", "{:.4f}".format(agg_cost_scaled), "\t",
          "Supervised Cost:", "{:.4f}".format(agg_supervised_cost_scaled), "\t",
          "Unsupervised Cost:", "{:.4f}".format(agg_unsupervised_cost_scaled), "\t",
          "Validation Accuracy:", correct / total)

In [8]:
def main():
    # command line arguments

    batch_size = args.batch
    epochs = args.epochs
    noise_std = args.noise_std
    seed = args.seed
    decay_epoch = args.decay_epoch
    if args.cuda and not torch.cuda.is_available():
        print("WARNING: torch.cuda not available, using CPU.\n")
        args.cuda = False

    print("=====================")
    print("BATCH SIZE:", batch_size)
    print("EPOCHS:", epochs)
    print("RANDOM SEED:", args.seed)
    print("NOISE STD:", noise_std)
    print("LR DECAY EPOCH:", decay_epoch)
    print("CUDA:", args.cuda)
    print("=====================\n")

    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.cuda:
        torch.cuda.manual_seed(seed)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

    train_labelled_images_filename = os.path.join(args.data_dir, "train_labelled_images.p")
    train_labelled_labels_filename = os.path.join(args.data_dir, "train_labelled_labels.p")
    train_unlabelled_images_filename = os.path.join(args.data_dir, "train_unlabelled_images.p")
    train_unlabelled_labels_filename = os.path.join(args.data_dir, "train_unlabelled_labels.p")
    validation_images_filename = os.path.join(args.data_dir, "validation_images.p")
    validation_labels_filename = os.path.join(args.data_dir, "validation_labels.p")

    print("Loading Data")
    with open(train_labelled_images_filename,'rb') as f:
        train_labelled_images = pickle.load(f)
    train_labelled_images = train_labelled_images.reshape(train_labelled_images.shape[0], 784)
    with open(train_labelled_labels_filename,'rb') as f:
        train_labelled_labels = pickle.load(f).astype(int)
    with open(train_unlabelled_images_filename,'rb') as f:
        train_unlabelled_images = pickle.load(f)
    train_unlabelled_images = train_unlabelled_images.reshape(train_unlabelled_images.shape[0], 784)
    with open(train_unlabelled_labels_filename,'rb') as f:
        train_unlabelled_labels = pickle.load(f).astype(int)
    with open(validation_images_filename,'rb') as f:
        validation_images = pickle.load(f)
    validation_images = validation_images.reshape(validation_images.shape[0], 784)
    with open(validation_labels_filename,'rb') as f:
        validation_labels = pickle.load(f).astype(int)

    # Create DataLoaders
    unlabelled_dataset = TensorDataset(torch.FloatTensor(train_unlabelled_images), torch.LongTensor(train_unlabelled_labels))
    unlabelled_loader = DataLoader(unlabelled_dataset, batch_size=batch_size, shuffle=True, **kwargs)
    validation_dataset = TensorDataset(torch.FloatTensor(validation_images), torch.LongTensor(validation_labels))
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True, **kwargs)

    # Configure the Ladder
    starter_lr = 0.02
    encoder_sizes = [1000, 500, 250, 250, 250, 10]
    decoder_sizes = [250, 250, 250, 500, 1000, 784]
    unsupervised_costs_lambda = [float(x) for x in args.u_costs.split(",")]
    encoder_activations = ["relu", "relu", "relu", "relu", "relu", "softmax"]
    encoder_train_bn_scaling = [False, False, False, False, False, True]
    ladder = Ladder(encoder_sizes, decoder_sizes, encoder_activations,
                    encoder_train_bn_scaling, noise_std, args.cuda)
    optimizer = Adam(ladder.parameters(), lr=starter_lr)
    loss_supervised = torch.nn.CrossEntropyLoss()
    loss_unsupervised = torch.nn.MSELoss()

    if args.cuda:
        ladder.cuda()

    assert len(unsupervised_costs_lambda) == len(decoder_sizes) + 1
    assert len(encoder_sizes) == len(decoder_sizes)

    print("")
    print("========NETWORK=======")
    print(ladder)
    print("======================")

    print("")
    print("==UNSUPERVISED-COSTS==")
    print(unsupervised_costs_lambda)

    print("")
    print("=====================")
    print("TRAINING\n")

    # TODO: Add annealing of learning rate after 100 epochs

    for e in range(epochs):
        agg_cost = 0.
        agg_supervised_cost = 0.
        agg_unsupervised_cost = 0.
        num_batches = 0
        ladder.train()
        # TODO: Add volatile for the input parameters in training and validation
        ind_labelled = 0
        ind_limit = np.ceil(float(train_labelled_images.shape[0]) / batch_size)

        if e > args.decay_epoch:
            ratio = float(epochs - e) / (epochs - decay_epoch)
            current_lr = starter_lr * ratio
            optimizer = Adam(ladder.parameters(), lr=current_lr)


        for batch_idx, (unlabelled_images, unlabelled_labels) in enumerate(unlabelled_loader):
            if ind_labelled == ind_limit:
                randomize = np.arange(train_labelled_images.shape[0])
                np.random.shuffle(randomize)
                train_labelled_images = train_labelled_images[randomize]
                train_labelled_labels = train_labelled_labels[randomize]
                ind_labelled = 0

            # TODO: Verify whether labelled examples are used for calculating unsupervised loss.

            labelled_start = batch_size * ind_labelled
            labelled_end = batch_size * (ind_labelled + 1)
            ind_labelled += 1
            batch_train_labelled_images = torch.FloatTensor(train_labelled_images[labelled_start:labelled_end])
            batch_train_labelled_labels = torch.LongTensor(train_labelled_labels[labelled_start:labelled_end])

            if args.cuda:
                batch_train_labelled_images = batch_train_labelled_images.cuda()
                batch_train_labelled_labels = batch_train_labelled_labels.cuda()
                unlabelled_images = unlabelled_images.cuda()

            labelled_data = Variable(batch_train_labelled_images, requires_grad=False)
            labelled_target = Variable(batch_train_labelled_labels, requires_grad=False)
            unlabelled_data = Variable(unlabelled_images)

            optimizer.zero_grad()

            # do a noisy pass for labelled data
            output_noise_labelled = ladder.forward_encoders_noise(labelled_data)

            # do a noisy pass for unlabelled_data
            output_noise_unlabelled = ladder.forward_encoders_noise(unlabelled_data)
            tilde_z_layers_unlabelled = ladder.get_encoders_tilde_z(reverse=True)

            # do a clean pass for unlabelled data
            output_clean_unlabelled = ladder.forward_encoders_clean(unlabelled_data)
            z_pre_layers_unlabelled = ladder.get_encoders_z_pre(reverse=True)
            z_layers_unlabelled = ladder.get_encoders_z(reverse=True)

            tilde_z_bottom_unlabelled = ladder.get_encoder_tilde_z_bottom()

            # pass through decoders
            hat_z_layers_unlabelled = ladder.forward_decoders(tilde_z_layers_unlabelled,
                                                              output_noise_unlabelled,
                                                              tilde_z_bottom_unlabelled)

            z_pre_layers_unlabelled.append(unlabelled_data)
            z_layers_unlabelled.append(unlabelled_data)


            bn_hat_z_layers_unlabelled = ladder.decoder_bn_hat_z_layers(hat_z_layers_unlabelled, z_pre_layers_unlabelled)

            # calculate costs
            cost_supervised = loss_supervised.forward(output_noise_labelled, labelled_target)
            cost_unsupervised = 0.
            assert len(z_layers_unlabelled) == len(bn_hat_z_layers_unlabelled)
            for cost_lambda, z, bn_hat_z in zip(unsupervised_costs_lambda, z_layers_unlabelled, bn_hat_z_layers_unlabelled):
                c = cost_lambda * loss_unsupervised.forward(bn_hat_z, z)
                cost_unsupervised += c

            # backprop
            cost = cost_supervised + cost_unsupervised
            cost.backward()
            optimizer.step()
            
            #print(cost.data.item)
            #print(cost.data[0])
            agg_cost += cost.data.item()
            agg_supervised_cost += cost_supervised.data.item()
            agg_unsupervised_cost += cost_unsupervised.data.item()
            num_batches += 1

            if ind_labelled == ind_limit:
                # Evaluation
                ladder.eval()
                evaluate_performance(ladder, validation_loader, e,
                                     agg_cost / num_batches,
                                     agg_supervised_cost / num_batches,
                                     agg_unsupervised_cost / num_batches,
                                     args)
                ladder.train()
    print("=====================\n")
    print("Done :)")





In [116]:
if __name__ == "__main__":
    main()

BATCH SIZE: 100
EPOCHS: 5
RANDOM SEED: 42
NOISE STD: 0.2
LR DECAY EPOCH: 15
CUDA: False

Loading Data

Ladder(
  (se): StackedEncoders(
    (encoders): Sequential(
      (encoder_0): Encoder(
        (linear): Linear(in_features=784, out_features=1000, bias=False)
        (bn_normalize_clean): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        (bn_normalize): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        (activation): ReLU()
      )
      (encoder_1): Encoder(
        (linear): Linear(in_features=1000, out_features=500, bias=False)
        (bn_normalize_clean): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        (bn_normalize): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
        (activation): ReLU()
      )
      (encoder_2): Encoder(
        (linear): Linear(in_features=500, out_features=250, bias=False)
        (bn_normalize_

  h = self.activation(z)
  h = self.activation(z_gb)


Epoch: 1 	 Total Cost: 243.0751 	 Supervised Cost: 2.3188 	 Unsupervised Cost: 240.7563 	 Validation Accuracy: 0.1266
Epoch: 1 	 Total Cost: 233.7150 	 Supervised Cost: 2.2664 	 Unsupervised Cost: 231.4485 	 Validation Accuracy: 0.1971
Epoch: 1 	 Total Cost: 224.3816 	 Supervised Cost: 2.1841 	 Unsupervised Cost: 222.1974 	 Validation Accuracy: 0.354
Epoch: 1 	 Total Cost: 213.5557 	 Supervised Cost: 2.1202 	 Unsupervised Cost: 211.4355 	 Validation Accuracy: 0.4809
Epoch: 1 	 Total Cost: 204.5826 	 Supervised Cost: 2.0646 	 Unsupervised Cost: 202.5180 	 Validation Accuracy: 0.4455
Epoch: 1 	 Total Cost: 193.3409 	 Supervised Cost: 2.0235 	 Unsupervised Cost: 191.3173 	 Validation Accuracy: 0.3997
Epoch: 1 	 Total Cost: 184.1274 	 Supervised Cost: 1.9879 	 Unsupervised Cost: 182.1395 	 Validation Accuracy: 0.4436
Epoch: 1 	 Total Cost: 175.6053 	 Supervised Cost: 1.9559 	 Unsupervised Cost: 173.6494 	 Validation Accuracy: 0.4948
Epoch: 1 	 Total Cost: 167.3514 	 Supervised Cost: 1.9284