## Imports

In [15]:
from __future__ import division, print_function, absolute_import
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset

## Helper functions
Helper functions borrowed from original paper by Li et al. 

In [16]:
def makedirs(path):
    '''
    if path does not exist in the file system, create it
    '''
    if not os.path.exists(path):
        os.makedirs(path)

def list_of_distances(X, Y):
    '''
    Given a list of vectors, X = [x_1, ..., x_n], and another list of vectors,
    Y = [y_1, ... , y_m], we return a list of vectors
            [[d(x_1, y_1), d(x_1, y_2), ... , d(x_1, y_m)],
             ...
             [d(x_n, y_1), d(x_n, y_2), ... , d(x_n, y_m)]],
    where the distance metric used is the sqared euclidean distance.
    The computation is achieved through a clever use of broadcasting.
    '''
    XX = torch.reshape(list_of_norms(X), shape=(-1, 1))
    YY = torch.reshape(list_of_norms(Y), shape=(1, -1))
    output = XX + YY - 2 * torch.mm(X, torch.transpose(Y))

    return output

def list_of_norms(X):
    '''
    X is a list of vectors X = [x_1, ..., x_n], we return
        [d(x_1, x_1), d(x_2, x_2), ... , d(x_n, x_n)], where the distance
    function is the squared euclidean distance.
    '''
    return tf.reduce_sum(np.pow(X, 2), axis=1)

def print_and_write(str, file):
    '''
    print str to the console and also write it to file
    '''
    print(str)
    file.write(str + '\n')

## Create necessary folders

In [17]:
# data folder
makedirs('./data/mnist')

# Models folder
model_folder = os.path.join(os.getcwd(), "saved_model", "mnist_model", "mnist_cae_1")
makedirs(model_folder)

# Image folder
img_folder = os.path.join(model_folder, "img")
makedirs(img_folder)

# Model filename
model_filename = "mnist_cae"

## Dataset - Pytorch
#### <font color='red'>Double check the normalization mean and stdev for dataset</font>
#### <font color='red'>Double check parameters Dataloader (e.g. shuffle on or off, different batch sizes for train/valid/test)</font>

In [23]:
# Transforms to perform on loaded dataset. Normalize around mean 0.1307 and std 0.3081 for optimal pytorch results. 
# source: https://discuss.pytorch.org/t/normalization-in-the-mnist-example/457/4
transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor(), torchvision.transforms.Normalize((0.1307,),(0.3081,))])

# Load datasets into reproduction/data/mnist. Download if data not present. 
mnist_train = DataLoader(torchvision.datasets.MNIST('./data/mnist', train=True, download=True, transform=transforms))

mnist_train_data = mnist_train.dataset.data
mnist_train_targets = mnist_train.dataset.targets

# first 55000 examples for training
x_train = mnist_train_data[0:55000]
y_train = mnist_train_targets[0:55000]

# 5000 examples for validation set
x_valid = mnist_train_data[55000:60000]
y_valid = mnist_train_targets[55000:60000]

# 10000 examples in test set

mnist_test = DataLoader(torchvision.datasets.MNIST('./data/mnist', train=False, download=True, transform=transforms))

x_test = mnist_test.dataset.data
y_test = mnist_test.dataset.targets

train_data = TensorDataset(x_train, y_train)
valid_data = TensorDataset(x_valid, y_valid)
test_data = TensorDataset(x_test, y_test)

batch_size = 250

# Datasets in DataLoader, can be used in iteration: for (x, y) in train_dl...
# Check parameters
train_dl = DataLoader(train_data, batch_size=batch_size, drop_last=False, shuffle=False)
valid_dl = DataLoader(valid_data, batch_size=batch_size, drop_last=False, shuffle=False)
test_dl = DataLoader(test_data, batch_size=batch_size, drop_last=False, shuffle=False)

## Parameters

In [4]:
# COPIED FROM THE ORIGINAL IMPLEMENTATION
# training parameters
learning_rate = 0.002
training_epochs = 1500

# frequency of testing and saving
test_display_step = 100   # how many epochs we do evaluate on the test set once
save_step = 50            # how frequently do we save the model to disk

# elastic deformation parameters
sigma = 4
alpha = 20

# lambda's are the ratios between the four error terms
lambda_class = 20
lambda_ae = 1 # autoencoder
lambda_1 = 1 # push prototype vectors to have meaningful decodings in pixel space
lambda_2 = 1 # cluster training examples around prototypes in latent space


input_height = input_width =  28    # MNIST data input shape 
n_input_channel = 1     # the number of color channels; for MNIST is 1.
input_size = input_height * input_width * n_input_channel   # 784
n_classes = 10

# Network Parameters
n_prototypes = 15         # the number of prototypes
n_layers = 4

# height and width of each layers' filters
f_1 = 3
f_2 = 3
f_3 = 3
f_4 = 3

# stride size in each direction for each of the layers
s_1 = 2
s_2 = 2
s_3 = 2
s_4 = 2

# number of feature maps in each layer
n_map_1 = 32
n_map_2 = 32
n_map_3 = 32
n_map_4 = 10

# the shapes of each layer's filter
# [out channel, in_channel, 3, 3]
filter_shape_1 = [n_map_1, n_input_channel, f_1, f_1]
filter_shape_2 = [n_map_2, n_map_1, f_2, f_2]
filter_shape_3 = [n_map_3, n_map_2, f_3, f_3]
filter_shape_4 = [n_map_4, n_map_3, f_4, f_4]

# strides for each layer (changed to tuples)
stride_1 = [s_1, s_1]
stride_2 = [s_2, s_2]
stride_3 = [s_3, s_3]
stride_4 = [s_4, s_4]


## Initialize encoder and decoder

In [5]:
std_weights = 0.01

weights = {
    'enc_f1': nn.Parameter(std_weights * torch.randn(filter_shape_1,
                                           dtype=torch.float32)),
    'enc_f2': nn.Parameter(std_weights * torch.randn(filter_shape_2,
                                           dtype=torch.float32)), 
    'enc_f3': nn.Parameter(std_weights * torch.randn(filter_shape_3,
                                           dtype=torch.float32)), 
    'enc_f4': nn.Parameter(std_weights * torch.randn(filter_shape_4,
                                           dtype=torch.float32)), 
    'dec_f4': nn.Parameter(std_weights * torch.randn(filter_shape_4,
                                           dtype=torch.float32)), 
    'dec_f3': nn.Parameter(std_weights * torch.randn(filter_shape_3,
                                           dtype=torch.float32)), 
    'dec_f2': nn.Parameter(std_weights * torch.randn(filter_shape_2,
                                           dtype=torch.float32)),
    'dec_f1': nn.Parameter(std_weights * torch.randn(filter_shape_1,
                                           dtype=torch.float32)),
}


biases = {
    'enc_b1': nn.Parameter(torch.zeros([n_map_1], dtype=torch.float32)),
    'enc_b2': nn.Parameter(torch.zeros([n_map_2], dtype=torch.float32)),
    'enc_b3': nn.Parameter(torch.zeros([n_map_3], dtype=torch.float32)),
    'enc_b4': nn.Parameter(torch.zeros([n_map_4], dtype=torch.float32)),
    'dec_b4': nn.Parameter(torch.zeros([n_map_3], dtype=torch.float32)),
    'dec_b3': nn.Parameter(torch.zeros([n_map_2], dtype=torch.float32)),
    'dec_b2': nn.Parameter(torch.zeros([n_map_1], dtype=torch.float32)),
    'dec_b1': nn.Parameter(torch.zeros([n_input_channel], dtype=torch.float32)),
}

last_layer = {
    'w': nn.Parameter(torch.randn([n_prototypes, n_classes],
                                       dtype=torch.float32))
}


### Print shapes of all parameters

In [6]:
# Printing shapes of all parameters
print("weights")
for weight in weights.keys():
    print(weight, weights[weight].shape)
print("biases")
for b in biases.keys():
    print(b, biases[b].shape)
print("last_layer")
print(last_layer['w'].shape)

weights
enc_f1 torch.Size([32, 1, 3, 3])
enc_f2 torch.Size([32, 32, 3, 3])
enc_f3 torch.Size([32, 32, 3, 3])
enc_f4 torch.Size([10, 32, 3, 3])
dec_f4 torch.Size([10, 32, 3, 3])
dec_f3 torch.Size([32, 32, 3, 3])
dec_f2 torch.Size([32, 32, 3, 3])
dec_f1 torch.Size([32, 1, 3, 3])
biases
enc_b1 torch.Size([32])
enc_b2 torch.Size([32])
enc_b3 torch.Size([32])
enc_b4 torch.Size([10])
dec_b4 torch.Size([32])
dec_b3 torch.Size([32])
dec_b2 torch.Size([32])
dec_b1 torch.Size([1])
last_layer
torch.Size([15, 10])


## Layer functions

In [7]:
def conv_layer(input, filter, bias, strides, padding="VALID",
               nonlinearity = nn.ReLU()):
    conv = F.conv2d(input, filter, bias=bias, stride=strides,
       padding=padding)
    out = nonlinearity(conv)
    return out
#### STRIDE MUST BE TUPLE FOR TORCH, IS A LIST IN TENSORFLOW
#### PADDING IS DIFFERENT, TF USES SAME/VALID, TORCH A INT OR LIST OF INTS
### IS THE FILTER THE SAME AS WEIGHTS ARGUMENT FOR THE CONV2D?

# tensorflow's conv2d_transpose needs to know the shape of the output
def deconv_layer(input, filter, bias, strides, padding="VALID",
                 nonlinearity=nn.ReLU()):
    deconv = F.conv_transpose2d(input, filter, bias=bias, stride=strides,
                                padding=padding)
    out = nonlinearity(deconv)
    return out

def fc_layer(input, weight, bias, nonlinearity=nn.ReLU()):
    return nonlinearity(torch.mm(input, weight) + bias)

## Model construction

In [8]:
X = torch.empty(batch_size, n_input_channel, input_width, input_height)

### Encoder

In [9]:
PADDING_FLAG = 1
# eln means the output of the nth layer of the encoder
el1 = conv_layer(X, weights['enc_f1'], biases['enc_b1'], stride_1, PADDING_FLAG)
el2 = conv_layer(el1, weights['enc_f2'], biases['enc_b2'], stride_2, PADDING_FLAG)
el3 = conv_layer(el2, weights['enc_f3'], biases['enc_b3'], stride_3, PADDING_FLAG)
el4 = conv_layer(el3, weights['enc_f4'], biases['enc_b4'], stride_4, PADDING_FLAG)


l4_shape = el4.shape
#print("l4_shape", l4_shape)

flatten_size = l4_shape[1] * l4_shape[2] * l4_shape[3]
n_features = flatten_size

# feature vectors is the flattened output of the encoder
feature_vectors = torch.reshape(el4, shape=[-1, flatten_size])

# initialize the prototype feature vectors
prototype_feature_vectors = nn.Parameter(torch.empty(size=
                                        [n_prototypes, n_features],
                                        dtype=torch.float32).uniform_())

#print(prototype_feature_vectors.shape)

deconv_batch_size = torch.eye(feature_vectors.shape[0])

# this is necessary for prototype images evaluation
reshape_feature_vectors = torch.reshape(feature_vectors, shape=[-1, l4_shape[1],
   l4_shape[2], l4_shape[3]])

### Decoder

In [10]:
dl4 = deconv_layer(reshape_feature_vectors, weights['dec_f4'], biases['dec_b4'],
                   strides=stride_4, padding=PADDING_FLAG)
dl3 = deconv_layer(dl4, weights['dec_f3'], biases['dec_b3'],
                   strides=stride_3, padding=PADDING_FLAG)
dl2 = deconv_layer(dl3, weights['dec_f2'], biases['dec_b2'],
                   strides=stride_2, padding=PADDING_FLAG)
dl1 = deconv_layer(dl2, weights['dec_f1'], biases['dec_b1'],
                   strides=stride_1, padding=PADDING_FLAG,
                   nonlinearity=nn.Sigmoid())

In [11]:
'''
X_decoded is the decoding of the encoded feature vectors in X;
we reshape it to match the shape of the training input
X_true is the correct output for the autoencoder
'''
print(dl1.shape)

X_decoded = torch.reshape(dl1, shape=[-1, input_size])
X_true = torch.eye(X)


torch.Size([250, 1, 17, 17])


RuntimeError: shape '[-1, 784]' is invalid for input of size 72250

## Prototype distances

In [12]:
'''
prototype_distances is the list of distances from each x_i to every prototype
in the latent space
feature_vector_distances is the list of distances from each prototype to every x_i
in the latent space
'''
prototype_distances = list_of_distances(feature_vectors,
                                        prototype_feature_vectors)
prototype_distances = torch.eye(prototype_distances)
feature_vector_distances = list_of_distances(prototype_feature_vectors,
                                             feature_vectors)
feature_vector_distances = torch.eye(feature_vector_distances)

# the logits are the weighted sum of distances from prototype_distances
logits = torch.mm(prototype_distances, last_layer['w'])
probability_distribution = F.softmax(logits)

NameError: name 'tf' is not defined