## Read MNIST data into NumPy arrays

In [1]:
import h5py
import numpy as np

# Relative path to the MNIST training images, stored in HDF5 format.
MNIST_dir = "MNIST_h5/60000.h5"

In [2]:
def read_h5(filepath):
    '''Reads MNIST training images and labels from the hdf5 file.

       The file is opened read-only and is closed again before returning;
       both datasets are copied into memory as uint8 numpy arrays.

       Parameter
       ---------
       filepath : Path to the .h5 file | string

       Returns
       -------
       (images, labels) : tuple of two uint8 np arrays read from the
                          "/images" and "/meta" datasets respectively
    '''
    # "r" instead of "r+": nothing is written, so write access is not required.
    # The context manager guarantees the handle is released even if a read fails.
    with h5py.File(filepath, "r") as file:
        images = np.array(file["/images"]).astype("uint8") #read the images dataset
        labels = np.array(file["/meta"]).astype("uint8")   #read the labels dataset (stored as meta)

    return (images, labels)

In [3]:
# Load the whole training set (images and their labels) into memory.
(images, labels) = read_h5(MNIST_dir)

In [4]:
def one_hot_encoder(label_arr, num_classes=10):
    '''Converts integer class labels into an np array of one-hot rows.

       Parameter
       ---------
       label_arr   : np array of integer labels, one label per entry along
                     axis 0 (shape (N,) or (N, 1))
       num_classes : number of columns in the result; defaults to 10,
                     the number of classes in MNIST

       Returns
       -------
       float np array of shape (N, num_classes) with 1.0 at the column given
       by each label and 0.0 everywhere else

       Raises
       ------
       IndexError if a label is not a valid column index for num_classes
    '''
    label_arr    = np.asarray(label_arr)
    total_labels = label_arr.shape[0] #get the total number of labels

    #flatten to one index per row; this fails loudly if a row holds more than one label
    class_idx = label_arr.reshape(total_labels).astype(np.intp)

    one_hot_label = np.zeros([total_labels, num_classes])
    #set all the "hot" entries in one vectorised assignment instead of a Python loop
    one_hot_label[np.arange(total_labels), class_idx] = 1.0

    return one_hot_label #returns the np one-hot label
    
    

In [5]:
# NOTE: both names are rebound here, so this cell is not safe to re-run on its own;
# re-run the cell that loads the data first.
labels = one_hot_encoder(labels)  # integer labels -> one-hot rows
images = images.reshape(len(images), 28, 28, 1)  # add an explicit single-channel axis

## Capsule Network

**Parameters**

In [6]:
import tensorflow as tf

# --- optimisation ---
learning_rate = 1e-4  # step size handed to the Adam optimizer
batch_size = 10       # number of examples fed per training step
epoch = 5             # number of passes over the training set

# --- data ---
height = 28           # input image height in pixels
width = 28            # input image width in pixels
num_labels = 10       # number of output classes / digit capsules

# --- capsule layers ---
first_caps_vlength = 8  # length of each primary capsule vector
routing_iteration = 3   # number of routing iterations

# --- margin loss ---
m_plus = 0.9    # margin applied to the target-class capsule length
m_minus = 0.1   # margin applied to the non-target capsule lengths
lambda_ = 0.5   # weight of the non-target term

# --- numerics ---
epsilon = 1e-10  # added under square roots to avoid taking sqrt of exactly zero

In [7]:
def squash(capsule):
    '''Applies the squashing non-linearity to a tensor of capsule vectors.

       The components of each vector are expected on axis -2 (the last axis has size 1),
       e.g. [batch_size, 1152, 8, 1] for the primary capsules or
       [batch_size, 1, 10, 16, 1] inside the routing loop.
       Each vector keeps its direction and is rescaled by |s|^2 / (1 + |s|^2) / |s|,
       where epsilon is added under the square root for numerical safety.

       Parameter
       ---------
       capsule : tensor of capsule vectors with the vector components on axis -2

       Returns
       -------
       tensor of the same shape holding the squashed vectors
    '''
    #squared length of every vector, kept as a size-1 axis so it broadcasts back
    sq_norm = tf.reduce_sum(tf.square(capsule), axis=-2, keepdims=True)
    shrink = sq_norm / (1 + sq_norm)
    length = tf.sqrt(sq_norm + epsilon)
    return (shrink / length) * capsule

In [8]:
def routing(capsule_layer, num_capsules):
    '''Routes the primary capsules to the output capsules by iterative agreement.

       Every input capsule is multiplied by its own weight matrix per output class to
       give a 16-d prediction vector (u_hat). The routing logits b_ij start at zero and
       are refined for `routing_iteration` rounds: the coupling coefficients are a
       softmax of the logits over the output-class axis, the weighted predictions are
       summed over the input capsules, squashed, and the logits are increased by the
       scalar product between each prediction and the resulting output vector.
       Gradients only flow through the predictions used in the final round.

       Parameters
       ----------
       capsule_layer : tensor of input capsules; the caller passes
                       [batch_size, num_capsules, 1, 8, 1]
       num_capsules  : number of input capsules (size of axis 1 of capsule_layer) | int

       Returns
       -------
       v_j : squashed output capsules, [batch_size, 1, num_labels, 16, 1]
    '''
    batch = tf.shape(capsule_layer)[0]

    W = tf.get_variable('Weight', shape=(1, num_capsules, num_labels, 8, 16))
    b = tf.get_variable('Bias', shape=(1, 1, num_labels, 16, 1))
    #tiling just makes a copy of the same weight variable for all the items in the batch. It is still the same weight.
    W = tf.tile(W, [batch, 1, 1, 1, 1])
    #one copy of every input capsule per output class (num_labels rather than a literal 10,
    #so it always agrees with the shapes of W and b_ij)
    x = tf.tile(capsule_layer, [1, 1, num_labels, 1, 1])
    #transpose_a turns each 8x16 matrix into 16x8, so (16x8) @ (8x1) -> (16x1)
    u_hat = tf.matmul(W, x, transpose_a=True) #[batch_size, num_capsules, num_labels, 16, 1]
    u_hat_stopped = tf.stop_gradient(u_hat, name='stopped_gradient')

    b_ij = tf.zeros([batch, num_capsules, num_labels, 1, 1], dtype=tf.float32)

    for r_iter in range(routing_iteration):
        is_last = r_iter == routing_iteration - 1

        c_ij = tf.nn.softmax(b_ij, axis=2)

        #only the final round uses the differentiable predictions
        votes = u_hat if is_last else u_hat_stopped

        #summing over axis 1 lets capsules with a high coefficient contribute more
        #and capsules with a low coefficient contribute less
        s_j = tf.reduce_sum(tf.multiply(c_ij, votes), axis=1, keepdims=True) + b
        v_j = squash(s_j)

        if not is_last:
            #copy along the input-capsule axis in order to find the scalar product
            v_j_tiled = tf.tile(v_j, [1, num_capsules, 1, 1, 1])
            product = u_hat_stopped * v_j_tiled #[batch_size, num_capsules, num_labels, 16, 1]
            #summing the element-wise product over the vector axis (3) gives the scalar product,
            #which is large when a prediction agrees with the output vector
            u_produce_v = tf.reduce_sum(product, axis=3, keepdims=True)

            b_ij += u_produce_v

    return v_j

In [9]:
#Graph inputs: a batch of single-channel images and the matching one-hot labels.
X = tf.placeholder(tf.float32, shape=(None, height, width, 1))
Y = tf.placeholder(tf.float32, shape=(None, num_labels))

#Two VALID 9x9 convolutions: 28x28 -> 20x20 (stride 1) -> 6x6 (stride 2), 256 channels each.
conv1 = tf.contrib.layers.conv2d(X, num_outputs=256, kernel_size=9, stride=1, padding='VALID', activation_fn=tf.nn.relu)
conv2 = tf.contrib.layers.conv2d(conv1, num_outputs=256, kernel_size=9, stride=2, padding='VALID', activation_fn=tf.nn.relu)

#Regroup the 6*6*256 activations of every image into 8-d vectors: 6*6*32 = 1152 capsules.
capsules = tf.reshape(conv2, (tf.shape(conv2)[0], -1, first_caps_vlength, 1))
num_capsules = 6*6*32
primary_caps = squash(capsules)
#Each of these primary capsules is multiplied by a weight matrix. The weight matrix changes each 8-D vector
#into a 16-D vector. Furthermore, the number of capsules is reduced to 10 (one per class) by dynamic routing.
#Before that, an extra axis of size 1 is inserted at index 2 so the capsules can be tiled once per class;
#routing then collapses the 1152 input capsules into a single axis that is squeezed away below.
primary_caps = tf.reshape(primary_caps, shape=(tf.shape(capsules)[0], -1, 1, first_caps_vlength, 1)) #create the extra dimension
#primary_caps.shape = [batch_size, 1152, 1, 8, 1]

digits = routing(primary_caps, num_capsules)
digits = tf.squeeze(digits, axis=1) # [batch_size, 10, 16, 1]

#Length of every output capsule; epsilon keeps the sqrt away from exactly zero.
v_lengths = tf.sqrt(tf.reduce_sum(tf.square(digits), axis=2, keepdims=True) + epsilon) #[batch_size, 10, 1, 1]

#Margin loss terms: max(0, m_plus - |v|)^2 for the target class, max(0, |v| - m_minus)^2 for the others.
max_l = tf.square(tf.maximum(0., m_plus - v_lengths))
max_r = tf.square(tf.maximum(0., v_lengths - m_minus))

#Flatten to [batch, num_labels]. The batch axis is inferred (-1) rather than fixed to batch_size,
#so the loss stays correct for any number of examples fed through the placeholders.
max_l = tf.reshape(max_l, shape=(-1, num_labels))
max_r = tf.reshape(max_r, shape=(-1, num_labels))
T_c = Y

L_c = T_c * max_l + lambda_*(1-T_c)*max_r
margin_loss = tf.reduce_mean(tf.reduce_sum(L_c, axis=1)) #test without reduce mean later
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(margin_loss)


In [10]:
#Train for `epoch` passes over the data and report the mean batch loss after each pass.
sess = tf.InteractiveSession()
try:
    tf.global_variables_initializer().run()

    num_train = images.shape[0] #taken from the data instead of a hard-coded 60000

    for epoch_iter in range(epoch):
        counter = 0
        loss = 0
        #only full batches are fed, so every step sees exactly batch_size examples
        for i in range(0, num_train - batch_size + 1, batch_size):

            batch_loss = sess.run([margin_loss, optimizer], feed_dict={X:images[i:i+batch_size], Y:labels[i:i+batch_size]})[0]
            loss += batch_loss
            counter += 1

        #max(..., 1) avoids a division by zero when there are fewer examples than one batch
        print("The loss at epoch %d is %g"%(epoch_iter, loss/max(counter, 1)))
finally:
    #release the session even when training is interrupted (e.g. KeyboardInterrupt)
    sess.close()

The loss at epoch 0 is 0.0272354
The loss at epoch 1 is 0.00973266


KeyboardInterrupt: 