### Import the necessary libraries and packages.
Tensorflow is used to extract and read the provided MNIST data only.

In [29]:
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
import matplotlib.pyplot as plt

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


### Define variables

In [30]:
learning_rate  = 0.0001
iteration      = 10000
batch          = 300
np.random.seed(2)

### Helper functions

In [31]:
def relu(x):
    '''
    x      : a batch sized array
    return : an identical array except that entries below 0 are replaced with 0
    '''
    return np.maximum(x, 0) 

In [32]:
def cross_entropy(y_hat, y, epsilon=1e-12):
    '''
    y_hat   : network's prediction (a batch sized array)
    y       : label (a batch sized array)
    epsilon : a small value added to avoid the possibility of having log(0) in computation
    return  : cost calculated using cross entropy formula
    '''    
    y_hat = np.clip(y_hat, epsilon, 1. - epsilon) #min value is epsilon and max value is (1 - epsilon)
    loss  = -np.sum(y*np.log(y_hat))/batch #cross entropy formula
    return loss

In [33]:
def softmax(z):
    '''
    input  : network's output layer result (a batch sized array)
    result : input transformed by the softmax function
    '''
    numerator   = np.exp(z)
    denominator = np.sum(np.exp(z), axis=1, keepdims=True)
    result      = numerator/denominator
    return result

### Define the weight variables

In [34]:
W1 = np.random.randn(785,300)
W2 = np.random.randn(301,10)

### Training session

In [35]:
loss = 0
for iteration in range(iteration):
    
    inp_batch = mnist.train.next_batch(batch) #(batch) num of training data and labels
    
    x = inp_batch[0] # training data
    y = inp_batch[1] # labels
    
    bias_neurons = np.ones(batch).reshape(batch, 1) # generate an array of ones
    x = np.hstack([x, bias_neurons]) #add the array of ones at the end of each training data
    
    z_1 = x.dot(W1) #hidden layer transform
    a_1 = relu(z_1) #hidden layer activation func
    
    a_1 = np.hstack([a_1, bias_neurons]) #add bias neurons
    
    z_2 = a_1.dot(W2) #output layer transformation
    
    y_hat = softmax(z_2) #softmax function
    
    cost = cross_entropy(y_hat, y) #calculate the total loss
    
    #Calculate the gradients (backpropagation)
    
    loss_wrt_output = (y_hat - y) #loss-softmax derivative
    w2_gradients    = np.dot(a_1.T, loss_wrt_output) #gradient of W2
    
    loss_wrt_z1     = np.dot(loss_wrt_output, W2.T)[a_1 <=0 ] = 1 #relu derivative where values <= 0 = 0  
    w1_gradients    = np.dot(x.T, loss_wrt_z1) #gradient of W1
    
    
    #gradient descent
    W1 -= learning_rate * w1_gradients
    W2 -= learning_rate * w2_gradients
    
    loss += cost
    
    if iteration%100 == 0 and iteration != 0:
        print("Iteration %d, loss %g" %(iteration, loss/100))
        loss = 0
    

Iteration 100, loss 13.2633
Iteration 200, loss 7.00887
Iteration 300, loss 5.57086
Iteration 400, loss 4.79792
Iteration 500, loss 4.36248
Iteration 600, loss 4.06921
Iteration 700, loss 3.79141
Iteration 800, loss 3.69227
Iteration 900, loss 3.36564
Iteration 1000, loss 3.33385
Iteration 1100, loss 3.08606
Iteration 1200, loss 3.02449
Iteration 1300, loss 2.94878
Iteration 1400, loss 2.88589
Iteration 1500, loss 2.77774
Iteration 1600, loss 2.66378
Iteration 1700, loss 2.66922
Iteration 1800, loss 2.52151
Iteration 1900, loss 2.47711
Iteration 2000, loss 2.45695
Iteration 2100, loss 2.33735
Iteration 2200, loss 2.29216
Iteration 2300, loss 2.2311
Iteration 2400, loss 2.21563
Iteration 2500, loss 2.13202
Iteration 2600, loss 2.076
Iteration 2700, loss 1.98545
Iteration 2800, loss 1.98158
Iteration 2900, loss 1.92334
Iteration 3000, loss 1.80341
Iteration 3100, loss 1.84611
Iteration 3200, loss 1.77371
Iteration 3300, loss 1.70392
Iteration 3400, loss 1.63931
Iteration 3500, loss 1.647