In [None]:
from __future__ import print_function
import os 
import numpy as np
import tensorflow as tf
import pandas as pd  
import matplotlib.pyplot as plt  
import sys
# You are free to use eager execution in TensorFlow.
# Instead of wrapping everything in `with tf.Session() as sess:`, you can call sess.run() wherever it is needed.

# Notebook-wide matplotlib defaults: figure size, image interpolation and colormap.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

In [None]:
def softmax_loss(X, y):
    """Softmax cross-entropy loss and its gradient with respect to the scores.

    The whole computation is expressed as TensorFlow ops; nothing is
    evaluated here, so the function can be called outside a session and the
    results are evaluated later with ``sess.run`` / ``.eval()``.

    Args:
        X: 2-D score (logit) matrix, one row per example and one column per
            class. Its static shape must be fully known.
        y: class label of every row of ``X``; values are cast to int32.
            Labels outside ``[0, num_classes)`` yield an all-zero one-hot row
            rather than an error.

    Returns:
        Tuple ``(loss, probs, dX)`` of tensors: the mean cross-entropy
        (scalar), the row-wise softmax probabilities, and the gradient of
        ``loss`` with respect to ``X`` (same shape as ``X``).
    """
    X = tf.convert_to_tensor(X)
    N = int(X.shape[0])
    num_classes = int(X.shape[1])

    # Forward pass: shift each row by its max so exp() cannot overflow.
    shifted = X - tf.reduce_max(X, axis=1, keepdims=True)
    exp_vals = tf.exp(shifted)
    sum_exp = tf.reduce_sum(exp_vals, axis=1, keepdims=True)
    probs = exp_vals / sum_exp
    # log-softmax computed directly; avoids log(0) when a probability underflows.
    log_probs = shifted - tf.log(sum_exp)

    one_hot = tf.one_hot(tf.cast(y, tf.int32), depth=num_classes, dtype=probs.dtype)
    loss = -tf.reduce_mean(tf.reduce_sum(one_hot * log_probs, axis=1))

    # Backward pass: d(loss)/d(scores) = (softmax - one_hot) / N.
    dX = (probs - one_hot) / N
    return loss, probs, dX

In [None]:
def computeCost(X,y,theta,reg):
    """Regularized loss of the three-layer ReLU network.

    Args:
        X: input matrix, one example per row.
        y: class labels, forwarded unchanged to ``softmax_loss``.
        theta: indexable holding ``W1, b1, W2, b2, W3, b3`` at positions 0-5.
        reg: L2 regularization strength applied to the three weight matrices.

    Returns:
        The data loss from ``softmax_loss`` plus ``0.5 * reg * sum(W**2)``
        for each of W1, W2 and W3.
    """
    W1, b1, W2, b2, W3, b3 = (theta[k] for k in range(6))

    # fc1 -> ReLU1
    pre_act1 = tf.matmul(X, W1) + b1
    act1 = tf.maximum(tf.cast(0, tf.float64), tf.cast(pre_act1, tf.float64))
    # fc2 -> ReLU2
    pre_act2 = tf.matmul(act1, W2) + b2
    act2 = tf.maximum(tf.cast(0, tf.float64), tf.cast(pre_act2, tf.float64))
    # fc3 produces the class scores fed to the softmax loss
    class_scores = tf.matmul(act2, W3) + b3

    data_loss = softmax_loss(class_scores, y)[0]

    half_reg = 0.5 * reg
    penalty_w1 = half_reg * tf.reduce_sum(W1**2)
    penalty_w2 = half_reg * tf.reduce_sum(W2**2)
    penalty_w3 = half_reg * tf.reduce_sum(W3**2)
    reg_loss = penalty_w1 + penalty_w2 + penalty_w3

    return data_loss + reg_loss

In [None]:
def computeGrad(X,y,theta,reg): # returns nabla
    """Back-propagate the regularized softmax loss through the network.

    The ReLU masks are built with ``tf.where`` so the whole backward pass
    stays symbolic; no tensor is evaluated while the graph is constructed.

    Args:
        X: input matrix, one example per row.
        y: class labels, forwarded unchanged to ``softmax_loss``.
        theta: indexable holding ``W1, b1, W2, b2, W3, b3`` at positions 0-5.
        reg: L2 regularization strength; contributes ``reg * W`` to each
            weight gradient.

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)`` of gradient tensors, in the
        same order as the parameters in ``theta``.
    """
    W1, b1, W2, b2, W3, b3 = theta[0], theta[1], theta[2], theta[3], theta[4], theta[5]

    # ---- forward pass ----
    # fc1
    z1 = tf.matmul(X,W1) + b1
    # ReLU1
    h1 = tf.maximum(tf.cast(0,tf.float64), tf.cast(z1,tf.float64))
    # fc2
    z2 = tf.matmul(h1,W2) + b2
    # ReLU2
    h2 = tf.maximum(tf.cast(0,tf.float64), tf.cast(z2,tf.float64))
    # fc3
    f = tf.matmul(h2,W3) + b3
    # softmax: only the gradient w.r.t. the scores is needed here
    _, _, df = softmax_loss(f, y)

    # ---- backward pass ----
    # ReLU passes the upstream gradient only where its input was positive.
    dh2 = tf.matmul(df, tf.transpose(W3))
    dz2 = tf.where(z2 > 0, dh2, tf.zeros_like(dh2))
    dh1 = tf.matmul(dz2, tf.transpose(W2))
    dz1 = tf.where(z1 > 0, dh1, tf.zeros_like(dh1))

    # Weight gradients include the derivative of the 0.5 * reg * sum(W**2) penalty.
    dW3 = tf.matmul(tf.transpose(h2), df) + reg * W3
    db3 = tf.reduce_sum(df, axis=0)
    dW2 = tf.matmul(tf.transpose(h1), dz2) + reg * W2
    db2 = tf.reduce_sum(dz2, axis=0)
    dW1 = tf.matmul(tf.transpose(X), dz1) + reg * W1
    db1 = tf.reduce_sum(dz1, axis=0)

    return (dW1,db1,dW2,db2,dW3,db3)

In [None]:
def predict(X,theta):
    """Run the network forward and return class scores and probabilities.

    Args:
        X: input matrix, one example per row.
        theta: indexable holding ``W1, b1, W2, b2, W3, b3`` at positions 0-5.

    Returns:
        Tuple ``(scores, probs)`` of tensors with one row per example:
        ``scores`` are the outputs of the last fully connected layer and
        ``probs`` is the softmax of each row of ``scores`` (each row sums to 1).
    """
    W1, b1, W2, b2, W3, b3 = theta[0], theta[1], theta[2], theta[3], theta[4], theta[5]
    z1 = tf.matmul(X,W1) + b1        # FC1
    h1 = tf.maximum(tf.cast(0,tf.float64),tf.cast(z1,tf.float64))     # ReLU1
    z2 = tf.matmul(h1,W2) + b2       # FC2
    h2 = tf.maximum(tf.cast(0,tf.float64),tf.cast(z2, tf.float64))     # ReLU2
    scores = tf.matmul(h2,W3) + b3   # FC3

    # Row-wise softmax. The per-row max is subtracted for numerical stability
    # and the normalization runs over the class axis only, so every example
    # gets its own probability distribution.
    shifted = scores - tf.reduce_max(scores, axis=1, keepdims=True)
    exp_scores = tf.exp(shifted)
    probs = exp_scores / tf.reduce_sum(exp_scores, axis=1, keepdims=True)

    return scores,probs

In [None]:
def create_mini_batch(X, y, start, end):
    """Cut one mini-batch out of the data.

    Takes rows ``start`` (inclusive) to ``end`` (exclusive) of the 2-D array
    ``X`` together with the matching entries of ``y``. Standard slice
    semantics apply, so an ``end`` past the last row just yields a shorter
    batch.

    Returns:
        Tuple ``(mb_x, mb_y)``.
    """
    batch_rows = slice(start, end)
    return (X[batch_rows, :], y[batch_rows])

In [None]:
def shuffle(X,y):
    """Return copies of ``X`` and ``y`` reordered by one shared random permutation.

    Row ``i`` of the returned ``X_rand`` still corresponds to entry ``i`` of
    the returned ``y_rand``. The inputs are not modified. Randomness comes
    from NumPy's global RNG, so ``np.random.seed`` makes the order repeatable.

    Args:
        X: array indexed by example along its first axis.
        y: array with the same first-axis length as ``X``.

    Returns:
        Tuple ``(X_rand, y_rand)`` with the same shapes as the inputs.
    """
    # np.random.shuffle works in place and returns None; permutation returns
    # the shuffled index array directly.
    ii = np.random.permutation(X.shape[0])
    X_rand = X[ii]
    y_rand = y[ii]
    return (X_rand,y_rand)

In [None]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(0)
# Load the training data from disk: a header-less CSV named iris_train.dat,
# resolved against the current working directory.
path = os.getcwd() + '/iris_train.dat'  
data = pd.read_csv(path, header=None)

In [None]:
# Split the frame column-wise: every column but the last is a feature (X),
# the last column is the target variable (y, kept as a one-column frame).
cols = data.shape[1]
X = data.iloc[:, :cols - 1]
y = data.iloc[:, cols - 1:]

In [None]:
# Replace the pandas objects with NumPy arrays; the labels become a flat 1-D vector.
X = np.array(X.values)
y = np.array(y.values).flatten()
# float64 TensorFlow constants holding the same training features and labels.
X_tf, Y_tf = (tf.constant(array, dtype=tf.float64) for array in (X, y))

In [None]:

# Load the validation set (iris_test.dat, header-less CSV in the working directory).
# Note: this rebinds `path`, `data` and `cols`, which until now described the training file.
path = os.getcwd() + '/iris_test.dat'
data = pd.read_csv(path, header=None)
cols = data.shape[1]

# Same layout as the training file: the last column is the label, the rest are features.
X_v = np.array(data.iloc[:, :cols - 1].values)
y_v = np.array(data.iloc[:, cols - 1:].values).flatten()

# TensorFlow constants for the validation arrays (dtype inferred from the arrays).
X_V_tf = tf.constant(X_v)
Y_V_tf = tf.constant(y_v)

In [None]:
# Network dimensions: D input features and K output classes
# (K is one more than the largest label value in y).
D = X.shape[1]
K = np.amax(y) + 1

h = 100   # size of the first hidden layer
h2 = 100  # size of the second hidden layer

# Weights are drawn from a normal distribution with mean 0 and stddev 0.01;
# biases are drawn with tf.random_normal using its default mean and stddev.
initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01, seed=None, dtype=tf.float64)

# Create (weight, bias) pairs layer by layer: D -> h -> h2 -> K.
_layer_shapes = ((D, h), (h, h2), (h2, K))
_params = []
for _n_in, _n_out in _layer_shapes:
    _params.append(tf.Variable(initializer([_n_in, _n_out]), dtype=tf.float64))
    _params.append(tf.Variable(tf.random_normal([_n_out], dtype=tf.float64), dtype=tf.float64))

W, b, W2, b2, W3, b3 = _params
theta = (W, b, W2, b2, W3, b3)



In [None]:
# Hyperparameters and bookkeeping used by the training loop.
# n_e: number of epochs (passes of the outer training loop).
n_e = 100
# n_b: mini-batch size, i.e. how many rows each gradient step uses.
n_b = 10
# step_size: factor each gradient is multiplied by before being subtracted
# from its parameter.
step_size = 0.001 # alternative value noted by the author: 1e-0
# reg: L2 regularization strength (alternative values noted by the author: 0, 1e-3).
reg = 0.001
# check: presumably the intended reporting interval, in epochs, for printing losses.
check =10
# Loss histories, filled in by the training loop and plotted afterwards.
train_cost = []
valid_cost = []

In [None]:
# gradient descent loop
num_examples = X.shape[0]

Y_tf = tf.reshape(Y_tf, [num_examples, 1])
init = tf.global_variables_initializer()

# Build the SGD update ops once. Gradients are fed in as NumPy values and
# subtracted from the variables in place, so the parameters stay tf.Variables
# and `theta` always refers to the current weights.
grad_feeds = [tf.placeholder(tf.float64, shape=param.shape) for param in theta]
sgd_updates = [tf.assign_sub(param, step_size * grad_feed)
               for param, grad_feed in zip(theta, grad_feeds)]

# Start from empty histories so re-running this cell does not mix in the
# losses of a previous training run.
del train_cost[:]
del valid_cost[:]

with tf.Session() as sess:

    sess.run(init)
    for i in range(n_e):
        # Record full-batch training and validation loss at the start of every
        # epoch (one session run for both), and print every `check` epochs.
        train_loss = computeCost(X, y, theta, reg)
        valid_loss = computeCost(X_v, y_v, theta, reg)
        train_loss_val, valid_loss_val = sess.run([train_loss, valid_loss])
        train_cost.append(train_loss_val)
        valid_cost.append(valid_loss_val)
        if i % check == 0:
            print("iteration %d: training loss = %.2f, validation loss = %.2f" % (i, train_loss_val, valid_loss_val))

        # One full pass over the training data in mini-batches of n_b rows;
        # the last batch is shorter when n_b does not divide num_examples.
        for batch_start in range(0, num_examples, n_b):
            X_mb, y_mb = create_mini_batch(X, y, batch_start, batch_start + n_b)

            # Evaluate the six gradients for this batch, then apply
            # param <- param - step_size * grad to every parameter.
            grad_values = sess.run(computeGrad(X_mb, y_mb, theta, reg))
            sess.run(sgd_updates, feed_dict=dict(zip(grad_feeds, grad_values)))

    print(' > Training loop completed!')

    # Accuracy = share of examples whose highest-scoring class equals the label.
    scores, probs = predict(X,theta)
    predicted_class = sess.run(tf.argmax(scores, axis=1))
    print ('training accuracy: %.2f%%' % (100*np.mean(predicted_class == y)))


    scores, probs = predict(X_v,theta)
    predicted_class = sess.run(tf.argmax(scores, axis=1))
    print ('validation accuracy: %.2f%%' % (100*np.mean(predicted_class == y_v)))

    # NOTE: write your plot generation code here (for example, using the "train_cost" and "valid_cost" list variables)

###########################################################

# Plot the recorded training and validation loss against the epoch index.
# The x-axis follows the number of recorded losses, so the lengths always match.
epochs = range(len(train_cost))
plt.plot(epochs, train_cost, epochs, valid_cost)
plt.legend(['training loss', 'validation loss'])
plt.ylabel('loss')
plt.xlabel('No. of epochs')

# savefig does not create missing directories, so make sure the target exists.
out_dir = os.getcwd() + '/DLAssign2Problem2b'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
plt.savefig(out_dir + '/Loss vs Epoch')

plt.show()


############################################################
