In [1]:
import os 
import numpy as np
import tensorflow as tf
import pandas as pd  
import matplotlib.pyplot as plt  
import sys
#You have freedom of using eager execution in tensorflow
#Instead of using With tf.Session() as sess you can use sess.run() whenever needed

plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'


In [None]:
'''
Submitted by - Pakhi Agarwal
Problem 1c: MLPs \& the XOR Problem
@author - Alexander G. Ororbia II and Ankur Mali
'''

In [2]:
def softmax_loss(X, y):
# Forward pass
    N = int(X.shape[0])
    X -= tf.reduce_max(X, axis=1, keepdims=True)
    exp_vals = tf.exp(X)
    probs = exp_vals / tf.reduce_sum(exp_vals, axis=1, keepdims=True)
    loss = -tf.reduce_mean(tf.log(probs))

# Backward pass
    dX = probs.eval()
    
    if isinstance(y, np.ndarray):
        y0 = y
    else:
        y0 = y.eval()
        
    for N_ in range(N):
        dX[N_][y0] -= 1
    dX /= N

    return loss, probs, dX

In [3]:
def computeCost(X,y,theta,reg):
    # WRITEME: write your code here to complete the routine
    #############################################
    W1, b1, W2, b2 = theta[0], theta[1], theta[2], theta[3]
    N = X.shape[0]
    XX = X.eval()
    
    #print type(W1)
    #print type(b1)
    #print type(X)

    z = np.matmul(XX,W1) + b1       # FC1
    h = tf.maximum(tf.cast(0,tf.float64), tf.cast(z,tf.float64))     # ReLU
    hh = h.eval()
    f = np.matmul(hh,W2) + b2       # FC2
    data_loss, _, _ = softmax_loss(f, y) # Softmax
    reg_loss = 0.5 * reg * np.sum(W1**2) + 0.5 * reg * np.sum(W2**2)
    
    loss = data_loss + reg_loss
    
    #############################################
    return loss

In [4]:
def computeNumGrad(X,y,theta,reg): # returns approximate nabla
    # WRITEME: write your code here to complete the routine
    eps = 1e-5
    
    theta_list = [[[1,2], [3,4]], [1,2], [3,4], [1,2]]

    nabla_n = []
    # NOTE: you do not have to use any of the code here in your implementation...
                
    theta_plus_eps = [[[1,2], [3,4]], [1,2], [3,4],[1,2]]
    theta_minus_eps = [[[1,2], [3,4]], [1,2],[3,4],[1,2]]
    
    ########################################
    for i in range(len(theta)):
        
        param = theta[i].eval()

        rep = np.nditer(param, flags=['multi_index'], op_flags=['readwrite'])
        
        param_grad = np.ndarray(param.shape)
        while not rep.finished:
        # parameters at (x+eps) and (x-eps)
            ix = rep.multi_index
            
        # Evaluating function at x+eps i.e f(x+eps)
            for ix_ in ix:
                theta_plus_eps[i][ix_] = param[ix_] + eps
                
            f_x_plus_eps = computeCost(X,y,theta_plus_eps,reg)
        # Reseting theta
            for ix_ in ix:
                theta_list[i][ix_] = param[ix_] - eps        
        # Evaluating function at x i.e f(x-eps)
            for ix_ in ix:
                theta_minus_eps[i][ix_] = param[ix_] - eps
            f_x_minus_eps = computeCost(X,y,theta_minus_eps,reg)
        # Reseting theta
            for ix_ in ix:
                theta_list[i][ix_] = param[ix_] + eps
        # gradient at x
            
            for ix_ in ix:
                param_grad[ix_] = (f_x_plus_eps.eval() - f_x_minus_eps.eval())/(2*eps)
        # Iterating over all dimensions
            rep.iternext()
        nabla_n.append(param_grad)  
        
    ######################################
             
    return tuple(nabla_n)

In [5]:
def computeGrad(X,y,theta,reg): # returns nabla
    
    # WRITEME: write your code here to complete the routine
    
    #######################################
    W1, b1, W2, b2 = theta[0], theta[1], theta[2], theta[3]
    # fc1
    z = tf.matmul(X,W1) + b1         
    # ReLU
    h = tf.maximum(tf.cast(0,tf.float64), tf.cast(z,tf.float64))       
    # fc2
    f = tf.matmul(h,W2) + b2         
    #softmax
    _, _, df = softmax_loss(f, y) 
    dh = tf.matmul(df,tf.transpose(W2))
    dz = tf.convert_to_tensor(dh, copy=True)
    dz[z <= 0] = 0
    # WRITEME: write your code here to complete the routine
    dW2 = tf.matmul(tf.transpose(h), df) + reg * W2
    db2 = tf.reduce_sum(df, axis=0)
    dW1 = tf.matmul(tf.transpose(X), dz) + reg * W1
    db1 = tf.reduce_sum(dz, axis=0)
    
    #######################################
    
    return (dW1,db2,dW2,db2)


In [6]:
def predict(X,theta):
    # WRITEME: write your code here to complete the routine
    #scores = 0.0
    #probs = 0.0
    ################################################
    
    W1, b1, W2, b2 = theta[0], theta[1], theta[2], theta[3]
    # fc1
    z = tf.matmul(X,W1) + b1         
    # ReLU
    h = tf.maximum(tf.cast(0,tf.float64), tf.cast(z,tf.float64))       
    # fc2
    scores = tf.matmul(h,W2) + b2    
    probs = tf.exp(scores - np.max(scores, axis=1, keepdims=True))
    probs /= tf.reduce_sum(probs, axis=1, keepdims=True)
    
    ###############################################
    
    return (scores,probs)

In [7]:
np.random.seed(0)
# Load in the data from disk
path = os.getcwd() + '/xor.dat'  
data = pd.read_csv(path, header=None) 


In [8]:
# set X (training data) and y (target variable)
cols = data.shape[1]  
X = data.iloc[:,0:cols-1]  
y = data.iloc[:,cols-1:cols] 


In [9]:
# convert from data frames to numpy matrices
X = np.array(X.values)  
y = np.array(y.values)
y = y.flatten()

X_tf = tf.constant(X)
Y_tf = tf.constant(y)
# initialize parameters randomly
D = X.shape[1]
K = np.amax(y) + 1


In [10]:
# initialize parameters in such a way to play nicely with the gradient-check! 
h = 6 #100 # size of hidden layer

initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01, seed=None, dtype=tf.float64)
W = tf.Variable(initializer([D, h]),dtype=tf.float64)
b = tf.Variable(tf.random_normal([h],dtype=tf.float64),dtype=tf.float64)
W2 = tf.Variable(initializer([h, K]),dtype=tf.float64)
b2 = tf.Variable(tf.random_normal([K],dtype=tf.float64),dtype=tf.float64)
theta = (W,b,W2,b2) 

In [12]:
# some hyperparameters

reg = 1e-3 # regularization strength
initialized = False
with tf.Session() as sess:
    
    if not initialized:
        tf.global_variables_initializer().run()
        initialized = True
           
    nabla_n = computeNumGrad(X_tf,Y_tf,theta,reg)
    nabla = computeGrad(X_tf,Y_tf,theta,reg)
    nabla_n = list(nabla_n)
    nabla = list(nabla)

    for jj in range(0,len(nabla)):
        is_incorrect = 0 # set to false
        grad = nabla[jj]
        grad_n = nabla_n[jj]
        grad_sub = tf.subtract(grad_n,grad)
        grad_add = tf.add(grad_n,grad)
        err = tf.div(tf.norm(grad_sub) , (tf.norm(grad_add)))
        if(err > 1e-8):
            print("Param {0} is WRONG, error = {1}".format(jj, sess.run(err)))
        else:
            print("Param {0} is CORRECT, error = {1}".format(jj, sess.run(err)))

<type 'list'>
<type 'list'>
<class 'tensorflow.python.framework.ops.Tensor'>


ValueError: setting an array element with a sequence.

In [None]:
# re-init parameters
h = 6 #100 # size of hidden layer

initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01, seed=None, dtype=tf.float64)
W = tf.Variable(initializer([D, h]),dtype=tf.float64)
b = tf.Variable(tf.random_normal([h],dtype=tf.float64),dtype=tf.float64)
W2 = tf.Variable(initializer([h, K]),dtype=tf.float64)
b2 = tf.Variable(tf.random_normal([K],dtype=tf.float64),dtype=tf.float64)
theta = (W,b,W2,b2)

In [None]:
# some hyperparameters
n_e = 100
check = 10 # every so many pass/epochs, print loss/error to terminal
step_size = 1.0
reg = 0.001 # regularization strength
loss_vs_epochs = []

In [None]:
# gradient descent loop
with tf.Session as sess:
    for i in xrange(n_e):
        # WRITEME: write your code here to perform a step of gradient descent & record anything else desired for later
        #####################################
        
        theta = (W1, b1, W2, b2)
        loss = computeCost(X,y,theta,reg)
        loss_vs_epochs.append(loss)
        
        #####################################
        #loss = 0.0
        if i % check == 0:
            print "iteration %d: loss %f" % (i, loss)

        # perform a parameter update
        # WRITEME: write your update rule(s) here
        
        ######################################################
        dW1, db1, dW2, db2 = computeGrad(X,y,theta,reg)
        W1 = W1 - step_size * dW1
        b1 = b1 - step_size * db1
        W2 = W2 - step_size * dW2
        b2 = b2 - step_size * db2
        ######################################################
 
# TODO: remove this line below once you have correctly implemented/gradient-checked your various sub-routines
#sys.exit(0) 

    scores, probs = predict(X,theta)
    predicted_class = sess.run(tf.argmax(scores, axis=1))
    print 'training accuracy: %.2f' % sess.run((tf.reduce_mean(predicted_class == y)))

plt.plot(loss_vs_epochs)
plt.ylabel('training loss')
plt.xlabel('epochs')
plt.title('loss_vs_epochs')
plt.savefig(os.getcwd() + '/DLAssign2Problem1c/Loss vs Epoch')
plt.show()