In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import time
import os
import math
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


Initialization and abstraction of data

In [2]:
def loadData():
    with np.load("notMNIST.npz") as data:
        Data, Target = data["images"], data["labels"]
        np.random.seed(521)
        randIndx = np.arange(len(Data))
        np.random.shuffle(randIndx)
        Data = Data[randIndx] / 255.0
        Target = Target[randIndx]
        trainData, trainTarget = Data[:10000], Target[:10000]
        validData, validTarget = Data[10000:16000], Target[10000:16000]
        testData, testTarget = Data[16000:], Target[16000:]
    return trainData, validData, testData, trainTarget, validTarget, testTarget


In [3]:
def convertOneHot(trainTarget, validTarget, testTarget):
    newtrain = np.zeros((trainTarget.shape[0], 10))
    newvalid = np.zeros((validTarget.shape[0], 10))
    newtest = np.zeros((testTarget.shape[0], 10))

    for item in range(0, trainTarget.shape[0]):
        newtrain[item][trainTarget[item]] = 1
    for item in range(0, validTarget.shape[0]):
        newvalid[item][validTarget[item]] = 1
    for item in range(0, testTarget.shape[0]):
        newtest[item][testTarget[item]] = 1
    return newtrain, newvalid, newtest


In [4]:
def shuffle(trainData, trainTarget):
    np.random.seed(421)
    randIndx = np.arange(len(trainData))
    target = trainTarget
    np.random.shuffle(randIndx)
    data, target = trainData[randIndx], target[randIndx]
    return data, target



In [5]:
trainData, validData, testData, trainTarget, validTarget, testTarget= loadData()
train_y, valid_y, test_y = convertOneHot( trainTarget, validTarget, testTarget)
#Train Data 
train_x = trainData.reshape(10000,784)
train_X = train_x.T
#X0 = np.ones((3500,1))
#train_X = np.append(X0,train_X,axis=1)

#Test Data
global test_X
test_X = testData.reshape(2724,784)


#Validation Data
global validation_X
validation_X = validData.reshape(6000,784)
print(train_X.shape)

(784, 10000)


Initialization of weights and biases

In [6]:
#weights and bias from input to hidden layer ---> W, b
mu = 0
sigma =1 /  math.sqrt(784 + 1000)
W = np.random.normal(mu, sigma, [784,1000])
bW = np.zeros((1,1000))


#weights and bias from hidden to output layer ---> V,bV
mu = 0
sigma =1/ math.sqrt(1000 + 10)
V = np.random.normal(mu, sigma, [1000,10])
bV = np.zeros((1,10))
#print(V.shape)
#print(bV.shape)

Helper Functions

In [7]:
def relu(x) : 
    return np.maximum(x,0)

In [8]:
def softmax(z) :
    b = np.amax(z,axis=0)
    e_pow = np.exp(z-b)
    summation = np.sum(e_pow,0)
    out = np.divide(e_pow,summation)
    return out

In [9]:
def computeLayer(X,W,b) :
    return np.dot(W.T,X) + b.T

In [10]:
### need to check inf * 0 problem
def CE(target, prediction):
    N = target.shape[0]
    #check assignment anouncement
    ### bug ###
    #prediction[prediction == 0] = 0.0000001
    log_term = np.log(prediction)
    CE_out = np.multiply(target,log_term)
    CE_out = (np.sum(CE_out)) / N
    return -1*CE_out

In [None]:
def gradCE(target, prediction):

    # TODO


Forward Propagation 

In [28]:
#forward propagation

def forward_pass(train_X,W,bW,V,bV) :
    S1 = computeLayer(train_X,W,bW)
    #print("S1 >> ",S1.shape)
    #print(S1)
    Y = relu(S1)
    #print(Y)
    #print("Y >> ",Y.shape)
    S2 = computeLayer(Y,V,bV)
    #print(S2)
    #print("S2 >> ",S2.shape)
    Z = softmax(S2)
    #print(Z)
    #print("Z >> ",Z.shape)

    #test = np.sum(Z,0)
    #for i in range(10000) :
    #    if(test[i] != 1) :
    #        print(test[i])

    Z_1 = Z.T

    loss_out = CE(train_y, Z_1)
    return loss_out,S1,Y,S2,Z,Z_1


All Gradients

In [53]:
#### Gradient wrt outer layer weights 
# dL/dV(i,j) = [ dL/ds2(j) ] * [ ds2(j)/dV(i,j) ]
# tot_Wo        = term_1 * term_2
# dL/ds2(j) = [ dL/dZ(j) ] * [ dZ(j)/dS2(j) ]
# term_1     = term_1_1 * term_1_2
def grad_wo(Y,Z_1,train_y,S2):
    term_2 = np.copy(Y)

    #print("term_2 >> ",term_2.shape)
    #out = (train_y / Z_1)
    term_1_1 = -1*(1/10000)*np.sum((train_y / Z_1),0)
    #print("term_1_1 >> ",term_1_1)
    
    b = np.amax(S2,axis=0)
    e_pow = np.exp(S2-b)
    summation = np.sum(e_pow,0)
    term_1_2 =  np.divide(e_pow,summation)
    #print("term_1_2 >> ",term_1_2.shape)
    
    ###########################
    ###### problem with o1
    ###### value large
    ###### multiplying by 1/10000 thats wrong
    ###### applies for all the gradient calculations
    ###########################
    o1 = np.matmul(term_1_2,term_2.T)
    #print("o1 >> ",o1)
    term_1_1_1 = term_1_1.reshape(10,1)
    #print("term_1_1_1 >> ",term_1_1_1)
    tot_Wo = np.multiply(o1.T,term_1_1_1.T)
    return (1/10000)*tot_Wo
    #print(tot_Wo)

In [47]:

#### Gradient wrt outer layer bias 
# dL/dV(i,j) = [ dL/ds2(j) ] * [ ds2(j)/dVb(i,j) ]
# tot_Wo        = term_1 * 1
# dL/ds2(j) = [ dL/dZ(j) ] * [ dZ(j)/dS2(j) ]
# term_1     = term_1_1 * term_1_2
def grad_bo(Z_1,S2,train_y):

    term_2 = np.ones((1,10000))


    out = (train_y / Z_1)
    term_1_1 = -1*(1/10000)*np.sum((train_y / Z_1),0)

    b = np.amax(S2,axis=0)
    e_pow = np.exp(S2-b)
    summation = np.sum(e_pow,0)
    term_1_2 =  np.divide(e_pow,summation)


    o1 = np.matmul(term_1_2,term_2.T)

    term_1_1_1 = term_1_1.reshape(10,1)
    tot_bo = np.multiply(o1.T,term_1_1_1.T)
    return (1/10000)*tot_bo

In [46]:
#### Gradient wrt hidden layer weight 
# dL/dW(i,j) = [ dL/ds1(j) ] * [ ds1(j)/dW(i,j) ]
# tot_Wh        = term_1 * term_2
def grad_wh(train_X,Y,tot_Wo,V):

    term_2 = np.copy(train_X)
    # dL/ds1(j) = [ dL/dY(j) ] * [ dY(j)/dS1(j) ]
    # term_1     = term_1_1 * term_1_2


    # dY(j)/dS1(j) = differentiation of relu 
    term_1_2 = np.copy(Y)
    term_1_2[term_1_2 > 0] = 1
    # dL/dY(j) = sumation([dL/dS2] * [dS2/dY])
    # term_1_1 = term_1_1_1 * term_1_1_2

    term_1_1_1 = np.copy(tot_Wo)
    term_1_1_2 = np.copy(V)
    ########
    ##check (1/1000)
    ########
    term_1_1 = np.sum(term_1_1_1*term_1_1_2,1)
    o1 = np.matmul(term_2,term_1_2.T)
    term_1_1_up = term_1_1.reshape(1000,1)
    tot_Wh = np.multiply(term_1_1_up.T,o1)
    return (1/10000)*tot_Wh

In [45]:
#### Gradient wrt hidden layer bias 
# dL/dW(i,j) = [ dL/ds1(j) ] * [ ds1(j)/db(i,j) ]
# tot_bh        = term_1 * term_2
def grad_bh(Y,tot_Wo,V):

    term_2 = np.ones((1,10000))
    # dL/ds1(j) = [ dL/dY(j) ] * [ dY(j)/dS1(j) ]
    # term_1     = term_1_1 * term_1_2

    # dY(j)/dS1(j) = differentiation of relu 
    term_1_2 = np.copy(Y)
    term_1_2[term_1_2 > 0] = 1
    # dL/dY(j) = sumation([dL/dS2] * [dS2/dY])
    # term_1_1 = term_1_1_1 * term_1_1_2

    term_1_1_1 = np.copy(tot_Wo)
    term_1_1_2 = np.copy(V)

    term_1_1 = np.sum(term_1_1_1*term_1_1_2,1)

    o1 = np.matmul(term_2,term_1_2.T)
    term_1_1_up = term_1_1.reshape(1000,1)
    tot_bh = np.multiply(term_1_1_up.T,o1)
    return (1/10000)*tot_bh

In [52]:
mV = np.full((1000,10),0.00001)
mW = np.full((784,1000),0.00001)
mbV = np.full((1,10),0.00001)
mb = np.full((1,1000),0.00001)
gamma = 0.9
alpha = 0.01


mu = 0
sigma =1 /  math.sqrt(784 + 1000)
W = np.random.normal(mu, sigma, [784,1000])
bW = np.zeros((1,1000))


#weights and bias from hidden to output layer ---> V,bV
mu = 0
sigma =1/ math.sqrt(1000 + 10)
V = np.random.normal(mu, sigma, [1000,10])
bV = np.zeros((1,10))

for i in range(2) :
    loss_out,S1,Y,S2,Z,Z_1 = forward_pass(train_X,W,bW,V,bV)
    print(loss_out)
    tot_Wo=grad_wo(Y,Z_1,train_y,S2)
    #print(tot_Wo)
    #print("-------")
    mV = gamma*mV + alpha*tot_Wo
    #print(V)
    #print("-------")
    V = V - mV
    #print(V)
    tot_bo=grad_bo(Z_1,S2,train_y)
    mbV = gamma*mbV + alpha*tot_bo
    bV = bV - mbV
    
    tot_Wh=grad_wh(train_X,Y,tot_Wo,V)
    mW = gamma*mW + alpha*tot_Wh
    W = W - mW
    tot_bh=grad_bh(Y,tot_Wo,V)
    mb = gamma*mb + alpha*tot_bh
    bW = bW - mb
    

2.337607699149269
2.3366547891555034
