In [1]:
from tools.numerical_gradient import *
from models.layers import *
from models.networks.vanilla_rnn import *
import numpy as np

# Let's try out some simple functions for numerical_gradient. #

We know the linear equation y = 3x should always return 3. Let's check it:

In [3]:
def linear(x, slope=3):
    return slope*x

slope = numerical_gradient_check_scalar(linear, 5)
print slope

2.99999999989


In [3]:
def multi_quadratic(x):
    return x[0]**2 + x[1]
arr = np.array([2,2], float)

slope = numerical_gradient_check_multivar(multi_quadratic, arr)
print slope

[ 3.9999999   0.99999997]


In [4]:
def multi_cubic_field(x):
    return np.array([x[0]**3 + x[1]**2, x[0]*2 + x[1]/12])
arr = np.array([3,4], dtype=np.float32)

def matrix_mult(x, b = np.array([[3,5],[2,1]])):
    return x.dot(b)
                
vector_field = numerical_gradient_check_multivar(multi_cubic_field, arr)
print vector_field

arr = np.array([[3,4],[1,2]], dtype=np.float32)
vector_field = numerical_gradient_check_multivar(matrix_mult, arr)
print vector_field

[ 29.03938293   8.09431076]
[[ 8.01086426  3.0040741 ]
 [ 8.01086426  3.0040741 ]]


In [5]:
def affine_transform(w, x, b):
    return x.dot(w) + b

x = np.array([[1,2]], float) # 1 x 2
w = np.array([[3,2,1],[1,2,5]], float) # 2 x 3
b = np.array([[1,5,7]], float) # 1 x 3

fw = lambda w: affine_transform(w,x,b)
fx = lambda x: affine_transform(w,x,b)
fb = lambda b: affine_transform(w,x,b)

vector_field = numerical_gradient_check_multivar(fw, w)
print "fw : ", vector_field
vector_field = numerical_gradient_check_multivar(fx, x)
print "fx : ", vector_field
vector_field = numerical_gradient_check_multivar(fb, b)
print "fb : ", vector_field

fw :  [[ 0.99999997  0.99999997  0.99999997]
 [ 1.99999995  1.99999995  1.99999995]]
fx :  [[ 5.99999985  7.9999998 ]]
fb :  [[ 0.99999997  0.99999997  0.99999997]]


# word_embedding_forward/backward #

In [6]:
# Looks good to me
ans = np.array([[[3, 4, 7, 1],
                [3, 4, 7, 1],
                [1, 5, 9, 4]],

               [[1, 5, 9, 4],
                [1, 5, 9, 4],
                [1, 5, 9, 4]],

               [[4, 3, 2, 5],
                [3, 4, 7, 1],
                [4, 3, 2, 5]]])

x = np.array([[1,1,0], [0,0,0], [2,1,2]], int)
words = np.array([[1,5,9,4],[3,4,7,1],[4,3,2,5]])
arr = word_embedding_forward(words, x)

assert np.array_equal(ans, arr)
print arr.shape, "\n", arr

(3, 3, 4) 
[[[3 4 7 1]
  [3 4 7 1]
  [1 5 9 4]]

 [[1 5 9 4]
  [1 5 9 4]
  [1 5 9 4]]

 [[4 3 2 5]
  [3 4 7 1]
  [4 3 2 5]]]


In [7]:
dout = np.array([[[1,2,0,1],[3,2,9,1],[1,2,1,1]],
                 [[3,9,2,4],[1,9,9,0],[2,0,1,6]],
                 [[1,0,1,0],[0,1,0,5],[3,0,0,1]]])

arr = word_embedding_backward(dout, words, x)
ans = np.array([[  7.,  20.,  13.,  11.],
               [  4.,   5.,   9.,   7.],
               [  4.,   0.,   1.,   1.]])

assert np.array_equal(ans, arr)
print arr.shape, "\n", arr

(3, 4) 
[[  7.  20.  13.  11.]
 [  4.   5.   9.   7.]
 [  4.   0.   1.   1.]]


# Tanh Vanilla RNN_step Layer #

In [8]:
# Forward
"""
prev_h = (N,H)
x = (N, V)
W_hh = (H,H)
W_xh = (V,H)
b = (H,)
"""
prev_h = np.random.random((3,5)) # N = 3, H = 5
x = np.random.random((3,4)) # N = 3, V = 4
W_hh = np.random.random((5,5)) # H = 5
W_xh = np.random.random((4,5)) # V = 4, H = 5
b = np.random.random((5,)) # H = 5

res = rnn_step_forward(prev_h, W_hh, x, W_xh, b) # N = 3, H = 5
print res.shape

(3, 5)


In [9]:
# Backward
fprev_h = lambda prev_h: rnn_step_forward(prev_h, W_hh, x, W_xh, b)
fW_hh = lambda W_hh: rnn_step_forward(prev_h, W_hh, x, W_xh, b)
fx = lambda x: rnn_step_forward(prev_h, W_hh, x, W_xh, b)
fW_xh = lambda W_xh: rnn_step_forward(prev_h, W_hh, x, W_xh, b)
fb = lambda b: rnn_step_forward(prev_h, W_hh, x, W_xh, b)

dprev_h_num = numerical_gradient_check_multivar(fprev_h, prev_h)
dW_hh_num = numerical_gradient_check_multivar(fW_hh, W_hh)
dx_num = numerical_gradient_check_multivar(fx, x)
dW_xh_num = numerical_gradient_check_multivar(fW_xh, W_xh)
db_num = numerical_gradient_check_multivar(fb, b)

dW_hh, dW_xh, dprev_h, dx, db = rnn_step_backward(prev_h, W_hh, x, W_xh, b, np.ones_like(res))
print "dprev_h : ", norm_loss(dprev_h, dprev_h_num)
print "dW_hh : ", norm_loss(dW_hh, dW_hh_num)
print "dx : ", norm_loss(dx, dx_num)
print "dW_xh : ", norm_loss(dW_xh, dW_xh_num)
print "db : ", norm_loss(db, db_num)

dprev_h :  1.26218862526e-08
dW_hh :  1.26272394557e-08
dx :  1.26023289679e-08
dW_xh :  1.26237846596e-08
db :  1.26068527793e-08


# Tanh Vanilla RNN Layer #

In [10]:
# Forward
from tools.numerical_gradient import *
from models.layers import *
import numpy as np
"""
h0 = (N,H)
W_hh = (H,H)
x = (N,T,D)
W_xh = (D,H)
b = (H,)
"""
N = 3
D = 4
H = 5
T = 1

h0 = np.random.random((N,H))
W_hh = np.random.random((H,H))
x = np.random.random((N,T,D))
W_xh = np.random.random((D,H))
b = np.random.random((H,))

h = rnn_forward(x, W_xh, W_hh, b, h0)

print h.shape

(3, 1, 5)


  if h0 != None: # Supply an h0 state.


In [11]:
# Backward
fx = lambda x: rnn_forward(x, W_xh, W_hh, b, h0)
fW_xh = lambda W_xh: rnn_forward(x, W_xh, W_hh, b, h0)
fW_hh = lambda W_hh: rnn_forward(x, W_xh, W_hh, b, h0)
fb = lambda b: rnn_forward(x, W_xh, W_hh, b, h0)
fh0 = lambda h0: rnn_forward(x, W_xh, W_hh, b, h0)

dx_num = numerical_gradient_check_multivar(fx, x)
dW_xh_num = numerical_gradient_check_multivar(fW_xh, W_xh)
dW_hh_num = numerical_gradient_check_multivar(fW_hh, W_hh)
db_num = numerical_gradient_check_multivar(fb, b)
dh0_num = numerical_gradient_check_multivar(fh0, h0)

dW_hh, dW_xh, dx, db, dh0 = rnn_backward(x, W_xh, W_hh, b, h0, h, np.ones_like(h))
print "dW_hh : ", norm_loss(dW_hh, dW_hh_num)
print "dW_xh : ", norm_loss(dW_xh, dW_xh_num)
print "dx : ", norm_loss(dx, dx_num)
print "db : ", norm_loss(db, db_num)
print "dh0 : ", norm_loss(dh0, dh0_num)

dW_hh :  1.26005078165e-08
dW_xh :  1.26025025411e-08
dx :  1.25611031233e-08
db :  1.25909590028e-08
dh0 :  1.26174712937e-08


In [8]:
N, D, H = 3, 10, 4

x = np.linspace(-0.4, 0.7, num=N*D).reshape(N, D)
prev_h = np.linspace(-0.2, 0.5, num=N*H).reshape(N, H)
Wx = np.linspace(-0.1, 0.9, num=D*H).reshape(D, H)
Wh = np.linspace(-0.3, 0.7, num=H*H).reshape(H, H)
b = np.linspace(-0.2, 0.4, num=H)
next_h = rnn_step_forward(prev_h, Wh, x, Wx, b)
expected_next_h = np.asarray([
  [-0.58172089, -0.50182032, -0.41232771, -0.31410098],
  [ 0.66854692,  0.79562378,  0.87755553,  0.92795967],
  [ 0.97934501,  0.99144213,  0.99646691,  0.99854353]])

print next_h

 [[-0.58172089 -0.50182032 -0.41232771 -0.31410098]
 [ 0.66854692  0.79562378  0.87755553  0.92795967]
 [ 0.97934501  0.99144213  0.99646691  0.99854353]]


# Affine Layer #

In [2]:
# Forward
"""
h = (N,H)
W_hy = (H,D)
b = (D,)
"""
h = np.random.random((3,5)) # N = 3, H = 5
W_hy = np.random.random((5,7)) # H = 5, D = 7
b = np.random.random((7,)) # D = 7

res = affine_forward(h, W_hy, b) # N = 3, D = 7
print res.shape

(3, 7)


In [3]:
fh = lambda h: affine_forward(h, W_hy, b)
fW_hy = lambda W_hy: affine_forward(h, W_hy, b)
fb = lambda b: affine_forward(h, W_hy, b)

dh_num = numerical_gradient_check_multivar(fh, h)
dW_hy_num = numerical_gradient_check_multivar(fW_hy, W_hy)
db_num = numerical_gradient_check_multivar(fb, b)

dh, dW_hy, db = affine_backward(h, W_hy, b, np.ones_like(res))

print "dx : ", norm_loss(dh, dh_num)
print "dW_xh : ", norm_loss(dW_hy, dW_hy_num)
print "db : ", norm_loss(db, db_num)

dx :  1.263027982e-08
dW_xh :  1.2631957873e-08
db :  1.26310626518e-08


# Affine layer for RNN's #

In [4]:
# Forward
"""
h = (N,T,H)
W_hy = (H,D)
b = (D,)
"""
h = np.random.random((3,7,5)) # N = 3, T = 7, H = 5
W_hy = np.random.random((5,7)) # H = 5, D = 7
b = np.random.random((7,)) # D = 7

res = rnn_affine_forward(h, W_hy, b) # N = 3, D = 7
print res.shape

(3, 7, 7)


In [5]:
fh = lambda h: rnn_affine_forward(h, W_hy, b)
fW_hy = lambda W_hy: rnn_affine_forward(h, W_hy, b)
fb = lambda b: rnn_affine_forward(h, W_hy, b)

dh_num = numerical_gradient_check_multivar(fh, h)
dW_hy_num = numerical_gradient_check_multivar(fW_hy, W_hy)
db_num = numerical_gradient_check_multivar(fb, b)

dh, dW_hy, db = rnn_affine_backward(h, W_hy, b, np.ones_like(res))

print "dx : ", norm_loss(dh, dh_num)
print "dW_xh : ", norm_loss(dW_hy, dW_hy_num)
print "db : ", norm_loss(db, db_num)

dx :  1.26309993476e-08
dW_xh :  1.26313237317e-08
db :  1.26310626624e-08


# Softmax Layer - One of the most important functions in Deep Learning #

In [2]:
# Forward and backwards
"""
x = (N,D)
y = (N,)
"""
x = np.random.random((3,4)) # N = 3, D = 4
y = np.random.randint(4, size=3) # D = 4, N = 3

fx = lambda x: softmax(x, y)[0]

loss, dJ = softmax(x, y)
dJ_num = numerical_gradient_check_multivar(fx, x)
print "dJ : ", norm_loss(dJ, dJ_num)

dJ :  1.26099759008e-08


# SVM Layer - The other most important functions in Deep Learning #

In [2]:
# Forward and backwards
"""
x = (N,D)
y = (N,)
"""
x = np.random.random((3,5)) # N = 3, D = 5
y = np.random.randint(5, size=3) # D = 5, N = 3

fx = lambda x: SVM(x, y)[0]

loss, dJ = SVM(x, y)
dJ_num = numerical_gradient_check_multivar(fx, x)
print "dJ : ", norm_loss(dJ, dJ_num)

dJ :  1.26399460897e-08


# Softmax for RNN layer - the important function compatible for RNN's #

In [3]:
# Forward and backwards
"""
x = (N,T,D)
y = (N,)
"""
x = np.random.random((3,5,4)) # N = 3, T = 5, D = 4
y = np.random.randint(4, size=(3,5)) # D = 4, N = 3

fx = lambda x: rnn_softmax(x, y)[0]

loss, dJ = rnn_softmax(x, y)
dJ_num = numerical_gradient_check_multivar(fx, x)
print "dJ : ", norm_loss(dJ, dJ_num)    

dJ :  1.26306575262e-08


# I needed regularization on my net so I dropped out of UCLA #

In [5]:
# Forward and backwards
"""
x = (N,T,D)
"""
x = np.ones((30,50,40), float) # N = 30, T = 50, D = 40
# by def, 30x50x40 = sum of x = 60,000
p = 0.6 # we should see a number around 36,000

# Numerical checking on this kind of stuff is sketch since we have randomness in the function
print np.sum(x)
x,_ = dropout_forward(x, p)
print np.sum(x)
# These should be relatively similar.

60000.0
mask sum :  36128.0
60213.3333333


# The entire RNN forward() #

In [10]:
N, T, D, H = 2, 3, 4, 5

x = np.linspace(-0.1, 0.3, num=N*T*D).reshape(N, T, D)
h0 = np.linspace(-0.3, 0.1, num=N*H).reshape(N, H)
Wx = np.linspace(-0.2, 0.4, num=D*H).reshape(D, H)
Wh = np.linspace(-0.4, 0.1, num=H*H).reshape(H, H)
b = np.linspace(-0.7, 0.1, num=H)

h = rnn_forward(x, Wx, Wh, b, h0)
expected_h = np.asarray([
  [
    [-0.42070749, -0.27279261, -0.11074945,  0.05740409,  0.22236251],
    [-0.39525808, -0.22554661, -0.0409454,   0.14649412,  0.32397316],
    [-0.42305111, -0.24223728, -0.04287027,  0.15997045,  0.35014525],
  ],
  [
    [-0.55857474, -0.39065825, -0.19198182,  0.02378408,  0.23735671],
    [-0.27150199, -0.07088804,  0.13562939,  0.33099728,  0.50158768],
    [-0.51014825, -0.30524429, -0.06755202,  0.17806392,  0.40333043]]])
print h

[[[-0.42070749 -0.27279261 -0.11074945  0.05740409  0.22236251]
  [-0.39525808 -0.22554661 -0.0409454   0.14649412  0.32397316]
  [-0.42305111 -0.24223728 -0.04287027  0.15997045  0.35014525]]

 [[-0.55857474 -0.39065825 -0.19198182  0.02378408  0.23735671]
  [-0.27150199 -0.07088804  0.13562939  0.33099728  0.50158768]
  [-0.51014825 -0.30524429 -0.06755202  0.17806392  0.40333043]]]


# One pass #

In [12]:
N, D, V, H, T = 1,2,2,1,1

model = VanillaRNN(N, D, T, H, V)

np.random.seed(5)
for key in model.params:
    model.params[key] = np.random.random(model.params[key].shape)
    print key, " : ", model.params[key]

model.loss(np.array([[1]]), np.array([[0]]))

b_rnn  :  [ 0.22199317]
W_xh  :  [[ 0.87073231]
 [ 0.20671916]]
W_hy  :  [[ 0.91861091  0.48841119]]
words  :  [[ 0.61174386  0.76590786]
 [ 0.51841799  0.2968005 ]]
b_affine  :  [ 0.18772123  0.08074127]
W_hh  :  [[ 0.7384403]]
