In [1]:
from tools.numerical_gradient import *
from models.layers import *
from models.networks.vanilla_rnn import *
import numpy as np

# Let's try out some simple functions for numerical_gradient. #

We know the derivative of the linear function y = 3x is 3 everywhere, so the numerical gradient should come out at (approximately) 3. Let's check it:

In [2]:
def linear(x, slope=3):
    """Evaluate the line through the origin, y = slope * x, at x."""
    y = slope * x
    return y

# Numerical gradient of `linear` at x = 5 (helper semantics assumed from its name
# and the ~3.0 it printed); analytically d(3x)/dx = 3.
# NOTE: this binds a global named `slope`; linear's own `slope` default is unaffected.
slope = numerical_gradient_check_scalar(linear, 5)
print slope

2.99999999989


In [3]:
# Multi-argument function that passes back two streams of gradients.
# this is a sanity pre-check for LSTM's.
def multi(x,y):
    """Return the pair (x*y, x**2): both outputs depend on x, only the first on y."""
    return x*y, x**2

# Single-output views of `multi`, one per (input, output) pair. Each lambda's
# parameter shadows the global of the same name; the other operand is looked up
# in the globals when the lambda is called, so defining x, y afterwards is fine.
multi_x_a = lambda x: multi(x,y)[0]
multi_y_a = lambda y: multi(x,y)[0]
multi_x_b = lambda x: multi(x,y)[1]
multi_y_b = lambda y: multi(x,y)[1]

x,y = np.array([[3,4,5],[1,2,3]], float), np.array([[3,2,1],[7,8,9]], float)
a,b = multi(x,y)
# dummy gradients
da = np.ones_like(a)
db = np.ones_like(b)

# Total gradient w.r.t. an input = sum of the contributions flowing back through
# each output. With all-ones upstream gradients the analytic values are
# dx = y + 2*x and dy = x, which is what the printed results approximate.
dx_num = numerical_gradient_check_multivar(multi_x_a, x, da) + numerical_gradient_check_multivar(multi_x_b, x, db)
dy_num = numerical_gradient_check_multivar(multi_y_a, y, da) + numerical_gradient_check_multivar(multi_y_b, y, db)

print "dx_num : ", dx_num
print "dy_num : ", dy_num

dx_num :  [[  8.99999977   9.99999975  10.99999972]
 [  8.99999977  11.9999997   14.99999962]]
dy_num :  [[ 2.99999992  3.9999999   4.99999987]
 [ 0.99999997  1.99999995  2.99999992]]


In [4]:
def multi_cubic_field(x):
    """Map x to the 2-component field [x[0]**3 + x[1]**2, 2*x[0] + x[1]/12]."""
    x0, x1 = x[0], x[1]
    first = x0**3 + x1**2
    second = x0*2 + x1/12
    return np.array([first, second])
# Evaluation point (x0, x1) = (3, 4) for multi_cubic_field, stored as float32.
arr = np.array([3,4], dtype=np.float32)

def matrix_mult(x, b = np.array([[3,5],[2,1]])):
    """Right-multiply x by the matrix b (a fixed 2x2 matrix unless overridden)."""
    product = x.dot(b)
    return product
                
# Gradient of multi_cubic_field at arr with an all-ones upstream gradient.
# Analytically this is [3*x0**2 + 2, 2*x1 + 1/12] = [29, 8.0833...] at (3, 4).
# NOTE(review): the printed values are only accurate to ~2 significant decimals,
# presumably because arr is float32 -- confirm, and consider float64 inputs.
vector_field = numerical_gradient_check_multivar(multi_cubic_field, arr, np.ones_like(arr))
print vector_field

# Rebind arr to a 2x2 float32 matrix and repeat for matrix_mult. With all-ones
# upstream gradients every row of the result should be the row sums of the
# default b: [3+5, 2+1] = [8, 3].
arr = np.array([[3,4],[1,2]], dtype=np.float32)
vector_field = numerical_gradient_check_multivar(matrix_mult, arr, np.ones_like(arr))
print vector_field

[ 29.03938293   8.09431076]
[[ 8.01086426  3.0040741 ]
 [ 8.01086426  3.0040741 ]]


In [5]:
def affine_transform(w, x, b):
    """Affine map: project x through w, then shift by b (x.dot(w) + b)."""
    projected = x.dot(w)
    return projected + b

x = np.array([[1,2]], float) # 1 x 2
w = np.array([[3,2,1],[1,2,5]], float) # 2 x 3
b = np.array([[1,5,7]], float) # 1 x 3
# Only used for its shape (1 x 3) when building the all-ones upstream gradients below.
dummy = affine_transform(w,x,b)

# One-argument views of affine_transform: each lambda varies the argument it
# names and reads the other two from the globals defined above.
fw = lambda w: affine_transform(w,x,b)
fx = lambda x: affine_transform(w,x,b)
fb = lambda b: affine_transform(w,x,b)

# With all-ones upstream gradients the analytic results are
# dw = x.T . ones = [[1,1,1],[2,2,2]], dx = ones . w.T = [[6, 8]], db = ones.
vector_field = numerical_gradient_check_multivar(fw, w, np.ones_like(dummy))
print "fw : ", vector_field
vector_field = numerical_gradient_check_multivar(fx, x, np.ones_like(dummy))
print "fx : ", vector_field
vector_field = numerical_gradient_check_multivar(fb, b, np.ones_like(dummy))
print "fb : ", vector_field

fw :  [[ 0.99999997  0.99999997  0.99999997]
 [ 1.99999995  1.99999995  1.99999995]]
fx :  [[ 5.99999985  7.9999998 ]]
fb :  [[ 0.99999997  0.99999997  0.99999997]]


# Let's do a linear regression check (OLS) #

In [6]:
def costFunction(X, Theta, y):
    '''
    Half mean-squared-error cost of the linear model X.Theta, plus its gradient.

    X: design matrix, one sample per row (m rows)
    Theta: weights, applied as np.dot(X, Theta)
    y: targets, subtracted elementwise from the predictions
    :return: (cost, delta_weight) with
             cost = sum((X.Theta - y)**2) / 2 / m
             delta_weight = X.T.(X.Theta - y) / m
    '''
    predictions = np.dot(X, Theta)
    residual = predictions - y
    squared_error = np.sum(residual**2)
    cost = squared_error / 2 / predictions.shape[0]
    delta_weight = np.dot(X.T, residual) / (X.shape[0])
    return cost, delta_weight

In [7]:
# Say we have 5 dimensions on X and 1 on Y, we have N = 10
N, X_size, Y_size = 10, 5, 1

# NOTE(review): no random seed is set, so the printed numbers differ on every run.
X = np.random.random((N, X_size))
Y = np.random.random((N, Y_size))
Theta = np.random.random((X_size,1))
# Scalar cost as a function of Theta only; X and Y are read from the globals.
fTheta = lambda Theta: costFunction(X, Theta, Y)[0]

# The cost is a scalar, so a plain 1 is passed as the upstream gradient.
dTheta_num = numerical_gradient_check_multivar(fTheta, Theta, 1)
# Analytic gradient returned alongside the cost.
_, dTheta = costFunction(X, Theta, Y)

# The two gradients should agree; norm_loss condenses their difference into one
# number (exact definition lives in the imported helpers -- not shown here).
print dTheta_num
print dTheta
print norm_loss(dTheta_num, dTheta)

[[ 0.60761655]
 [ 0.48703559]
 [ 0.56804149]
 [ 0.53638075]
 [ 0.59080817]]
[[ 0.60761656]
 [ 0.4870356 ]
 [ 0.56804151]
 [ 0.53638077]
 [ 0.59080819]]
1.26323844376e-08


# word_embedding_forward/backward #

In [8]:
# Hand-built expected output: position (n, t) holds row x[n, t] of `words`
# (see x and words below), giving a (3, 3, 4) array.
ans = np.array([[[3, 4, 7, 1],
                [3, 4, 7, 1],
                [1, 5, 9, 4]],

               [[1, 5, 9, 4],
                [1, 5, 9, 4],
                [1, 5, 9, 4]],

               [[4, 3, 2, 5],
                [3, 4, 7, 1],
                [4, 3, 2, 5]]])

# x holds integer word indices; words is the 3 x 4 embedding table.
x = np.array([[1,1,0], [0,0,0], [2,1,2]], int)
words = np.array([[1,5,9,4],[3,4,7,1],[4,3,2,5]])
arr = word_embedding_forward(words, x)

# Exact comparison is fine here because everything is integer-valued.
assert np.array_equal(ans, arr)
print arr.shape, "\n", arr

(3, 3, 4) 
[[[3 4 7 1]
  [3 4 7 1]
  [1 5 9 4]]

 [[1 5 9 4]
  [1 5 9 4]
  [1 5 9 4]]

 [[4 3 2 5]
  [3 4 7 1]
  [4 3 2 5]]]


In [9]:
# Upstream gradient with the same (3, 3, 4) layout as the forward output.
# Relies on `words` and `x` left in the kernel by the forward cell.
dout = np.array([[[1,2,0,1],[3,2,9,1],[1,2,1,1]],
                 [[3,9,2,4],[1,9,9,0],[2,0,1,6]],
                 [[1,0,1,0],[0,1,0,5],[3,0,0,1]]])

arr = word_embedding_backward(dout, words, x)
# Expected gradient w.r.t. `words`: row k is the sum of the dout slices at every
# position where x == k (e.g. row 2 = dout[2,0] + dout[2,2] = [4, 0, 1, 1]).
ans = np.array([[  7.,  20.,  13.,  11.],
               [  4.,   5.,   9.,   7.],
               [  4.,   0.,   1.,   1.]])

assert np.array_equal(ans, arr)
print arr.shape, "\n", arr

(3, 4) 
[[  7.  20.  13.  11.]
 [  4.   5.   9.   7.]
 [  4.   0.   1.   1.]]


# Tanh Vanilla RNN_step Layer #

In [10]:
# Forward
"""
prev_h = (N,H)
x = (N, V)
W_hh = (H,H)
W_xh = (V,H)
b = (H,)
"""
N = 10
D = 3  # not used in this cell
H = 2
T = 1  # not used in this cell
V = 5

prev_h = np.random.random((N,H)) # (N, H) = (10, 2)
x = np.random.random((N,V)) # (N, V) = (10, 5)
W_hh = np.random.random((H,H)) # (H, H) = (2, 2)
W_xh = np.random.random((V,H)) # (V, H) = (5, 2)
b = np.random.random((H,)) # (H,) = (2,)

res = rnn_step_forward(prev_h, W_hh, x, W_xh, b) # (N, H) = (10, 2)
# Random upstream gradient shaped like the output; the backward cell reuses it.
gradients = np.random.random(res.shape)
print res.shape

(10, 2)


In [11]:
# Backward
# One-argument views of rnn_step_forward: each lambda varies the argument it
# names and reads the remaining inputs from the globals set in the forward cell.
fprev_h = lambda prev_h: rnn_step_forward(prev_h, W_hh, x, W_xh, b)
fW_hh = lambda W_hh: rnn_step_forward(prev_h, W_hh, x, W_xh, b)
fx = lambda x: rnn_step_forward(prev_h, W_hh, x, W_xh, b)
fW_xh = lambda W_xh: rnn_step_forward(prev_h, W_hh, x, W_xh, b)
fb = lambda b: rnn_step_forward(prev_h, W_hh, x, W_xh, b)

# Numerical gradients for every input, all driven by the same upstream `gradients`.
dprev_h_num = numerical_gradient_check_multivar(fprev_h, prev_h, gradients)
dW_hh_num = numerical_gradient_check_multivar(fW_hh, W_hh, gradients)
dx_num = numerical_gradient_check_multivar(fx, x, gradients)
dW_xh_num = numerical_gradient_check_multivar(fW_xh, W_xh, gradients)
db_num = numerical_gradient_check_multivar(fb, b, gradients)

# Analytic gradients; note the return order (dW_hh, dW_xh, dprev_h, dx, db).
dW_hh, dW_xh, dprev_h, dx, db = rnn_step_backward(prev_h, W_hh, x, W_xh, b, gradients)
# Each line compares an analytic gradient with its numerical estimate; the
# recorded run printed values around 1e-8 for all of them.
print "dprev_h : ", norm_loss(dprev_h, dprev_h_num)
print "dW_hh : ", norm_loss(dW_hh, dW_hh_num)
print "dx : ", norm_loss(dx, dx_num)
print "dW_xh : ", norm_loss(dW_xh, dW_xh_num)
print "db : ", norm_loss(db, db_num)

dprev_h :  1.26320346526e-08
dW_hh :  1.26243358504e-08
dx :  1.26243329418e-08
dW_xh :  1.26241828239e-08
db :  1.26147634111e-08


# Tanh Vanilla RNN Layer #

In [12]:
# Forward
from tools.numerical_gradient import *
from models.layers import *
import numpy as np
"""
h0 = (N,H)
W_hh = (H,H)
x = (N,T,D)
W_xh = (D,H)
b = (H,)
"""
N = 10
D = 3
H = 2
T = 1  # a single time step

# Random inputs scaled into [0, 2); the bias starts at zero.
h0 = np.random.random((N,H)) * 2
W_hh = np.random.random((H,H)) * 2
x = np.random.random((N,T,D)) * 2
W_xh = np.random.random((D,H)) * 2
b = np.zeros((H,))
# Snapshot of x; not referenced again in the visible cells.
x_copy = x.copy()

# Note the argument order differs from rnn_step_forward: (x, W_xh, W_hh, b, h0).
h = rnn_forward(x, W_xh, W_hh, b, h0)

  if h0 != None: # Supply an h0 state.


In [13]:
# Backward
# One-argument views of rnn_forward; the other inputs come from the globals
# defined in the forward cell.
fx = lambda x: rnn_forward(x, W_xh, W_hh, b, h0)
fW_xh = lambda W_xh: rnn_forward(x, W_xh, W_hh, b, h0)
fW_hh = lambda W_hh: rnn_forward(x, W_xh, W_hh, b, h0)
fb = lambda b: rnn_forward(x, W_xh, W_hh, b, h0)
fh0 = lambda h0: rnn_forward(x, W_xh, W_hh, b, h0)

# Numerical gradients with an all-ones upstream gradient shaped like h.
dx_num = numerical_gradient_check_multivar(fx, x, np.ones_like(h))
dW_xh_num = numerical_gradient_check_multivar(fW_xh, W_xh, np.ones_like(h))
dW_hh_num = numerical_gradient_check_multivar(fW_hh, W_hh, np.ones_like(h))
db_num = numerical_gradient_check_multivar(fb, b, np.ones_like(h))
dh0_num = numerical_gradient_check_multivar(fh0, h0, np.ones_like(h))

# Analytic gradients; rnn_backward also takes the forward output h.
# Return order is (dW_hh, dW_xh, dx, db, dh0).
dW_hh, dW_xh, dx, db, dh0 = rnn_backward(x, W_xh, W_hh, b, h0, h, np.ones_like(h))
# Raw side-by-side dump of two of the gradients, followed by the summary
# comparison for all of them.
print dW_hh, dW_hh_num
print db, db_num
print "dW_hh : ", norm_loss(dW_hh, dW_hh_num)
print "dW_xh : ", norm_loss(dW_xh, dW_xh_num)
print "dx : ", norm_loss(dx, dx_num)
print "db : ", norm_loss(db, db_num)
print "dh0 : ", norm_loss(dh0, dh0_num)

[[ 0.0042479   0.00371023]
 [ 0.00221519  0.00078672]] [[ 0.0042479   0.00371023]
 [ 0.00221519  0.00078672]]
[ 0.0030691   0.00233127] [ 0.0030691   0.00233127]
dW_hh :  1.23252048922e-08
dW_xh :  1.32962453723e-08
dx :  1.26422152002e-08
db :  1.28893202866e-08
dh0 :  1.26197410925e-08


In [14]:
N, D, H = 3, 10, 4

x = np.linspace(-0.4, 0.7, num=N*D).reshape(N, D)
prev_h = np.linspace(-0.2, 0.5, num=N*H).reshape(N, H)
Wx = np.linspace(-0.1, 0.9, num=D*H).reshape(D, H)
Wh = np.linspace(-0.3, 0.7, num=H*H).reshape(H, H)
b = np.linspace(-0.2, 0.4, num=H)
next_h = rnn_step_forward(prev_h, Wh, x, Wx, b)
expected_next_h = np.asarray([
  [-0.58172089, -0.50182032, -0.41232771, -0.31410098],
  [ 0.66854692,  0.79562378,  0.87755553,  0.92795967],
  [ 0.97934501,  0.99144213,  0.99646691,  0.99854353]])

print next_h

[[-0.58172089 -0.50182032 -0.41232771 -0.31410098]
 [ 0.66854692  0.79562378  0.87755553  0.92795967]
 [ 0.97934501  0.99144213  0.99646691  0.99854353]]


# The Monster LSTM_Step #

In [7]:
""" Forward pass """
""" Stealing one from the books """
# Deterministic linspace inputs. The weights have 4 * H columns and the bias
# 4 * H entries -- presumably the four LSTM gates side by side; confirm in
# lstm_step_forward.
N, V, H = 10,30,20
x = np.linspace(-0.4, 1.2, num=N*V).reshape(N, V)
prev_h = np.linspace(-0.3, 0.7, num=N*H).reshape(N, H)
prev_c = np.linspace(-0.4, 0.9, num=N*H).reshape(N, H)
W_xh = np.linspace(-2.1, 1.3, num=4*V*H).reshape(V, 4 * H)
W_hh = np.linspace(-0.7, 2.2, num=4*H*H).reshape(H, 4 * H)
b = np.linspace(0.3, 0.7, num=4*H)

# Three return values, unpacked as (cache, c, h); the backward cell relies on
# this order when it indexes [1] for c and [2] for h.
cache, c, h = lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)

In [8]:
# Backwards!
# LSTMs are complicated because two streams of gradients flow backwards (through c and through h).
# However, by the additive principle of gradients, the per-stream gradients can simply be
# added together to form the final gradient. Therefore, we compute the gradient through c,
# then the gradient through h, add the two together, and the sum SHOULD equal our step_backward.

# NOTE(review): dc is all zeros here, so the c-stream terms below presumably contribute
# nothing and only the h-stream is really exercised -- consider a random dc as well.
dc, dh = np.zeros(c.shape), np.random.random(h.shape)

# Views returning the next cell state c (index 1 of lstm_step_forward's result).
fprev_h_c = lambda prev_h: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[1]
fW_hh_c = lambda W_hh: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[1]
fx_c = lambda x: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[1]
fW_xh_c = lambda W_xh: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[1]
fb_c = lambda b: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[1]
fprev_c_c = lambda prev_c: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[1]

# Views returning the next hidden state h (index 2 of lstm_step_forward's result).
fprev_h_h = lambda prev_h: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[2]
fW_hh_h = lambda W_hh: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[2]
fx_h = lambda x: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[2]
fW_xh_h = lambda W_xh: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[2]
fb_h = lambda b: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[2]
fprev_c_h = lambda prev_c: lstm_step_forward(prev_h, W_hh, x, W_xh, b, prev_c)[2]

# For every input: numerical gradient through c (weighted by dc) plus
# numerical gradient through h (weighted by dh).
dprev_h_num = numerical_gradient_check_multivar(fprev_h_c, prev_h, dc) + \
              numerical_gradient_check_multivar(fprev_h_h, prev_h, dh)
dW_hh_num = numerical_gradient_check_multivar(fW_hh_c, W_hh, dc) + \
            numerical_gradient_check_multivar(fW_hh_h, W_hh, dh)
dx_num = numerical_gradient_check_multivar(fx_c, x, dc) + \
         numerical_gradient_check_multivar(fx_h, x, dh)
dW_xh_num = numerical_gradient_check_multivar(fW_xh_c, W_xh, dc) + \
            numerical_gradient_check_multivar(fW_xh_h, W_xh, dh)
db_num = numerical_gradient_check_multivar(fb_c, b, dc) + \
         numerical_gradient_check_multivar(fb_h, b, dh)
dprev_c_num = numerical_gradient_check_multivar(fprev_c_c, prev_c, dc) + \
              numerical_gradient_check_multivar(fprev_c_h, prev_c, dh)

# Analytic gradients from the cached forward pass; note dh is passed before dc.
dW_hh, dW_xh, dprev_h, dx, db, dprev_c = lstm_step_backward(W_hh, x, W_xh, b, cache, dh, dc)

print "dprev_h : ", norm_loss(dprev_h, dprev_h_num)
print "dW_hh : ", norm_loss(dW_hh, dW_hh_num)
print "dx : ", norm_loss(dx, dx_num)
print "dW_xh : ", norm_loss(dW_xh, dW_xh_num)
print "db : ", norm_loss(db, db_num)
print "dprev_c : ", norm_loss(dprev_c, dprev_c_num)

dprev_h :  1.2631887577e-08
dW_hh :  1.26313967883e-08
dx :  1.26331718809e-08
dW_xh :  1.26318084358e-08
db :  1.26278195781e-08
dprev_c :  1.26321386015e-08


# LSTM pass #

In [6]:
from tools.numerical_gradient import *
from models.layers import *
from models.networks.vanilla_rnn import *
import numpy as np

# Deterministic inputs for the full-sequence LSTM: x is (N, T, V) = (3, 6, 5),
# h0 is (N, H) = (3, 4), and the weights / bias carry 4 * H columns / entries.
N, V, H, T = 3, 5, 4, 6
x = np.linspace(-0.4, 0.6, num=N*T*V).reshape(N, T, V)
h0 = np.linspace(-0.4, 0.8, num=N*H).reshape(N, H)
W_xh = np.linspace(-0.2, 0.9, num=4*V*H).reshape(V, 4 * H)
W_hh = np.linspace(-0.3, 0.6, num=4*H*H).reshape(H, 4 * H)
b = np.linspace(0.2, 0.7, num=4*H)

# Two return values, unpacked as (caches, h); the backward cell uses index [1] for h
# and hands `caches` to lstm_backward.
caches, h = lstm_forward(x, W_xh, W_hh, b, h0)

In [7]:
# Random upstream gradient shaped like the hidden-state output h.
gradient = np.random.random(h.shape)

# One-argument views of lstm_forward's hidden-state output (index 1); the other
# inputs are read from the globals set in the forward cell.
fx = lambda x: lstm_forward(x, W_xh, W_hh, b, h0)[1]
fW_xh = lambda W_xh: lstm_forward(x, W_xh, W_hh, b, h0)[1]
fW_hh = lambda W_hh: lstm_forward(x, W_xh, W_hh, b, h0)[1]
fb = lambda b: lstm_forward(x, W_xh, W_hh, b, h0)[1]
fh0 = lambda h0: lstm_forward(x, W_xh, W_hh, b, h0)[1]

dx_num = numerical_gradient_check_multivar(fx, x, gradient)
dW_xh_num = numerical_gradient_check_multivar(fW_xh, W_xh, gradient)
dW_hh_num = numerical_gradient_check_multivar(fW_hh, W_hh, gradient)
db_num = numerical_gradient_check_multivar(fb, b, gradient)
dh0_num = numerical_gradient_check_multivar(fh0, h0, gradient)

# Analytic gradients from the cached forward pass; return order is
# (dW_hh, dW_xh, dx, db, dh0).
dW_hh, dW_xh, dx, db, dh0 = lstm_backward(x, W_xh, W_hh, b, h0, caches, gradient)

print "dW_hh : ", norm_loss(dW_hh, dW_hh_num)
print "dW_xh : ", norm_loss(dW_xh, dW_xh_num)
print "dx : ", norm_loss(dx, dx_num)
print "db : ", norm_loss(db, db_num)
print "dh0 : ", norm_loss(dh0, dh0_num)

dW_hh :  1.26256273091e-08
dW_xh :  1.26401079848e-08
dx :  1.26374131558e-08
db :  1.26448931228e-08
dh0 :  1.26217940386e-08


# Affine Layer #

In [17]:
# Forward
"""
h = (N,H)
W_hy = (H,D)
b = (D,)
"""
h = np.random.random((3,5)) # N = 3, H = 5
W_hy = np.random.random((5,7)) # H = 5, D = 7
b = np.random.random((7,)) # D = 7

res = affine_forward(h, W_hy, b) # N = 3, D = 7
# Random upstream gradient shaped like the output; the backward cell reuses it.
gradients = np.random.random(res.shape)
print res.shape

(3, 7)


In [18]:
fh = lambda h: affine_forward(h, W_hy, b)
fW_hy = lambda W_hy: affine_forward(h, W_hy, b)
fb = lambda b: affine_forward(h, W_hy, b)

dh_num = numerical_gradient_check_multivar(fh, h,gradients)
dW_hy_num = numerical_gradient_check_multivar(fW_hy, W_hy,gradients)
db_num = numerical_gradient_check_multivar(fb, b,gradients)

dh, dW_hy, db = affine_backward(h, W_hy, b, gradients)

print "dx : ", norm_loss(dh, dh_num)
print "dW_xh : ", norm_loss(dW_hy, dW_hy_num)
print "db : ", norm_loss(db, db_num)

dx :  1.26314633753e-08
dW_xh :  1.26305748999e-08
db :  1.26310626321e-08


# Affine layer for RNN's #

In [19]:
# Forward
"""
h = (N,T,H)
W_hy = (H,D)
b = (D,)
"""
h = np.random.random((3,7,5)) # N = 3, T = 7, H = 5
W_hy = np.random.random((5,7)) # H = 5, D = 7
b = np.random.random((7,)) # D = 7

res = rnn_affine_forward(h, W_hy, b) # (N, T, D) = (3, 7, 7)
print res.shape

(3, 7, 7)


In [21]:
gradients = np.random.random(res.shape)

fh = lambda h: rnn_affine_forward(h, W_hy, b)
fW_hy = lambda W_hy: rnn_affine_forward(h, W_hy, b)
fb = lambda b: rnn_affine_forward(h, W_hy, b)

dh_num = numerical_gradient_check_multivar(fh, h, gradients)
dW_hy_num = numerical_gradient_check_multivar(fW_hy, W_hy, gradients)
db_num = numerical_gradient_check_multivar(fb, b, gradients)

dh, dW_hy, db = rnn_affine_backward(h, W_hy, b, gradients)

print "dx : ", norm_loss(dh, dh_num)
print "dW_xh : ", norm_loss(dW_hy, dW_hy_num)
print "db : ", norm_loss(db, db_num)

dx :  1.26308770636e-08
dW_xh :  1.26311152626e-08
db :  1.26310625571e-08


# Softmax Layer - One of the most important functions in Deep Learning #

In [5]:
# Forward and backwards
"""
x = (N,D)
y = (N,)
"""
x = np.random.random((3,4)) # N = 3, D = 4
y = np.random.randint(4, size=3) # D = 4, N = 3; integer labels in [0, D)

# Scalar loss as a function of x only; y is read from the globals.
fx = lambda x: softmax(x, y)[0]

# softmax returns (loss, gradient w.r.t. x).
loss, dJ = softmax(x, y)

# The loss is a scalar, so a plain 1 is passed as the upstream gradient.
dJ_num = numerical_gradient_check_multivar(fx, x, 1)
print "dJ : ", norm_loss(dJ, dJ_num)

dJ :  1.26331639816e-08


# SVM Layer - The other most important function in Deep Learning #

In [23]:
# Forward and backwards
"""
x = (N,D)
y = (N,)
"""
x = np.random.random((3,5)) # N = 3, D = 5
y = np.random.randint(5, size=3) # D = 5, N = 3; integer labels in [0, D)

# Scalar loss as a function of x only; y is read from the globals.
fx = lambda x: SVM(x, y)[0]

# SVM returns (loss, gradient w.r.t. x).
loss, dJ = SVM(x, y)

# The loss is a scalar, so a plain 1 is passed as the upstream gradient.
dJ_num = numerical_gradient_check_multivar(fx, x, 1)
print "dJ : ", norm_loss(dJ, dJ_num)

dJ :  1.26666177132e-08


# Softmax for RNN layer - the important function compatible with RNNs #

In [24]:
# Forward and backwards
"""
x = (N,T,D)
y = (N,)
"""
# NOTE: in this cell y is actually built with shape (N, T) = (3, 5), one label
# per time step, not (N,) as the shape note above says.
x = np.random.random((3,5,4)) # N = 3, T = 5, D = 4
y = np.random.randint(4, size=(3,5)) # integer labels in [0, D) with D = 4; shape (N, T) = (3, 5)

# Scalar loss as a function of x only; y is read from the globals.
fx = lambda x: rnn_softmax(x, y)[0]

# rnn_softmax returns (loss, gradient w.r.t. x).
loss, dJ = rnn_softmax(x, y)
# The loss is a scalar, so a plain 1 is passed as the upstream gradient.
dJ_num = numerical_gradient_check_multivar(fx, x, 1)
print "dJ : ", norm_loss(dJ, dJ_num)    

dJ :  1.26174307889e-08


# I needed regularization on my net so I dropped out of UCLA #

In [25]:
# Forward and backwards
"""
x = (N,T,D)
"""
x = np.ones((30,50,40), float) # N = 30, T = 50, D = 40
# by def, 30x50x40 = sum of x = 60,000
p = 0.6 # a dropout that keeps 60% of the units WITHOUT rescaling would sum to about 36,000
# NOTE(review): the recorded run printed ~59,820 instead, which suggests dropout_forward
# rescales the surviving units (inverted dropout) so the expected sum is preserved --
# confirm against its implementation.

# Numerical checking on this kind of stuff is sketch since we have randomness in the function
print np.sum(x)
# The second return value is discarded here.
x,_ = dropout_forward(x, p)
print np.sum(x)
# These should be relatively similar.
# These should be relatively similar.

60000.0
59820.0


# The entire RNN forward() #

In [26]:
N, T, D, H = 2, 3, 4, 5

x = np.linspace(-0.1, 0.3, num=N*T*D).reshape(N, T, D)
h0 = np.linspace(-0.3, 0.1, num=N*H).reshape(N, H)
Wx = np.linspace(-0.2, 0.4, num=D*H).reshape(D, H)
Wh = np.linspace(-0.4, 0.1, num=H*H).reshape(H, H)
b = np.linspace(-0.7, 0.1, num=H)

h = rnn_forward(x, Wx, Wh, b, h0)
expected_h = np.asarray([
  [
    [-0.42070749, -0.27279261, -0.11074945,  0.05740409,  0.22236251],
    [-0.39525808, -0.22554661, -0.0409454,   0.14649412,  0.32397316],
    [-0.42305111, -0.24223728, -0.04287027,  0.15997045,  0.35014525],
  ],
  [
    [-0.55857474, -0.39065825, -0.19198182,  0.02378408,  0.23735671],
    [-0.27150199, -0.07088804,  0.13562939,  0.33099728,  0.50158768],
    [-0.51014825, -0.30524429, -0.06755202,  0.17806392,  0.40333043]]])
print h


[[[-0.42070749 -0.27279261 -0.11074945  0.05740409  0.22236251]
  [-0.39525808 -0.22554661 -0.0409454   0.14649412  0.32397316]
  [-0.42305111 -0.24223728 -0.04287027  0.15997045  0.35014525]]

 [[-0.55857474 -0.39065825 -0.19198182  0.02378408  0.23735671]
  [-0.27150199 -0.07088804  0.13562939  0.33099728  0.50158768]
  [-0.51014825 -0.30524429 -0.06755202  0.17806392  0.40333043]]]


# One pass #

In [27]:
# Smallest possible end-to-end pass through VanillaRNN with fixed parameters.
N, D, V, H, T = 1,2,2,1,2

model = VanillaRNN(N, D, T, H, V)

# NOTE(review): the seed is set after the model is constructed, so it cannot
# influence whatever initialisation VanillaRNN performs; every parameter is
# overwritten just below anyway.
np.random.seed(5)
# Set all model parameters to fixed values
# (only values of existing keys are reassigned, so the dict size does not change
# while iterating; iteritems() is Python 2 only).
for k, v in model.params.iteritems():
    model.params[k] = np.linspace(-1.4, 1.3, num=v.size).reshape(*v.shape)
    print k
    print model.params[k].shape

h0 = np.linspace(-1.5, 0.3, num=(N * H)).reshape(N, H)
captions = (np.arange(N * T) % V).reshape(N, T)
print h0
print captions
# Inputs are the captions without the last token, targets are the captions
# shifted left by one (next-token prediction).
x = captions[:,:-1]
y = captions[:,1:]

# model.loss returns three values; the third is ignored here.
loss, grads, _ = model.loss(x, y, h0)
print loss

b_rnn
(1,)
W_xh
(2, 1)
W_hy
(1, 2)
words
(2, 2)
b_affine
(2,)
W_hh
(1, 1)
[[-1.5]]
[[0 1]]
0.00495557253145
