# ITU YZV302(3)E Deep Learning Course Fall 2024

# HW3

# Q1: RNN and LSTM (10 pts)

In [1]:
import numpy as np

from DL.layer.recurrent_layers import RNNLayer, LSTMLayer
from DL.checks import rel_error, grad_check

%load_ext autoreload
%autoreload 2

## RNN [5 pts]

Implement a simple RNNLayer in "DL/layer/recurrent_layers.py". The learnable parameters are $W_x$, $W_h$ and $b$, which are set during initialization. The dimensions of the parameters are given in the comments.
The RNN layer should compute:

$h^{(t)} = \tanh(b + W_h h^{(t-1)} + W_x x^{(t)})$

After completing your implementation, you can test each method with the tester functions below. Every check should print True.

### Forward Step [1 pt]

In [2]:
def rnn_test_forward_step():
    """Check RNNLayer.forward_step against precomputed reference values.

    Deterministic inputs and parameters are generated with np.linspace, a
    single forward step is run, and True is printed when
    rel_error(expected, actual) is below 1e-6.
    """
    N, D, H = 3, 10, 4
    # Size the layer from D and H so it cannot drift from the tensor shapes below.
    rnn = RNNLayer(D, H)
    x = np.linspace(-0.4, 0.7, num=N*D).reshape(N, D)
    prev_h = np.linspace(-0.2, 0.5, num=N*H).reshape(N, H)
    # Overwrite the initialised parameters with fixed values for reproducibility.
    rnn.Wx = np.linspace(-0.1, 0.9, num=D*H).reshape(D, H)
    rnn.Wh = np.linspace(-0.3, 0.7, num=H*H).reshape(H, H)
    rnn.b = np.linspace(-0.2, 0.4, num=H)

    # The second return value (the cache) is not needed for the forward check.
    next_h, _ = rnn.forward_step(x, prev_h)
    expected_next_h = np.array([
      [-0.58172089, -0.50182032, -0.41232771, -0.31410098],
      [ 0.66854692,  0.79562378,  0.87755553,  0.92795967],
      [ 0.97934501,  0.99144213,  0.99646691,  0.99854353]])

    print(rel_error(expected_next_h, next_h) < 1e-6)

rnn_test_forward_step()

True


### Forward [1.5 pts]

In [3]:
def rnn_test_forward():
    """Check RNNLayer.forward against precomputed hidden states.

    Runs the layer over a (N, T, D) input built with np.linspace and prints
    True when the output has the reference shape and
    rel_error(expected, actual) is below 1e-6 over the WHOLE batch, not only
    the first sample.
    """
    N, T, D, H = 2, 3, 4, 5
    # Size the layer from D and H so it cannot drift from the tensor shapes below.
    rnn = RNNLayer(D, H)
    x = np.linspace(-0.1, 0.3, num=N*T*D).reshape(N, T, D)
    prev_h = np.linspace(-0.3, 0.1, num=N*H).reshape(N, H)
    # Overwrite the initialised parameters with fixed values for reproducibility.
    rnn.Wx = np.linspace(-0.2, 0.4, num=D*H).reshape(D, H)
    rnn.Wh = np.linspace(-0.4, 0.1, num=H*H).reshape(H, H)
    rnn.b = np.linspace(-0.7, 0.1, num=H)

    h = rnn.forward(x, prev_h)
    expected_h = np.array([
      [
        [-0.42070749, -0.27279261, -0.11074945,  0.05740409,  0.22236251],
        [-0.39525808, -0.22554661, -0.0409454,   0.14649412,  0.32397316],
        [-0.42305111, -0.24223728, -0.04287027,  0.15997045,  0.35014525],
      ],
      [
        [-0.55857474, -0.39065825, -0.19198182,  0.02378408,  0.23735671],
        [-0.27150199, -0.07088804,  0.13562939,  0.33099728,  0.50158768],
        [-0.51014825, -0.30524429, -0.06755202,  0.17806392,  0.40333043]]])

    # Guard on shape first so a mis-shaped output cannot pass via broadcasting,
    # then compare every sample; the second sample used to go unchecked.
    shapes_match = np.shape(h) == expected_h.shape
    print(shapes_match and rel_error(expected_h, h) < 1e-6)

rnn_test_forward()

True


### Backward Step [1 pt]

In [4]:
def rnn_test_backward_step():
    """Compare RNNLayer.backward_step gradients with grad_check results.

    Random inputs and parameters are drawn with a fixed seed; one True/False
    line is printed per gradient in the order dx, dprev_h, dWx, dWh, db.
    """
    np.random.seed(145)
    N, D, H = 3, 10, 5
    rnn = RNNLayer(D, H)

    # The draw order below fixes the random values; do not reorder.
    x = np.random.randn(N, D)
    prev_h = np.random.randn(N, H)
    rnn.Wx = np.random.randn(D, H)
    rnn.Wh = np.random.randn(H, H)
    rnn.b = np.random.randn(H)

    _, cache = rnn.forward_step(x, prev_h)

    upstream = np.linspace(-0.2, 0.4, num=N*H).reshape(N, H)

    dx, dprev_h, dWx, dWh, db = rnn.backward_step(upstream, cache)

    def step_output(_):
        # The argument is ignored: the closure re-reads x, prev_h and the layer
        # parameters, so grad_check presumably perturbs the passed array in
        # place -- TODO confirm against DL.checks.
        return rnn.forward_step(x, prev_h)[0]

    # Numerical gradients, evaluated in the same order as the analytic ones.
    perturbed = (x, prev_h, rnn.Wx, rnn.Wh, rnn.b)
    numeric = [grad_check(step_output, arr, upstream) for arr in perturbed]

    for num_grad, ana_grad in zip(numeric, (dx, dprev_h, dWx, dWh, db)):
        print(rel_error(num_grad, ana_grad) < 1e-6)

rnn_test_backward_step()

True
True
True
True
True


### Backward [1.5 pts]

In [5]:
def rnn_test_backward():
    """Compare RNNLayer.backward gradients with grad_check results.

    Runs forward over a random (N, T, D) batch, calls backward with a random
    upstream gradient, reads the analytic gradients from rnn.grad, and prints
    one True/False line per gradient in the order dx, dh0, dWx, dWh, db.
    """
    np.random.seed(145)

    N, D, T, H = 3, 10, 7, 5
    rnn = RNNLayer(D, H)

    # The draw order below fixes the random values; do not reorder.
    x = np.random.randn(N, T, D)
    h0 = np.random.randn(N, H)
    rnn.Wx = np.random.randn(D, H)
    rnn.Wh = np.random.randn(H, H)
    rnn.b = np.random.randn(H)

    hidden = rnn.forward(x, h0)

    upstream = np.random.randn(*hidden.shape)

    rnn.backward(upstream)

    # Read the analytic gradients before grad_check re-runs forward below.
    grad_keys = ('dx', 'dh0', 'dWx', 'dWh', 'db')
    analytic = [rnn.grad[key] for key in grad_keys]

    def sequence_output(_):
        # The argument is ignored: the closure re-reads x, h0 and the layer
        # parameters, so grad_check presumably perturbs the passed array in
        # place -- TODO confirm against DL.checks.
        return rnn.forward(x, h0)

    perturbed = (x, h0, rnn.Wx, rnn.Wh, rnn.b)
    numeric = [grad_check(sequence_output, arr, upstream) for arr in perturbed]

    for num_grad, ana_grad in zip(numeric, analytic):
        print(rel_error(num_grad, ana_grad) < 1e-6)

rnn_test_backward()

True
True
True
True
True


## LSTM [5 pts]

Implement a simple LSTMLayer in "DL/layer/recurrent\_layers.py". The learnable parameters are $W_x$, $W_h$ and $b$, which are set during initialization. The dimensions of the parameters are given in the comments.
The LSTM layer should compute:

$a = b + W_h h^{(t-1)} + W_x x^{(t)}$

$a = [a_i, a_f, a_o, a_g]$

$input = \sigma(a_i)$, $forget = \sigma(a_f)$, $output = \sigma(a_o)$, $input\_gate = \tanh(a_g)$

$c^{(t)} = forget \odot  c^{(t-1)} + input \odot input\_gate$

$h^{(t)} = output \odot \tanh(c^{(t)})$

Note: the forward function is used only to obtain the hidden states for the input batch, and the batch is assumed to begin at the start of the sequence. Therefore, the cell state should be initialized to 0, and the resulting cell states do not need to be returned.

After completing your implementation, you can test each method with the tester functions below. Every check should print True.

### Forward Step [1 pt]

In [7]:
def lstm_test_forward_step():
    """Check LSTMLayer.forward_step against precomputed reference values.

    Deterministic inputs and parameters are generated with np.linspace, a
    single forward step is run, and two lines are printed: True when the next
    hidden state matches the reference, then True when the next cell state
    does (rel_error below 1e-6 in both cases).
    """
    N, D, H = 3, 4, 5
    # Size the layer from D and H so it cannot drift from the tensor shapes below.
    lstm = LSTMLayer(D, H)
    x = np.linspace(-0.4, 1.2, num=N*D).reshape(N, D)
    prev_h = np.linspace(-0.3, 0.7, num=N*H).reshape(N, H)
    prev_c = np.linspace(-0.4, 0.9, num=N*H).reshape(N, H)
    # Parameters carry 4*H columns: one H-wide slice per gate pre-activation.
    lstm.Wx = np.linspace(-2.1, 1.3, num=4*D*H).reshape(D, 4 * H)
    lstm.Wh = np.linspace(-0.7, 2.2, num=4*H*H).reshape(H, 4 * H)
    lstm.b = np.linspace(0.3, 0.7, num=4*H)

    # The third return value (the cache) is not needed for the forward check.
    next_h, next_c, _ = lstm.forward_step(x, prev_h, prev_c)

    expected_next_h = np.asarray([
        [ 0.24635157,  0.28610883,  0.32240467,  0.35525807,  0.38474904],
        [ 0.49223563,  0.55611431,  0.61507696,  0.66844003,  0.7159181 ],
        [ 0.56735664,  0.66310127,  0.74419266,  0.80889665,  0.858299  ]])
    expected_next_c = np.asarray([
        [ 0.32986176,  0.39145139,  0.451556,    0.51014116,  0.56717407],
        [ 0.66382255,  0.76674007,  0.87195994,  0.97902709,  1.08751345],
        [ 0.74192008,  0.90592151,  1.07717006,  1.25120233,  1.42395676]])

    print(rel_error(expected_next_h, next_h) < 1e-6)
    print(rel_error(expected_next_c, next_c) < 1e-6)

lstm_test_forward_step()

True
True


### Forward [1.5 pts]

In [8]:
def lstm_test_forward():
    """Check LSTMLayer.forward against precomputed hidden states.

    Runs the layer over a (N, T, D) input built with np.linspace and prints
    True when the output has the reference shape and
    rel_error(expected, actual) is below 1e-6 over the WHOLE batch, not only
    the first sample.
    """
    N, D, H, T = 2, 5, 4, 3
    # Size the layer from D and H so it cannot drift from the tensor shapes below.
    lstm = LSTMLayer(D, H)
    x = np.linspace(-0.4, 0.6, num=N*T*D).reshape(N, T, D)
    h0 = np.linspace(-0.4, 0.8, num=N*H).reshape(N, H)
    # Parameters carry 4*H columns: one H-wide slice per gate pre-activation.
    lstm.Wx = np.linspace(-0.2, 0.9, num=4*D*H).reshape(D, 4 * H)
    lstm.Wh = np.linspace(-0.3, 0.6, num=4*H*H).reshape(H, 4 * H)
    lstm.b = np.linspace(0.2, 0.7, num=4*H)

    h = lstm.forward(x, h0)

    expected_h = np.asarray([
     [[ 0.01764008,  0.01823233,  0.01882671,  0.0194232 ],
      [ 0.11287491,  0.12146228,  0.13018446,  0.13902939],
      [ 0.31358768,  0.33338627,  0.35304453,  0.37250975]],
     [[ 0.45767879,  0.4761092,   0.4936887,   0.51041945],
      [ 0.6704845,   0.69350089,  0.71486014,  0.7346449 ],
      [ 0.81733511,  0.83677871,  0.85403753,  0.86935314]]])

    # Guard on shape first so a mis-shaped output cannot pass via broadcasting,
    # then compare every sample; the second sample used to go unchecked.
    shapes_match = np.shape(h) == expected_h.shape
    print(shapes_match and rel_error(expected_h, h) < 1e-6)

lstm_test_forward()

True


### Backward Step [1 pt]

In [9]:
def lstm_test_backward_step():
    """Compare LSTMLayer.backward_step gradients with grad_check results.

    A forward step produces two outputs (next_h and next_c), each with its own
    random upstream gradient, so each numerical gradient is the sum of the
    grad_check result through next_h and the one through next_c. One
    True/False line is printed per gradient in the order
    dx, dprev_h, dprev_c, dWx, dWh, db.
    """
    np.random.seed(132)

    N, D, H = 4, 5, 6
    # Size the layer from D and H so it cannot drift from the tensor shapes below.
    lstm = LSTMLayer(D, H)
    # The draw order below fixes the random values; do not reorder.
    x = np.random.randn(N, D)
    prev_h = np.random.randn(N, H)
    prev_c = np.random.randn(N, H)
    lstm.Wx = np.random.randn(D, 4 * H)
    lstm.Wh = np.random.randn(H, 4 * H)
    lstm.b = np.random.randn(4 * H)

    next_h, next_c, cache = lstm.forward_step(x, prev_h, prev_c)

    dnext_h = np.random.randn(*next_h.shape)
    dnext_c = np.random.randn(*next_c.shape)

    # Both closures ignore their argument and re-read the inputs and layer
    # parameters, so grad_check presumably perturbs the passed array in
    # place -- TODO confirm against DL.checks.
    f_h = lambda _: lstm.forward_step(x, prev_h, prev_c)[0]
    f_c = lambda _: lstm.forward_step(x, prev_h, prev_c)[1]

    def numeric_grad(target):
        # Total numerical gradient w.r.t. `target` through both outputs.
        return grad_check(f_h, target, dnext_h) + grad_check(f_c, target, dnext_c)

    dx_num = numeric_grad(x)
    dprev_h_num = numeric_grad(prev_h)
    dprev_c_num = numeric_grad(prev_c)
    dWx_num = numeric_grad(lstm.Wx)
    dWh_num = numeric_grad(lstm.Wh)
    db_num = numeric_grad(lstm.b)

    # Analytic gradients use the cache captured by the forward step above.
    dx, dh, dc, dWx, dWh, db = lstm.backward_step(dnext_h, dnext_c, cache)

    print(rel_error(dx_num, dx) < 1e-6)
    print(rel_error(dprev_h_num, dh) < 1e-6)
    print(rel_error(dprev_c_num, dc) < 1e-6)
    print(rel_error(dWx_num, dWx) < 1e-6)
    print(rel_error(dWh_num, dWh) < 1e-6)
    print(rel_error(db_num, db) < 1e-6)

lstm_test_backward_step()

True
True
True
True
True
True


### Backward [1.5 pts]

In [10]:
def lstm_test_backward():
    """Compare LSTMLayer.backward gradients with grad_check results.

    Runs forward over a random (N, T, D) batch, calls backward with a random
    upstream gradient, reads the analytic gradients from lstm.grad, and prints
    one True/False line per gradient in the order dx, dh0, dWx, dWh, db.
    """
    np.random.seed(231)

    N, D, T, H = 2, 3, 10, 6

    # Size the layer from D and H so it cannot drift from the tensor shapes below.
    lstm = LSTMLayer(D, H)

    # The draw order below fixes the random values; do not reorder.
    x = np.random.randn(N, T, D)
    h0 = np.random.randn(N, H)
    lstm.Wx = np.random.randn(D, 4 * H)
    lstm.Wh = np.random.randn(H, 4 * H)
    lstm.b = np.random.randn(4 * H)

    out = lstm.forward(x, h0)

    dnext_h = np.random.randn(*out.shape)

    lstm.backward(dnext_h)
    # Read the analytic gradients before grad_check re-runs forward below.
    dx, dh0, dWx, dWh, db = lstm.grad['dx'], lstm.grad['dh0'], lstm.grad['dWx'], lstm.grad['dWh'], lstm.grad['db']

    # The closure ignores its argument and re-reads x, h0 and the layer
    # parameters, so grad_check presumably perturbs the passed array in
    # place -- TODO confirm against DL.checks.
    f = lambda _: lstm.forward(x, h0)

    dx_num = grad_check(f, x, dnext_h)
    dh0_num = grad_check(f, h0, dnext_h)
    dWx_num = grad_check(f, lstm.Wx, dnext_h)
    dWh_num = grad_check(f, lstm.Wh, dnext_h)
    db_num = grad_check(f, lstm.b, dnext_h)

    print(rel_error(dx_num, dx) < 1e-6)
    print(rel_error(dh0_num, dh0) < 1e-6)
    print(rel_error(dWx_num, dWx) < 1e-6)
    print(rel_error(dWh_num, dWh) < 1e-6)
    print(rel_error(db_num, db) < 1e-6)

lstm_test_backward()

True
True
True
True
True
