In [6]:
import numpy as np

In [7]:
def tanh(x, derv=False):
    """Hyperbolic tangent activation, or its derivative.

    Args:
        x: Pre-activation value (scalar or array) handed to ``np.tanh``.
        derv: When true, return the derivative ``1 - tanh(x)**2`` evaluated
            at the pre-activation ``x`` instead of the activation itself.

    Returns:
        ``np.tanh(x)``, or ``1 - np.tanh(x)**2`` when ``derv`` is set.
    """
    activation = np.tanh(x)
    if not derv:
        return activation
    return 1 - activation**2

In [8]:
def derv_tanh(x):
    """Derivative of tanh written in terms of the tanh *output*.

    Args:
        x: An already-activated value ``tanh(z)`` (``backward_cell`` passes
            the hidden state here), not the pre-activation ``z``.

    Returns:
        ``1 - x**2``.
    """
    squared = x**2
    return 1 - squared

In [9]:
def MSE(y, y_pred, derv=False):
    """Mean squared error, or its element-wise gradient with respect to ``y_pred``.

    Args:
        y: Target values.
        y_pred: Predicted values; must broadcast against ``y``.
        derv: When true, return the gradient instead of the loss.

    Returns:
        The scalar ``np.mean((y_pred - y)**2)``, or ``2 * (y_pred - y)`` when
        ``derv`` is set. The gradient is NOT divided by the number of
        elements, so it is the derivative of the summed squared error.
    """
    residual = y_pred - y
    if derv:
        return 2 * residual
    return np.mean(residual**2)

In [10]:
# Number of features per time step; sets the first dimension of Wx.
INPUT_SIZE = 1
# Width of the recurrent hidden state; sets the shapes of Wh, bh and each row of H.
HIDDEN_SIZE = 50

In [11]:
# Draw the initial weights from a dedicated, seeded generator so that
# Restart Kernel -> Run All reproduces the same starting parameters without
# touching the global NumPy random state.
PARAM_SEED = 0
_param_rng = np.random.RandomState(PARAM_SEED)

# Input-to-hidden weights, standard normal scaled by sqrt(2 / (fan_in + fan_out))
# (Glorot-style scaling).
Wx = _param_rng.randn(INPUT_SIZE, HIDDEN_SIZE) * np.sqrt(2/(INPUT_SIZE+HIDDEN_SIZE))
# Hidden-to-hidden weights, standard normal scaled by sqrt(1 / HIDDEN_SIZE).
Wh = _param_rng.randn(HIDDEN_SIZE, HIDDEN_SIZE) * np.sqrt(1/HIDDEN_SIZE)

# Hidden bias kept as a single row so it broadcasts over the (1, HIDDEN_SIZE)
# pre-activation computed in forward_cell.
bh = np.zeros((1, HIDDEN_SIZE))

# update_params walks this list by index and updates each array in place, so
# the order must match the gradient tuple built in backward: (dWx, dWh, dbh).
params = [Wx, Wh, bh]
len_params = len(params)

In [12]:
def update_params(grads, lr):
    """Apply one plain gradient-descent step to the global ``params`` list.

    Args:
        grads: Indexable of gradients, positionally aligned with ``params``.
        lr: Step size that scales every gradient.
    """
    for idx in range(len_params):
        step = lr * grads[idx]
        # Augmented assignment: the ndarrays stored in `params` are updated
        # in place, so the module-level weight arrays see the change too.
        params[idx] -= step

In [13]:
def forward_cell(xt, h_prev):
    """Compute one recurrent step: ``tanh(xt @ Wx + h_prev @ Wh + bh)``.

    Args:
        xt: Input for the current step; ``forward`` passes a one-row slice
            ``x[t:t+1]``.
        h_prev: Hidden state from the previous step; ``forward`` passes a
            one-row slice of ``H``.

    Returns:
        The new hidden state, using the global ``Wx``, ``Wh`` and ``bh``.
    """
    pre_activation = xt @ Wx + h_prev @ Wh + bh
    return tanh(pre_activation)

In [14]:
def forward(x):
    """Run the recurrent cell over a whole sequence.

    Side effect: rebinds the module-level ``H`` to the hidden states of this
    call; ``backward`` reads them from there.

    Args:
        x: Input sequence; ``x.shape[0]`` is the number of time steps and each
            one-row slice ``x[t:t+1]`` is fed to ``forward_cell``.

    Returns:
        ``H[1:]`` - the hidden state after every step, without the initial
        all-zero state.
    """
    global H
    n_steps = x.shape[0]

    # Row 0 is the all-zero initial state; row k holds the state after step k.
    H = np.zeros((n_steps + 1, HIDDEN_SIZE))

    for step in range(1, n_steps + 1):
        H[step] = forward_cell(x[step-1:step], H[step-1:step])

    # The initial zero state is bookkeeping only, so it is not returned.
    return H[1:]

In [15]:
# Smoke test: run the forward pass on a random 32-step sequence and display
# the returned hidden states. This also sets the global H used by backward().
forward(np.random.randn(32, INPUT_SIZE))

array([[ 0.04061895,  0.1512158 , -0.39896935, ...,  0.18427197,
         0.00766315, -0.09402148],
       [ 0.10986396,  0.33560756,  0.16270822, ...,  0.10644192,
         0.17815665,  0.25940096],
       [ 0.03510306, -0.31307622,  0.47600725, ...,  0.36293341,
         0.01041293,  0.29119811],
       ...,
       [ 0.06711985,  0.27441649, -0.0348259 , ...,  0.44030201,
        -0.67727833,  0.16354043],
       [ 0.5431602 ,  0.78339917, -0.37900372, ...,  0.23222014,
        -0.75629314, -0.27303233],
       [ 0.31577465,  0.41463568,  0.59907025, ...,  0.15891222,
         0.1152826 , -0.1204433 ]])

In [16]:
def backward_cell(dL, xt, ht, h_prev):
    """Backpropagate through a single recurrent step.

    Args:
        dL: Gradient of the loss with respect to this step's hidden state
            ``ht`` (the loss term plus whatever flows back from the next step).
            It is no longer modified in place.
        xt: Input that was fed to this step (one-row slice).
        ht: Hidden state produced by this step, i.e. the tanh output.
        h_prev: Hidden state that was fed into this step.

    Returns:
        Tuple ``(dWx, dWh, dbh, dh_t, dxt)``: gradients for the global ``Wx``,
        ``Wh`` and ``bh``, the gradient flowing to the previous hidden state
        (``dh_t``), and the gradient with respect to the input ``xt``.
    """
    # Gradient at the pre-activation. Computed out of place so the caller's
    # `dL` array is left untouched (the original `dL *= ...` overwrote it).
    dz = dL * derv_tanh(ht)

    dbh = np.sum(dz, axis=0, keepdims=True)

    dWx = xt.T @ dz
    dWh = h_prev.T @ dz

    # Push the gradient back through the two matrix products of forward_cell.
    dh_t = dz @ Wh.T
    dxt = dz @ Wx.T

    return dWx, dWh, dbh, dh_t, dxt

In [17]:
def backward(x, y_true, y_pred, learn=True, lr=0.001):
    """Backpropagation through time over a whole sequence.

    Reads the hidden states from the module-level ``H`` that the most recent
    ``forward`` call stored, so ``forward`` must have been run on the same
    ``x`` beforehand; this function cannot detect a stale ``H``.

    Args:
        x: Input sequence; ``x.shape[0]`` is the number of time steps ``T``.
        y_true: Targets, sliced one row per step.
        y_pred: Predictions, sliced one row per step; each row's MSE gradient
            is added to a ``(1, HIDDEN_SIZE)`` array, so rows must broadcast
            against that shape.
        learn: When true, apply the gradients via ``update_params``.
        lr: Learning rate forwarded to ``update_params``.

    Returns:
        ``(dx, grads)`` where ``dx`` has the shape and dtype of ``x`` and
        ``grads`` is the tuple ``(dWx, dWh, dbh)`` averaged over the ``T``
        steps. For an empty sequence the gradients are all zeros.
    """
    T = x.shape[0]

    dWx = np.zeros_like(Wx)
    dWh = np.zeros_like(Wh)
    dbh = np.zeros_like(bh)

    # Nothing flows back into the last step from the future.
    dh_next = np.zeros((1, HIDDEN_SIZE))
    dx = np.zeros_like(x)

    for t in reversed(range(T)):
        # Total gradient at h_t: this step's loss term plus the recurrent path.
        dL = MSE(y_true[t:t+1], y_pred[t:t+1], derv=True) + dh_next
        # H[t+1] is the state produced at step t and H[t] is the state fed into it.
        dWxt, dWht, dbht, dh_next, dx[t] = backward_cell(dL, x[t:t+1], H[t+1:t+2], H[t:t+1])

        dWx += dWxt; dWh += dWht; dbh += dbht

    # Average over time steps. Skipped for an empty sequence: 0/0 would give
    # NaN gradients, which learn=True would then write into the weights.
    if T > 0:
        dWx /= T; dWh /= T; dbh /= T

    grads = dWx, dWh, dbh

    if learn:
        update_params(grads, lr)

    return dx, grads

In [18]:
# Smoke test of the backward pass on random inputs and targets; displays (dx, grads).
# NOTE(review): this reuses the global H left by the earlier forward() call, which
# was computed from a different random x, and learn=True applies the resulting
# update to the global params.
backward(
    x=np.random.randn(32, INPUT_SIZE),
    y_true=np.random.randn(32, HIDDEN_SIZE),
    y_pred=np.random.randn(32, HIDDEN_SIZE),
    learn=True,
    lr=0.001
)

(array([[  2.21836889],
        [ -7.61610215],
        [ 21.39992747],
        [-16.32424907],
        [  3.92139076],
        [  4.06652168],
        [ -3.69590651],
        [-12.96135489],
        [ 16.45571603],
        [  3.70971078],
        [ -2.91483322],
        [  1.59262533],
        [ -0.10275175],
        [  6.24988439],
        [  0.94776707],
        [ -5.25563092],
        [  2.23336114],
        [  3.40135861],
        [ -1.41230809],
        [ -0.66241665],
        [  8.38557864],
        [ 13.6989793 ],
        [ 10.68346322],
        [  0.8914597 ],
        [ -1.50276025],
        [ -5.92872839],
        [ 10.34404474],
        [  8.08176741],
        [  0.28259749],
        [  0.24358292],
        [ -7.97485956],
        [ -5.36405107]]),
 (array([[-0.35288612,  0.54202461, -0.96170264, -1.12584402, -0.15181505,
          -0.18445938, -0.30773252, -1.14915786,  0.88554284,  0.83684734,
           1.34289551, -0.86444255, -0.6586002 , -0.05894666,  0.60923191,
     