## Vanilla RNN 만들어보기

- hidden_size : 10
- embedding_dim(input_dim) : 5
- output_size : 4

In [24]:
import numpy as np

def weight(hidden_size, embedding_dim, output_size):
    w_hh = np.random.randn(hidden_size, hidden_size)
    w_hx = np.random.randn(embedding_dim, hidden_size)
    w_hy = np.random.randn(hidden_size, output_size)

    return w_hh, w_hx, w_hy

def tanh(x):
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

def diff_tanh(x):
    return (1 - tanh(x))(1 + tanh(x))

def rnn_cell(x, h, vocab_size):
    hidden_size = h.shape[0]
    embedding_dim = x.shape[0]
    output_size = vocab_size

    w_hh, w_hx, w_hy = weight(hidden_size, embedding_dim, output_size)

    hidden_state = np.dot(h, w_hh) + np.dot(x, w_hx)
    hidden_state = tanh(hidden_state)

    output = np.dot(hidden_state, w_hy)

    return hidden_state, output

x = np.array([1,2,3,4,5])
x2 = np.array([2,3,4,5,6])
h0 = np.zeros(10)
vocab_size = 4

h1, output = rnn_cell(x, h0, vocab_size)
print('hidden state at time step 1 : \n',h1 , '\n현재 time step의 입력에 대한 출력 : \n',output)

h2, output = rnn_cell(x2, h1, vocab_size)
print('hidden state at time step 2 : \n',h1 , '\n현재 time step의 입력에 대한 출력 : \n',output)


hidden state at time step 1 : 
 [ 1.         -0.99999393  0.76528793  0.99902636 -0.99818615  0.99999702
  0.99999998 -1.         -0.99973928  0.99958881] 
현재 time step의 입력에 대한 출력 : 
 [-2.77720736 -1.45362631  2.14917648 -0.94816106]
hidden state at time step 2 : 
 [ 1.         -0.99999393  0.76528793  0.99902636 -0.99818615  0.99999702
  0.99999998 -1.         -0.99973928  0.99958881] 
현재 time step의 입력에 대한 출력 : 
 [ 1.727604   -0.61691011 -1.6298249  -3.17500999]


In [None]:
class RNN:
    def __init__(self, wx, wh, b):
        self.params = [wx, wh, b]
        self.grads = [np.zeros_like(wx), np.zeros_like(wh), np.zeros_like(b)]
        self.cache = None # 역전파에 사용할 중간 데이터

    def forward(self, x, h_prev):
        wx, wh, b = self.params
        t = np.matmul(h_prev, wh) + np.matmul(x, wx) + b
        h_next = np.tanh(t)

        self.cache = (x, h_prev, h_next)
        return h_next
    
    def backward(self, dh_next):
        wx, wh, b = self.params
        x, h_prev, h_next = self.cache

        dt = dh_next * (1 - h_next ** 2) # tanh 미분
        db = np.sum(dt, axis=0)
        dwh = np.dot(h_prev.T, dt) # shape : (H, N) x (N, H) = (H, H)
        dh_prev = np.dot(dt, wh.T) # shape : (N, H) x (H, H) = (N, H)
        dwx = np.dot(x.T, dt) # shape : (D, N) x (N, H) = (D, H)
        dx = np.dot(dt, wx.T) # shape : (N, H) x (H, D) = (N, D)

        self.grads[0][...] = dwx
        self.grads[1][...] = dwh
        self.grads[2][...] = db

        return dx, dh_prev
    

class TimeRNN:
    def __init__(self, wx, wh, b, stateful=False):
        self.params = [wx, wh, b]
        self.grads = [np.zeros_like(wx), np.zeros_like(wh), np.zeros_like(b)]
        self.layers = None # RNN 계층을 리스트로 저장

        self.h, self.dh = None, None
        self.stateful = stateful
    
    def set_state(self, h):
        # hidden state(h)를 설정하는 메서드
        self.h = h
    
    def reset_state(self):
        # hidden state(h)를 초기화하는 메서드
        self.h = None
    
    def forward(self, xs):
        wx, wh, b = self.params
        N, T, D = xs.shape # N(batch), T(time steps == sequence length), D(input size == embedding size)
        D, H = wx.shape

        self.layers=[]
        hs = np.empty((N, T, H), dtype='f')

        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        
        for t in range(T):
            layer = RNN(*self.params)
            self.h = layer.forward(xs[:, t, :], self.h)
            hs[:, t, :] = self.h
            self.layers.append(layer)
        
        return hs
    
    def backward(self, dhs):
        wx, wh, b = self.params
        N, T, H = dhs.shape
        D, H = wx.shape

        dxs = np.empty((N, T, D), dtype='f')
        dh = 0
        grads = [0,0,0]

        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh = layer.backward(dhs[:, t, :] + dh) # 합산된 기울기
            dxs[:, t, :] = dx

            for i, grad in enumerate(layer.grads):
                grads[i] += grad
        
        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh
    
        return dxs