# RNN

In [225]:
from torch import nn
from torch.nn import functional as F
import math
import torch

### RNNのスクラッチ実装

##### • 入力層の次元数と，隠れ層の次元数を引数にとる
##### • 入力層と隠れ層の重みとバイアスをパラメータとして保持
##### • 初期値は一様分布 $U\left(-\frac{1}{\sqrt{\text{hidden\_size}}},\ \frac{1}{\sqrt{\text{hidden\_size}}}\right)$ に従う確率分布からランダムサンプル
##### • forwardメソッドに順伝搬を行う処理を記述
##### • input: [batch_size, seq_len, input_size]およびh_0:[1, batch_size, hidden_size]を引数にする
##### • 全stepの隠れ状態[batch_size, seq_len, hidden_size]および最後の
##### stepの隠れ状態[1, batch_size, hidden_size]を戻り値として返す
##### • 出力層は実装不要
##### • forwardメソッドの出力をnn.Linearに入力し最終的な予測値を計算する想定

In [237]:
class MyRNN:
    """Single-layer tanh RNN written from scratch on plain tensors (forward only).

    Holds an input-to-hidden weight/bias pair (``W_in``, ``b_in``) and a
    hidden-to-hidden pair (``W_h``, ``b_h``), all sampled from
    U(-1/sqrt(hidden_size), 1/sqrt(hidden_size)).
    """

    def __init__(self, input_size, hidden_size):
        """Sample the four parameter tensors.

        Args:
            input_size: number of features per time step.
            hidden_size: number of hidden units.
        """
        self.hidden_size = hidden_size
        bound = 1.0 / math.sqrt(hidden_size)

        def sample_uniform(*shape):
            # Fresh tensor filled in place from U(-bound, bound).
            return torch.empty(*shape).uniform_(-bound, bound)

        # Weights are drawn first, then the biases.
        self.W_in = sample_uniform(hidden_size, input_size)
        self.W_h = sample_uniform(hidden_size, hidden_size)

        self.b_in = sample_uniform(hidden_size)
        self.b_h = sample_uniform(hidden_size)

    def forward(self, input, h_0=None):
        """Run the recurrence over every time step.

        Args:
            input: tensor of shape [batch_size, seq_len, input_size].
            h_0: optional initial hidden state of shape
                [1, batch_size, hidden_size]; zeros when omitted.

        Returns:
            (output_seq, h_n): hidden states for all steps,
            [batch_size, seq_len, hidden_size], and the last hidden state,
            [1, batch_size, hidden_size].
        """
        batch_size, num_steps, _ = input.size()

        if h_0 is None:
            h_0 = torch.zeros(1, batch_size, self.hidden_size)

        step_states = []
        hidden = h_0  # [1, batch_size, hidden_size] before the first step
        for step in range(num_steps):
            # input[:, step] : [batch_size, input_size]
            input_term = input[:, step] @ self.W_in.T + self.b_in
            recurrent_term = hidden.squeeze(0) @ self.W_h.T
            hidden = torch.tanh(input_term + recurrent_term + self.b_h)  # [batch_size, hidden_size]

            # Add a time axis so the per-step states can be concatenated.
            step_states.append(hidden.unsqueeze(1))  # [batch_size, 1, hidden_size]

        # Every step's hidden state, one per token position.
        output_seq = torch.cat(step_states, dim=1)  # [batch_size, seq_len, hidden_size]
        # Final hidden state with a leading axis of size 1.
        h_n = hidden.unsqueeze(0)  # [1, batch_size, hidden_size]

        return output_seq, h_n

#### 補足

In [238]:
# Uniform-distribution initialisation demo
input_size = 5
hidden_size = 3
init_range = 1.0 / math.sqrt(hidden_size)

# Allocate the weight, fill it once with uniform_()'s default range,
# then re-sample it in place from U(-init_range, init_range).
W_in = torch.empty(hidden_size, input_size)
W_in.uniform_()
W_in.uniform_(-init_range, init_range)

tensor([[-0.5507, -0.3276, -0.2726, -0.0728, -0.0731],
        [ 0.5599,  0.5573, -0.0765,  0.3836,  0.4200],
        [-0.1853, -0.3739,  0.0751, -0.2020,  0.4832]])

In [239]:
# Build the default initial hidden state h_0: all zeros, [1, batch_size, hidden_size]
batch_size, hidden_size = 8, 3
h_0 = torch.zeros((1, batch_size, hidden_size))
h_0

tensor([[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]])

In [240]:
# Check how indexing slices a 3-D tensor
a = torch.randn(2, 3, 5)
# Show the full tensor, then a[:, 0, 0], then a[:, 0], separated by blank lines.
print(a, a[:, 0, 0], a[:, 0], sep="\n\n")

tensor([[[ 7.5992e-01, -7.1731e-01, -1.1518e+00,  1.3113e+00, -1.7666e+00],
         [ 6.3005e-01,  3.0492e-01, -1.3523e+00,  2.9592e-01,  5.1149e-01],
         [ 1.5216e+00, -1.7167e+00,  8.2104e-01,  5.2866e-01,  6.0376e-01]],

        [[-1.3030e-01, -1.0487e+00,  1.2462e+00,  2.0516e-04,  1.1060e+00],
         [ 5.7418e-01,  1.7675e+00,  5.5719e-01,  8.6913e-01,  1.0264e+00],
         [ 5.9024e-02,  1.6412e+00,  2.1535e+00,  7.0747e-01,  1.0058e+00]]])

tensor([ 0.7599, -0.1303])

tensor([[ 7.5992e-01, -7.1731e-01, -1.1518e+00,  1.3113e+00, -1.7666e+00],
        [-1.3030e-01, -1.0487e+00,  1.2462e+00,  2.0516e-04,  1.1060e+00]])


### テスト

In [241]:
# Shape check for the scratch RNN
input_size, hidden_size = 10, 3
batch_size, seq_len = 8, 5

input_tensor = torch.randn(batch_size, seq_len, input_size)
rnn = MyRNN(input_size, hidden_size)

# Without an explicit h_0 the RNN starts from a zero hidden state.
output_seq, h_n = rnn.forward(input_tensor)
print(output_seq.shape, h_n.shape)

torch.Size([8, 5, 3]) torch.Size([1, 8, 3])


### MyRNNモデル

In [242]:
class MyRNNModel:
    """Many-to-one model: scratch ``MyRNN`` followed by a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the recurrent layer and the ``nn.Linear`` head."""
        self.rnn = MyRNN(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Feed the last hidden state of the RNN through the linear head.

        Returns:
            Tensor of shape [batch_size, output_size].
        """
        # Only the final hidden state is used: [1, batch_size, hidden_size].
        _, last_hidden = self.rnn.forward(x)
        # Drop the leading axis before the linear layer: [batch_size, hidden_size].
        return self.fc(last_hidden.squeeze(0))

In [243]:
# Run the scratch model once and inspect the prediction shape: [batch_size, output_size]
output_size = 2
model = MyRNNModel(input_size, hidden_size, output_size)
out = model.forward(input_tensor)
out.size()

torch.Size([8, 2])

### nn.RNN 

In [244]:
class RNNModel(nn.Module):
    """Many-to-one classifier built on ``nn.RNN`` (batch-first) plus a linear head.

    The head is applied to the hidden state of the last time step. For a
    many-to-many task such as NER, the head could instead be applied to the
    whole ``[batch_size, seq_len, hidden_size]`` sequence, because
    ``nn.Linear`` only acts on the last dimension.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the recurrent layer and the output layer."""
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, input_size] to [batch_size, output_size]."""
        # hidden_states: [batch_size, seq_len, hidden_size]
        # The second return value (final hidden state, [1, batch_size, hidden_size]) is unused.
        hidden_states, _ = self.rnn(x)
        # Select the last time step: [batch_size, hidden_size].
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

In [245]:
# Run the nn.RNN-based model once and inspect the prediction shape
output_size = 2
model = RNNModel(input_size=input_size, hidden_size=hidden_size, output_size=output_size)
out = model(input_tensor)
out.size()

torch.Size([8, 2])

In [246]:
# Same check with the sizes restated in this cell (output_size and
# input_tensor still come from earlier cells).
input_size, hidden_size = 10, 3
batch_size, seq_len = 8, 5
model = RNNModel(input_size, hidden_size, output_size)
out = model(input_tensor)
out.shape

torch.Size([8, 2])

In [247]:
# List every learnable parameter of the model together with its shape
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

rnn.weight_ih_l0: torch.Size([3, 10])
rnn.weight_hh_l0: torch.Size([3, 3])
rnn.bias_ih_l0: torch.Size([3])
rnn.bias_hh_l0: torch.Size([3])
fc.weight: torch.Size([2, 3])
fc.bias: torch.Size([2])


#### ↑ nn.Linear は最後の次元に対してのみ作用する。つまり、入力テンソル [batch_size, seq_len, hidden_size] のうち、2番目の軸（0から数えると第1軸 = seq_len）には何も作用しない

### RNN back propagation

In [248]:
class MyRNN:
    """Scratch single-layer tanh RNN with a hand-written backward pass (BPTT).

    The four parameters are leaf tensors with ``requires_grad=True`` so the
    manually computed gradients can be compared against autograd.
    """

    def __init__(self, input_size, hidden_size):
        """Sample all parameters from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size)).

        Args:
            input_size: number of features per time step.
            hidden_size: number of hidden units.
        """
        self.hidden_size = hidden_size
        init_range = 1.0 / math.sqrt(hidden_size)
        self.W_in = torch.empty(hidden_size, input_size).uniform_(-init_range, init_range).requires_grad_(True)
        self.W_h = torch.empty(hidden_size, hidden_size).uniform_(-init_range, init_range).requires_grad_(True)

        self.b_in = torch.empty(hidden_size).uniform_(-init_range, init_range).requires_grad_(True)
        self.b_h = torch.empty(hidden_size).uniform_(-init_range, init_range).requires_grad_(True)

    def forward(self, input, h_0=None):
        """Run the recurrence and cache what ``backward`` needs.

        Args:
            input: tensor of shape [batch_size, seq_len, input_size].
            h_0: optional initial hidden state, [1, batch_size, hidden_size];
                zeros (on the input's device and dtype) when omitted.

        Returns:
            (output_seq, h_n): all hidden states,
            [batch_size, seq_len, hidden_size], and the last hidden state,
            [1, batch_size, hidden_size].
        """
        self.input = input
        batch_size, self.seq_len, _ = input.size()

        if h_0 is None:
            h_0 = torch.zeros(
                1, batch_size, self.hidden_size, dtype=input.dtype, device=input.device
            )
        self.h_0 = h_0

        outputs = []
        # Work with a [batch_size, hidden_size] state. reshape (not squeeze)
        # keeps the batch axis even when batch_size == 1.
        h = self.h_0.reshape(batch_size, self.hidden_size)
        # Use the length of this input, not a same-named global variable.
        for i in range(self.seq_len):
            # input[:, i] : [batch_size, input_size]
            h = torch.tanh(input[:, i] @ self.W_in.T + self.b_in + h @ self.W_h.T + self.b_h)  # [batch_size, hidden_size]

            outputs.append(h.unsqueeze(1))  # [batch_size, 1, hidden_size]
        # Hidden state of every step, one per token position.
        self.output_seq = torch.cat(outputs, dim=1)  # [batch_size, seq_len, hidden_size]
        h_n = h.unsqueeze(0)  # [1, batch_size, hidden_size]

        return self.output_seq, h_n

    def backward(self, out_grad):
        """Back-propagate through time by hand for a many-to-one loss.

        Must be called after ``forward``. The loss is assumed to depend on
        the RNN only through the last hidden state.

        Args:
            out_grad: gradient of the loss w.r.t. the last hidden state,
                shaped [1, batch_size, hidden_size] or
                [batch_size, hidden_size].

        Side effects:
            Fills ``grad_W_in_list``, ``grad_W_h_list``, ``grad_b_in_list``,
            ``grad_b_h_list``, ``grad_h_list`` and ``grad_h_tanh_list`` with
            one snapshot per processed step (last step first). The parameter
            gradient lists hold running sums, so their last entry is the
            total gradient.
        """
        self.grad_W_in_list = []
        self.grad_W_h_list = []
        self.grad_b_in_list = []
        self.grad_b_h_list = []

        self.grad_h_list = []
        self.grad_h_tanh_list = []

        # The manual gradients are plain numbers; keep autograd from recording
        # these operations on the parameters and cached activations.
        with torch.no_grad():
            batch_size = self.output_seq.size(0)
            h_init = self.h_0.reshape(batch_size, self.hidden_size)

            # Gradient accumulators.
            grad_W_in = torch.zeros_like(self.W_in)
            grad_W_h = torch.zeros_like(self.W_h)
            grad_b_in = torch.zeros_like(self.b_in)
            grad_b_h = torch.zeros_like(self.b_h)

            # Gradient w.r.t. each step's hidden state; only the last step
            # receives a gradient from outside.
            grad_output_seq = torch.zeros_like(self.output_seq)  # [b, seq_len, hidden_size]
            grad_output_seq[:, -1, :] = out_grad.reshape(batch_size, self.hidden_size)

            for i in reversed(range(self.seq_len)):
                # Hidden state that fed step i: the previous output, or h_0 at i == 0.
                h_prev = self.output_seq[:, i - 1] if i != 0 else h_init

                # Through tanh: dL/da_i = dL/dh_i * (1 - h_i^2)
                grad_h_tanh = grad_output_seq[:, i] * (1 - self.output_seq[:, i].pow(2))

                # Per-sample outer products, summed over the batch.
                grad_W_in += torch.sum(grad_h_tanh.unsqueeze(2) * self.input[:, i].unsqueeze(1), dim=0)
                # h_prev keeps its batch axis, so this is a true
                # [b, hidden, hidden] outer product even for batch_size == 1.
                grad_W_h += torch.sum(grad_h_tanh.unsqueeze(2) * h_prev.unsqueeze(1), dim=0)
                grad_b_in += torch.sum(grad_h_tanh, dim=0)
                grad_b_h += torch.sum(grad_h_tanh, dim=0)

                # Gradient flowing to the previous hidden state.
                grad_h = grad_h_tanh @ self.W_h
                if i != 0:
                    grad_output_seq[:, i - 1] = grad_h

                # Snapshot the running values; clone so later in-place
                # updates do not alter stored entries.
                self.grad_W_in_list.append(grad_W_in.clone())
                self.grad_W_h_list.append(grad_W_h.clone())
                self.grad_b_in_list.append(grad_b_in.clone())
                self.grad_b_h_list.append(grad_b_h.clone())
                self.grad_h_list.append(grad_h.clone())
                self.grad_h_tanh_list.append(grad_h_tanh.clone())

class MyRNNModel:
    """Many-to-one model: scratch ``MyRNN`` plus a linear head.

    ``forward`` stores the RNN outputs on the instance so they can be
    inspected afterwards (e.g. to take gradients w.r.t. ``h_n``).
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the recurrent layer and the ``nn.Linear`` head."""
        self.rnn = MyRNN(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return predictions of shape [batch_size, output_size] from the last hidden state."""
        rnn_result = self.rnn.forward(x)
        # Keep both RNN outputs: all hidden states and the last one, [1, b, h_size].
        self.output_seq, self.h_n = rnn_result
        # Drop the leading axis before the linear layer: [b, h_size] -> [b, out].
        return self.fc(self.h_n.squeeze(0))

### テスト

In [249]:
# Sizes for a tiny single-sample problem
input_size, hidden_size, output_size = 3, 2, 2
batch_size, seq_len = 1, 5

# Ground-truth class label
target = torch.tensor([0])

input_tensor = torch.randn(batch_size, seq_len, input_size)
# Build the model
model = MyRNNModel(input_size, hidden_size, output_size)

# Forward pass and loss
output = model.forward(input_tensor)
criterion = nn.CrossEntropyLoss()
loss = criterion(output, target)

# Gradient of the loss w.r.t. the last hidden state (through the output
# layer). retain_graph=True keeps the graph for loss.backward() below.
(out_grad,) = torch.autograd.grad(loss, model.h_n, retain_graph=True)

# Hand-written backward pass
model.rnn.backward(out_grad)

# Autograd reference
loss.backward()


In [250]:
# Manually computed W_in gradient: the running sum saved after the final backward step (i = 0)
model.rnn.grad_W_in_list[-1]

tensor([[-0.0065,  0.0967, -0.0774],
        [-0.0359, -0.0586,  0.0447]], grad_fn=<CloneBackward0>)

In [251]:
# W_in gradient populated by loss.backward(); compare with model.rnn.grad_W_in_list[-1]
model.rnn.W_in.grad

tensor([[-0.0065,  0.0967, -0.0774],
        [-0.0359, -0.0586,  0.0447]])

#### 補足

In [224]:
# reversed(range(n)) counts down from n - 1 to 0
for i in reversed(range(5)):
    print(i)

# Batched outer product
a = torch.tensor([[1, 2], [3, 4], [5, 6]])
b = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

print(a)

print(b)

# Reference: one outer product per (row of a, row of b) pair.
# torch.outer replaces torch.ger, which is a deprecated alias.
outer_product_list = [torch.outer(a_row, b_row) for a_row, b_row in zip(a, b)]

print(outer_product_list )

# Same values in one broadcasted multiply: [3, 2, 1] * [3, 1, 3] -> [3, 2, 3]
print(a.unsqueeze(2) * b.unsqueeze(1))

4
3
2
1
0
tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])
[tensor([[1, 2, 3],
        [2, 4, 6]]), tensor([[12, 15, 18],
        [16, 20, 24]]), tensor([[35, 40, 45],
        [42, 48, 54]])]
tensor([[[ 1,  2,  3],
         [ 2,  4,  6]],

        [[12, 15, 18],
         [16, 20, 24]],

        [[35, 40, 45],
         [42, 48, 54]]])
