In [6]:
import torch
import torch.nn as nn

torch.manual_seed(45)
# 默认batch_size为1，只需要后续进行升维就可以unsqueeze()
input_size, sequence_length = 6, 4 # input_szie 特征大小为6  sequence_length 词元序列长度为4
input = torch.randn(sequence_length, input_size).unsqueeze(0) # 设置单词的长度为4，特征向量的维度是6，并且提升第0维度
# 此时的input_size的维度为：N*L*d 对应batch_first = True
input.shape  #N*L*d

torch.Size([1, 4, 6])

In [44]:
# 初始化隐藏层
# h_0 为初始时刻的隐状态。当RNN为单向RNN时，h_0 的形状应为 num_layers × N × h ；
# 当RNN为双向RNN时，h_0 的形状应为 ( 2 ⋅ num_layers ) × N × h。如不提供该参数的值，则默认为全0张量。
hidden_size = 3 
h_prev = torch.zeros(hidden_size).unsqueeze(0)  # 这里初始化的就是单层，h层数，N样batch_size
h_prev

tensor([[0., 0., 0.]])

In [16]:
# 调用pytorch的RNN api来验证结果
rnn = nn.RNN(input_size, hidden_size, batch_first=True) # 传入两个参数：特征大小input_size和隐藏层数hidden_size
rnn_output, state_final = rnn(input, h_prev)  # 返回两个结果，一个是output，另一个是最后时刻的h的值
print("PyTorch API output:")
print(rnn_output)
print(state_final)

(tensor([[[ 0.8493, -0.5992, -0.2526],
          [-0.6106, -0.1691,  0.5180],
          [ 0.6820,  0.9631,  0.0999],
          [ 0.0734, -0.5295,  0.9324]]], grad_fn=<TransposeBackward1>),
 tensor([[[ 0.0734, -0.5295,  0.9324]]], grad_fn=<StackBackward0>))

In [67]:
# 手写RNN_forward函数，实现RNN的计算原理（单向单层）
def rnn_forward(input, weight_ih, bias_ih, weight_hh, bias_hh, h_prev):
    # input的shape对应三个参数：batch_size, sequence_length, input_size
    batch_size, sequence_length, input_size = input.shape
    # h_dim代表隐藏层的维度，也就是隐藏层的层数 [0]是因为矩阵相乘第一个维度作为不抵消的维度
    h_dim = weight_ih.shape[0]  
    # 初始化一个状态（输出）矩阵
    h_out = torch.zeros(batch_size, sequence_length, h_dim) # h_dim就是hidden_size
    
    for t in range(sequence_length):
        x = input[:, t, :]  # 获取当前时刻的输入（每一个时刻对应的是一个词元向量）
        # x 的默认大小应该是 batch_size * input_size
        # weight_ih 的默认大小应该是 h_dim * input_size，所以要进行weight_ih的扩充，变成 batch_size*h_dim*input_size
        weight_ih_batch = weight_ih.unsqueeze(0).tile(batch_size, 1, 1)
        # weight_hh_batch 的大小应该是 batch_size*h_dim*h_dim
        weight_hh_batch = weight_hh.unsqueeze(0).tile(batch_size, 1, 1)
        # torch.bmm() 带有batch_size的矩阵相乘
        w_times_x = torch.bmm(weight_ih_batch, x.unsqueeze(2))  #此时的x应该是 batch_size*input_size*1
        #w_times_x 的最后的结果应该是batch_size*h_dim*1，然后可以squeeze(-1)掉
        w_times_x = w_times_x.squeeze(-1) # batch_size*h_dim
        
        # h_prve最开始是batch_size*hidden_size，现在需要把它扩充到batch_size*hidden_size*1
        w_times_h = torch.bmm(weight_hh_batch, h_prev.unsqueeze(2)).squeeze(-1)  # batch_size*h_dim 
        # 最终的计算并且不断更新
        h_prev = torch.tanh(w_times_x + bias_ih + w_times_h + bias_hh)
        h_out[:, t, :] = h_prev
        
    return h_out, h_prev.unsqueeze(0)

In [38]:
# 验证rnn_forward函数的准确性
for k,v in rnn.named_parameters():
    print(k, v)

weight_ih_l0 Parameter containing:
tensor([[-0.4625,  0.5091,  0.5695, -0.1479,  0.2251, -0.5359],
        [ 0.1143, -0.0672, -0.1061, -0.5320, -0.1582, -0.4026],
        [ 0.5427, -0.3619, -0.1699,  0.1750,  0.2764, -0.1850]],
       requires_grad=True)
weight_hh_l0 Parameter containing:
tensor([[ 0.1400,  0.3478,  0.3506],
        [-0.4183, -0.2382, -0.5081],
        [ 0.2111,  0.0751, -0.1017]], requires_grad=True)
bias_ih_l0 Parameter containing:
tensor([-0.3031,  0.4574,  0.5712], requires_grad=True)
bias_hh_l0 Parameter containing:
tensor([ 0.3753, -0.0053, -0.3025], requires_grad=True)


In [52]:
custom_rnn_output, custom_state_fina = rnn_forward(input, rnn.weight_ih_l0, rnn.bias_ih_l0, rnn.weight_hh_l0, rnn.bias_hh_l0, h_prev)
print("rnn_forward output:")
print(custom_rnn_output)
print(custom_state_fina)

rnn_forward output:
tensor([[[ 0.8493, -0.5992, -0.2526],
         [-0.6106, -0.1691,  0.5180],
         [ 0.6820,  0.9631,  0.0999],
         [ 0.0734, -0.5295,  0.9324]]], grad_fn=<CopySlices>)
tensor([[[ 0.0734, -0.5295,  0.9324]]], grad_fn=<UnsqueezeBackward0>)


In [77]:
# bidirectional_rnn_forward 函数手写实现，实现双向RNN计算
def bi_rnn_forward(input, weight_ih, bias_ih, weight_hh, bias_hh, h_prev, \
                   weight_ih_reverse, weight_hh_reverse, bias_ih_reverse, bias_hh_reverse, h_prev_reverse):
    # input的shape对应三个参数：batch_size, sequence_length, input_size
    batch_size, sequence_length, input_size = input.shape
    # h_dim代表隐藏层的维度，也就是隐藏层的层数 [0]是因为矩阵相乘第一个维度作为不抵消的维度
    h_dim = weight_ih.shape[0]  
    # 初始化一个状态（输出）矩阵
    h_out = torch.zeros(batch_size, sequence_length, h_dim*2) # h_dim就是hidden_size，现在应该是双层结构，所以是两倍
    
    
    # 两层都可以调用我们自己写的rnn_forward() 
    forward_output = rnn_forward(input, weight_ih, bias_ih, weight_hh, bias_hh, h_prev)[0]  # forward_layer
    # backward_layer 的input应该是相较于forward反向过来，具体的反向是input中的sequence（词元）进行反向
    backward_output = rnn_forward(torch.flip(input, [1]), weight_ih_reverse, bias_ih_reverse, \
                                   weight_hh_reverse, bias_hh_reverse, h_prev_reverse)[0]
    
    h_out[:, :, :h_dim] = forward_output
    h_out[:, :, h_dim:] = torch.flip(backward_output, [1])  #这里需要重新反转才能和forward_layout进行拼接
    
    return h_out, h_out[:, -1, :].reshape((batch_size, 2, h_dim)).transpose(0, 1)

In [78]:
# 验证bi_rnn_forward() 函数的正确性
# 首先重新实例化一个RNN
bi_rnn = nn.RNN(input_size, hidden_size, batch_first=True, bidirectional=True) # 传入两个参数：特征大小input_size和隐藏层数hidden_size
batch_size = 1  # 样本大小目前给的就是1
h_prev = torch.zeros(2, batch_size, hidden_size)
bi_rnn_output, bi_state_final = bi_rnn(input, h_prev)
print("PyTorch API output:")
print(bi_rnn_output)
print(bi_state_final)

PyTorch API output:
tensor([[[-0.6087,  0.0795,  0.3462,  0.9711, -0.9796, -0.6105],
         [ 0.2244, -0.6353,  0.0294, -0.1929, -0.9441,  0.5532],
         [-0.7982, -0.6700, -0.9200, -0.1233,  0.6410, -0.3867],
         [ 0.4198,  0.8892,  0.4646,  0.0647, -0.9828, -0.9097]]],
       grad_fn=<TransposeBackward1>)
tensor([[[ 0.4198,  0.8892,  0.4646]],

        [[ 0.9711, -0.9796, -0.6105]]], grad_fn=<StackBackward0>)


In [79]:
for k,v in bi_rnn.named_parameters():
    print(k, v)

weight_ih_l0 Parameter containing:
tensor([[ 0.5296,  0.0321,  0.1165, -0.2557,  0.2418,  0.4144],
        [-0.5435,  0.1551, -0.3544,  0.3543,  0.4802, -0.4232],
        [ 0.4194,  0.3039, -0.3030,  0.2087,  0.3193,  0.2257]],
       requires_grad=True)
weight_hh_l0 Parameter containing:
tensor([[-0.5647, -0.2097, -0.4985],
        [-0.5652,  0.1587, -0.0429],
        [ 0.2237,  0.5574,  0.0811]], requires_grad=True)
bias_ih_l0 Parameter containing:
tensor([-0.0479, -0.0370,  0.2689], requires_grad=True)
bias_hh_l0 Parameter containing:
tensor([-0.4902,  0.0525, -0.2702], requires_grad=True)
weight_ih_l0_reverse Parameter containing:
tensor([[-0.3424,  0.4879,  0.3612,  0.3389,  0.0303, -0.0915],
        [-0.5010, -0.4015,  0.0525, -0.4627, -0.5156, -0.5086],
        [-0.3423, -0.5647,  0.5610, -0.3788,  0.1533,  0.3255]],
       requires_grad=True)
weight_hh_l0_reverse Parameter containing:
tensor([[-0.3206, -0.3444, -0.0685],
        [ 0.5382,  0.1234,  0.4068],
        [-0.0759,  0

In [80]:
custom_bi_rnn_output, custom_bi_state_final = bi_rnn_forward(input, bi_rnn.weight_ih_l0, bi_rnn.bias_ih_l0, \
                                                            bi_rnn.weight_hh_l0, bi_rnn.bias_hh_l0, h_prev[0], \
                                                            bi_rnn.weight_ih_l0_reverse, bi_rnn.weight_hh_l0_reverse, \
                                                            bi_rnn.bias_ih_l0_reverse, bi_rnn.bias_hh_l0_reverse, h_prev[1])
print("bi_rnn_forward output:")
print(custom_bi_rnn_output)
print(custom_bi_state_final)
print("PyTorch API output:")
print(bi_rnn_output)
print(bi_state_final)

bi_rnn_forward output:
tensor([[[-0.6087,  0.0795,  0.3462,  0.9711, -0.9796, -0.6105],
         [ 0.2244, -0.6353,  0.0294, -0.1929, -0.9441,  0.5532],
         [-0.7982, -0.6700, -0.9200, -0.1233,  0.6410, -0.3867],
         [ 0.4198,  0.8892,  0.4646,  0.0647, -0.9828, -0.9097]]],
       grad_fn=<CopySlices>)
tensor([[[ 0.4198,  0.8892,  0.4646]],

        [[ 0.0647, -0.9828, -0.9097]]], grad_fn=<TransposeBackward0>)
PyTorch API output:
tensor([[[-0.6087,  0.0795,  0.3462,  0.9711, -0.9796, -0.6105],
         [ 0.2244, -0.6353,  0.0294, -0.1929, -0.9441,  0.5532],
         [-0.7982, -0.6700, -0.9200, -0.1233,  0.6410, -0.3867],
         [ 0.4198,  0.8892,  0.4646,  0.0647, -0.9828, -0.9097]]],
       grad_fn=<TransposeBackward1>)
tensor([[[ 0.4198,  0.8892,  0.4646]],

        [[ 0.9711, -0.9796, -0.6105]]], grad_fn=<StackBackward0>)
