# LSTM的 API调用实现和 手写LSTM实现

class torch.nn.LSTM()

### 公式

$$i_{t} = \sigma (W_{ii}x_{t}+b_{ii}+W_{hi}h_{t-1}+b_{hi})$$
$$f_{t} = \sigma (W_{if}x_{t}+b_{if}+W_{hf}h_{t-1}+b_{hf})$$
$$g_{t} = \tanh(W_{ig}x_{t}+b_{ig}+W_{hg}h_{t-1}+b_{hg})$$
$$o_{t} = \sigma (W_{io}x_{t}+b_{io}+W_{ho}h_{t-1}+b_{ho})$$
$$c_{t} = f_{t} \odot c_{t-1}+i_{t} \odot g_{t}$$
$$h_{t} = o_{t} \odot \tanh(c_{t})$$

> 其中直接相乘表示矩阵乘法，$\odot$ 表示 Hadamard 乘积（逐元素相乘）

- inputs: input, (h_0, c_0) 两个涉及到更新迭代的变量都需要传入初始值，并且是一个元组的形式
- outputs: output, (h_n, c_n)

$$N = \text{batch size}$$
$$L = \text{sequence length}$$
$$D = 2 \enspace \text{if} \enspace \text{bidirectional=True} \enspace \text{otherwise} \enspace 1$$
$$H_{in} = \text{input\_size}$$
$$H_{cell} = \text{hidden\_size}$$
$$H_{out} = \text{proj\_size} \enspace \text{if} \enspace \text{proj\_size}>0 \enspace \text{otherwise} \enspace \text{hidden\_size}$$


In [15]:
# 实现LSTM和LSTMP的源码

import torch
import torch.nn as nn
# Toy dimensions used throughout the demo:
#   batch_size      - number of sequences in the batch (N)
#   sequence_length - number of time steps per sequence (L)
#   input_size      - feature size of each input vector (H_in)
#   hidden_size     - size of the hidden / cell state (H_cell)
# proj_size is not used here (plain LSTM without projection).
batch_size = 2
sequence_length = 3
input_size = 4
hidden_size = 5

# Normally distributed input sequence laid out as (N, L, H_in).
input = torch.randn(batch_size, sequence_length, input_size)

# Initial cell and hidden state for a single-layer LSTM. They are plain
# tensors (no requires_grad), i.e. fixed starting values, not parameters.
c_0 = torch.randn(batch_size, hidden_size)
h_0 = torch.randn(batch_size, hidden_size)

# Reference implementation: the official PyTorch layer, batch-first layout.
lstm_layout = nn.LSTM(input_size, hidden_size, batch_first=True)

# h_0 and c_0 are (N, H) here, so a leading axis is added before they are
# handed to the layer as the initial state tuple.
output, (h_n, c_n) = lstm_layout(
    input,
    (h_0.unsqueeze(0), c_0.unsqueeze(0)),
)

print(output)
print((h_n, c_n))

tensor([[[ 0.0954, -0.3666, -0.7522,  0.5803, -0.2007],
         [ 0.3062, -0.1573, -0.4875,  0.3546, -0.0033],
         [ 0.2897, -0.1801, -0.3915,  0.3521,  0.0440]],

        [[ 0.2178,  0.2259, -0.3785,  0.0323, -0.0202],
         [-0.0134, -0.0893, -0.1599, -0.0622, -0.2748],
         [-0.0743, -0.0655, -0.2645, -0.1283, -0.0643]]],
       grad_fn=<TransposeBackward0>)
(tensor([[[ 0.2897, -0.1801, -0.3915,  0.3521,  0.0440],
         [-0.0743, -0.0655, -0.2645, -0.1283, -0.0643]]],
       grad_fn=<StackBackward0>), tensor([[[ 0.6266, -0.6600, -0.7663,  0.4914,  0.1441],
         [-0.2508, -0.1600, -0.3482, -0.1871, -0.1418]]],
       grad_fn=<StackBackward0>))


In [16]:
# Dump every parameter held by lstm_layout together with its name.
for param_name, param in lstm_layout.named_parameters():
    print(param_name, param)

weight_ih_l0 Parameter containing:
tensor([[-0.3608,  0.4223,  0.3240,  0.4396],
        [ 0.2628, -0.0082, -0.1540, -0.4255],
        [ 0.0675,  0.2024, -0.3961,  0.3381],
        [ 0.0352,  0.2174,  0.4350,  0.1006],
        [ 0.3904, -0.0922, -0.2534, -0.4359],
        [ 0.3478,  0.1939, -0.2691,  0.3185],
        [-0.1673,  0.2237, -0.0575, -0.3648],
        [-0.2703, -0.3033,  0.0513,  0.2277],
        [ 0.2730,  0.0737, -0.0360,  0.2822],
        [-0.0256,  0.2518,  0.2062, -0.4149],
        [ 0.2545, -0.2313,  0.4291,  0.3674],
        [ 0.0853,  0.3815,  0.4442,  0.4279],
        [ 0.3721, -0.0452,  0.1054, -0.1018],
        [ 0.0199, -0.2720, -0.3964, -0.1091],
        [-0.0961, -0.0362, -0.0221,  0.2594],
        [ 0.0199, -0.2054,  0.3756, -0.0886],
        [-0.4058,  0.0785, -0.0532, -0.0588],
        [ 0.3822,  0.3860, -0.4194,  0.1636],
        [ 0.3558, -0.2027, -0.0279,  0.3890],
        [ 0.2146,  0.3917, -0.1237, -0.3683]], requires_grad=True)
weight_hh_l0 Parameter c

### 直接看这四个参数对应的shape大小

> weight_ih_l0 对应的是和x矩阵相乘的四个参数，把他们合并到了一起来
> weight_hh_l0 对应的是和h矩阵相乘的四个参数
> 两个bias原理同上，都是合并到了一起来

- weight_ih 由四个门的参数拼接而成，隐藏层的神经元个数是5，所以四个参数合在一起第一维就是 4×5=20
- 矩阵乘法中权值矩阵与输入向量相乘：权值矩阵的第一维是输出的特征维数，第二维是输入的特征维数
- weight_ih 要和输入 x 相乘，所以第二维是 input_size=4，形状为 (20, 4)
- 同理 weight_hh 要和 h 相乘，所以第二维是 hidden_size=5，形状为 (20, 5)

In [13]:
# 手写LSTM模型
def lstm_forward(input, initial_states, w_ih, w_hh, b_ih, b_hh):
    """Run a single-layer, unidirectional LSTM over a batch-first sequence.

    Hand-written forward pass that uses packed parameters: the four gates
    are stacked along dim 0 of every weight/bias in the order input (i),
    forget (f), cell (g), output (o).

    Args:
        input: Tensor of shape (batch_size, sequence_length, input_size).
        initial_states: Tuple ``(h_0, c_0)``; each tensor has shape
            (batch_size, hidden_size).
        w_ih: Input-to-hidden weights, shape (4 * hidden_size, input_size).
        w_hh: Hidden-to-hidden weights, shape (4 * hidden_size, hidden_size).
        b_ih: Input-to-hidden bias, shape (4 * hidden_size,).
        b_hh: Hidden-to-hidden bias, shape (4 * hidden_size,).

    Returns:
        ``(output, (h_n, c_n))`` where ``output`` has shape
        (batch_size, sequence_length, hidden_size) and holds the hidden state
        of every time step, and ``h_n`` / ``c_n`` are the hidden and cell
        state after the last step, each (batch_size, hidden_size).
    """
    h_prev, c_prev = initial_states
    batch_size, sequence_length, _ = input.shape
    # The four gates are stacked along dim 0, so one gate spans a quarter.
    hidden_size = w_ih.shape[0] // 4

    # Allocate the result with the input's dtype/device so the per-step
    # assignment below works for non-default tensors as well.
    output = input.new_zeros(batch_size, sequence_length, hidden_size)

    # The weights are shared by every sample, so a plain matmul against the
    # transposed weights replaces per-batch tiling followed by bmm.
    w_ih_t = w_ih.t()
    w_hh_t = w_hh.t()

    for t in range(sequence_length):
        x_t = input[:, t, :]  # (batch_size, input_size)

        # Pre-activations of all four gates at once: (batch_size, 4 * hidden_size).
        # Both biases are added to the sum before any non-linearity.
        gates = x_t @ w_ih_t + b_ih + h_prev @ w_hh_t + b_hh
        i_gate, f_gate, g_gate, o_gate = gates.chunk(4, dim=-1)

        i_t = torch.sigmoid(i_gate)  # input gate
        f_t = torch.sigmoid(f_gate)  # forget gate
        g_t = torch.tanh(g_gate)     # candidate cell state
        o_t = torch.sigmoid(o_gate)  # output gate

        # c_t = f_t * c_{t-1} + i_t * g_t ;  h_t = o_t * tanh(c_t)
        c_prev = f_t * c_prev + i_t * g_t
        h_prev = o_t * torch.tanh(c_prev)

        output[:, t, :] = h_prev

    return output, (h_prev, c_prev)

In [None]:
# Correctness check: run the hand-written forward pass with the tensors
# held by lstm_layout.
custom_output, (custom_h, custom_c) = lstm_forward(
    input,
    (h_0, c_0),
    lstm_layout.weight_ih_l0,
    lstm_layout.weight_hh_l0,
    lstm_layout.bias_ih_l0,
    lstm_layout.bias_hh_l0,
)