# 背景
给定一个长句子预测下一个单词

# 导包

In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

# Alias for the CPU float32 tensor type.
dtype = torch.FloatTensor
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 准备数据

In [2]:
# Toy corpus: the two adjacent string literals are concatenated into a single
# sentence.
sentence = (
    'GitHub Actions makes it easy to automate all your software workflows '
    'from continuous integration and delivery to issue triage and more'
)

# Build the vocabulary once, in sorted order, so that word2idx and idx2word
# are guaranteed to be inverse mappings and the word ids are reproducible
# across runs (the iteration order of a set of strings changes with hash
# randomization).
vocab = sorted(set(sentence.split()))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = dict(enumerate(vocab))
n_class = len(word2idx)  # classification problem: one class per distinct word
max_len = len(sentence.split())  # number of words in the sentence
n_hidden = 5
batch_size = 3

# 数据预处理

In [3]:
def make_data(sentence):
    """Build next-word training pairs from every prefix of ``sentence``.

    For each position ``i`` the input is the first ``i + 1`` words, one-hot
    encoded and right-padded to ``max_len`` time steps; the target is the id
    of word ``i + 1``.

    Relies on the module-level ``word2idx``, ``n_class`` and ``max_len``.

    Args:
        sentence: Whitespace-separated text whose words are all keys of
            ``word2idx``.

    Returns:
        A tuple ``(inputs, targets)``: a float32 tensor of shape
        ``[max_len - 1, max_len, n_class]`` and a long tensor of shape
        ``[max_len - 1]``.
    """
    words = sentence.split()
    input_batch = []
    target_batch = []

    for i in range(max_len - 1):
        prefix_ids = [word2idx[w] for w in words[:i + 1]]
        # Padded time steps are left as all-zero rows. Padding with id 0 and
        # then one-hot encoding would make them identical to the real word
        # whose id is 0.
        one_hot = np.zeros((max_len, n_class), dtype=np.float32)
        one_hot[np.arange(len(prefix_ids)), prefix_ids] = 1.0
        input_batch.append(one_hot)
        target_batch.append(word2idx[words[i + 1]])

    # Stack into a single ndarray before converting: building a tensor
    # directly from a list of ndarrays is very slow. The reshape keeps a
    # well-formed (0, max_len, n_class) result when there are no samples.
    stacked = np.array(input_batch, dtype=np.float32).reshape(-1, max_len, n_class)
    return torch.from_numpy(stacked), torch.LongTensor(target_batch)

# Materialise the training pairs.
# input_batch: [max_len - 1, max_len, n_class]
input_batch, target_batch = make_data(sentence)
dataset = Data.TensorDataset(input_batch, target_batch)
# Mini-batches of `batch_size` samples, reshuffled at every epoch.
loader = Data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# 定义网络架构
## 构造
- input_size – 输入数据的大小，也就是前面例子中每个单词向量的长度
- hidden_size – 隐藏层的大小（即隐藏层节点数量），输出向量的维度等于隐藏节点数
- num_layers – recurrent layer的数量，默认等于1。
- bias – 网络是否设置偏置，默认是True.
- batch_first – 默认为False，也就是说默认情况下batch并不放在第一维，这一点和CNN有些不同，此时输入输出各个维度的含义为 (seq_length, batch, feature)。当然，如果你想和CNN一样把batch放在第一维，可将该参数设置为True。
- dropout – 如果非0，就在除了最后一层的其它层都插入Dropout层，默认为0。
- bidirectional – If True, becomes a bidirectional LSTM. Default: False

## 输入
input, (h_0,c_0)
- input: 输入数据，即上面例子中的一个句子（或者一个batch的句子），其维度形状为 (seq_len, batch, input_size)
  - seq_len: 句子长度，即单词数量，同一个batch内需要保持一致。假如某个句子只有2个单词，但是要求输入10个单词，可以先用torch.nn.utils.rnn.pad_sequence()把句子填充到相同长度，再用torch.nn.utils.rnn.pack_padded_sequence()打包（或者直接用torch.nn.utils.rnn.pack_sequence()对不等长的句子打包），这样LSTM就会跳过填充的部分。
  - batch：就是你一次传入的句子的数量
  - input_size: 每个单词向量的长度，这个必须和你前面定义的网络结构保持一致
- h_0：维度形状为 (num_layers * num_directions, batch, hidden_size):
  - 结合下图应该比较好理解第一个参数的含义num_layers * num_directions， 即LSTM的层数乘以方向数量。这个方向数量是由前面介绍的bidirectional决定，如果为False,则等于1；反之等于2。
  - batch：同上
  - hidden_size: 隐藏层节点数
- c_0： 维度形状为 (num_layers * num_directions, batch, hidden_size),各参数含义和h_0类似。

当然，如果你没有传入(h_0, c_0)，那么这两个参数会默认设置为0。

## 输出
output, (h_n,c_n)
- output： 维度和输入数据类似，只不过最后的feature部分会有点不同，即 (seq_len, batch, num_directions * hidden_size)。这个输出tensor包含了LSTM模型最后一层每个time step的输出特征。另外，如果前面输入的是torch.nn.utils.rnn.PackedSequence，那么输出也会相应地变成packed sequence。对于unpacked的情况，可以用output.view(seq_len, batch, num_directions, hidden_size)把两个方向分离开，其中前向和后向分别用0和1表示；packed的情况下也可以类似地把方向分离开。
- h_n：(num_layers * num_directions, batch, hidden_size)，
只会输出最后一个time step的隐状态结果
- c_n ：(num_layers * num_directions, batch, hidden_size)，只会输出最后一个time step的cell状态结果（如下图所示）。


In [4]:
class BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM followed by a linear classifier.

    Args:
        vocab_size: Length of each one-hot input vector and number of output
            classes. Defaults to the module-level ``n_class``.
        hidden_size: Hidden units per direction. Defaults to the module-level
            ``n_hidden``.
    """

    def __init__(self, vocab_size=None, hidden_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = n_class
        if hidden_size is None:
            hidden_size = n_hidden
        self.lstm = nn.LSTM(input_size=vocab_size, hidden_size=hidden_size, bidirectional=True)
        # The two directions are concatenated on the feature axis, hence * 2.
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, X):
        """Return next-word logits for a batch of one-hot encoded sequences.

        Args:
            X: Tensor of shape ``[batch_size, max_len, n_class]``.

        Returns:
            Tensor of shape ``[batch_size, n_class]`` with unnormalised scores.
        """
        # nn.LSTM is built with the default batch_first=False, so the time
        # axis has to come first.
        seq_first = X.transpose(0, 1)  # [max_len, batch_size, n_class]

        # No initial state is passed, so nn.LSTM starts from zero (h_0, c_0).
        # Drawing fresh random states on every call made the output
        # non-deterministic, including at inference time.
        outputs, _ = self.lstm(seq_first)
        # Features of the last time step.
        # NOTE(review): for the backward direction this step has only seen
        # the final input; confirm whether the final hidden states of both
        # directions would be the better feature.
        last_step = outputs[-1]  # [batch_size, n_hidden * 2]
        return self.fc(last_step)  # [batch_size, n_class]

# Build the network and move its parameters to the selected device
# (Module.to() works in place).
model = BiLSTM()
model.to(device)
# Cross-entropy loss over the per-class scores produced by the model.
criterion = nn.CrossEntropyLoss()
criterion.to(device)
# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# 训练

In [5]:
# Training
# Run 10000 epochs of mini-batch updates over the loader.
for epoch in range(10000):
    # The loss of every batch is reported, but only on each 2500th epoch.
    log_this_epoch = (epoch + 1) % 2500 == 0
    for x, y in loader:
        pred = model(x.to(device))
        loss = criterion(pred, y.to(device))
        if log_this_epoch:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        # Clear stale gradients, back-propagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch: 2500 cost = 0.821938
Epoch: 2500 cost = 0.222033
Epoch: 2500 cost = 1.067995
Epoch: 2500 cost = 1.481362
Epoch: 2500 cost = 0.987236
Epoch: 2500 cost = 0.218648
Epoch: 2500 cost = 1.267750
Epoch: 5000 cost = 0.742563
Epoch: 5000 cost = 0.165926
Epoch: 5000 cost = 0.082179
Epoch: 5000 cost = 0.102131
Epoch: 5000 cost = 0.344776
Epoch: 5000 cost = 0.170287
Epoch: 5000 cost = 0.237099
Epoch: 7500 cost = 0.010331
Epoch: 7500 cost = 0.248058
Epoch: 7500 cost = 0.016673
Epoch: 7500 cost = 0.021382
Epoch: 7500 cost = 0.018066
Epoch: 7500 cost = 0.255733
Epoch: 7500 cost = 0.020288
Epoch: 10000 cost = 0.004539
Epoch: 10000 cost = 0.233148
Epoch: 10000 cost = 0.001186
Epoch: 10000 cost = 0.241349
Epoch: 10000 cost = 0.003039
Epoch: 10000 cost = 0.005206
Epoch: 10000 cost = 0.001360


# 测试

In [6]:
# Pred
# Predict the next word for every training prefix.
# This is inference only, so run it under no_grad() rather than building an
# autograd graph and stripping it afterwards through the legacy `.data`.
with torch.no_grad():
    logits = model(input_batch.to(device))
# Index of the highest score per sample, shape [num_samples, 1].
predict = logits.max(1, keepdim=True)[1]
print(sentence)
# squeeze(1) drops only the kept dimension, so a single-sample batch is
# still iterable.
print([idx2word[n.item()] for n in predict.squeeze(1)])

GitHub Actions makes it easy to automate all your software workflows from continuous integration and delivery to issue triage and more
['Actions', 'makes', 'it', 'easy', 'to', 'automate', 'all', 'your', 'software', 'workflows', 'from', 'continuous', 'integration', 'and', 'delivery', 'to', 'issue', 'triage', 'and', 'more']
