## 第六章 循环神经网络

### 1.本节的IMDB电影评论文本分类任务中，默认使用了单层的Paddle LSTM模型，请尝试使用叠加多层进行训练，观察其效果并与单层LSTM进行对比。<span style="color:red">(必修题)</span>

In [1]:
import os
# 加载数据集
def load_imdb_data(path):
    """Load the IMDB train/dev/test splits stored under ``path``.

    ``train.txt``, ``dev.txt`` and ``test.txt`` are read line by line. Each
    line is stripped, lower-cased and split once on the first tab into
    ``label<TAB>sentence``.

    Args:
        path: Directory containing the three split files.

    Returns:
        A ``(trainset, devset, testset)`` tuple. Each element is a list of
        ``(sentence, label)`` string pairs in file order.

    Raises:
        FileNotFoundError: If ``path`` or one of the split files is missing.
        ValueError: If a line does not contain a tab separator.
    """
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if not os.path.exists(path):
        raise FileNotFoundError(f"dataset directory not found: {path}")

    def read_split(file_name):
        # Parse one split file into a list of (sentence, label) pairs.
        examples = []
        # Explicit encoding keeps decoding independent of the platform locale.
        with open(os.path.join(path, file_name), "r", encoding="utf-8") as fr:
            for line in fr:
                sentence_label, sentence = line.strip().lower().split("\t", maxsplit=1)
                examples.append((sentence, sentence_label))
        return examples

    return read_split("train.txt"), read_split("dev.txt"), read_split("test.txt")

# Load the IMDB dataset splits from ./dataset/ into module-level lists
# that the later cells consume.
train_data, dev_data, test_data = load_imdb_data("./dataset/") 
# Print one loaded example to show the (sentence, label) layout.
print(train_data[4])

("the premise of an african-american female scrooge in the modern, struggling city was inspired, but nothing else in this film is. here, ms. scrooge is a miserly banker who takes advantage of the employees and customers in the largely poor and black neighborhood it inhabits. there is no doubt about the good intentions of the people involved. part of the problem is that story's roots don't translate well into the urban setting of this film, and the script fails to make the update work. also, the constant message about sharing and giving is repeated so endlessly, the audience becomes tired of it well before the movie reaches its familiar end. this is a message film that doesn't know when to quit. in the title role, the talented cicely tyson gives an overly uptight performance, and at times lines are difficult to understand. the charles dickens novel has been adapted so many times, it's a struggle to adapt it in a way that makes it fresh and relevant, in spite of its very relevant message

In [2]:
import paddle
import paddle.nn as nn
from paddle.io import Dataset
from utils.data import load_vocab

class IMDBDataset(Dataset):
    """Dataset of IMDB reviews with tokens already mapped to vocabulary ids.

    Args:
        examples: Iterable of ``(sentence, label)`` pairs. The sentence is a
            space-separated string and the label is convertible with ``int``.
        word2id_dict: Mapping from token to integer id. It must contain the
            ``'[UNK]'`` key, whose id replaces out-of-vocabulary tokens.
    """

    def __init__(self, examples, word2id_dict):
        super().__init__()
        # Vocabulary used to turn tokens into integer ids.
        self.word2id_dict = word2id_dict
        # Examples converted once up front, so __getitem__ is a plain lookup.
        self.examples = self.words_to_id(examples)

    def words_to_id(self, examples):
        """Convert ``(sentence, label)`` pairs into ``[token_ids, int_label]``.

        Sentences are split on single spaces; tokens missing from the
        vocabulary map to the id of ``'[UNK]'``.

        Raises:
            KeyError: If the vocabulary has no ``'[UNK]'`` entry.
        """
        # Resolve the fallback id once instead of once per token.
        unk_id = self.word2id_dict['[UNK]']
        lookup = self.word2id_dict.get
        return [
            [[lookup(word, unk_id) for word in seq.split(" ")], int(label)]
            for seq, label in examples
        ]

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at ``idx``."""
        seq, label = self.examples[idx]
        return seq, label

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)
    
# Load the vocabulary (token -> id mapping) from the dataset directory.
word2id_dict= load_vocab("./dataset/vocab.txt") 

# Build one Dataset per split; tokens are converted to ids at construction.
train_set = IMDBDataset(train_data, word2id_dict)
dev_set = IMDBDataset(dev_data, word2id_dict)
test_set = IMDBDataset(test_data, word2id_dict)

# The printed labels below are user-facing output and are left untouched.
print('训练集样本数：', len(train_set))
print('样本示例：', train_set[4])

训练集样本数： 25000
样本示例： ([2, 976, 5, 32, 6860, 618, 7673, 8, 2, 13073, 2525, 724, 14, 22837, 18, 164, 416, 8, 10, 24, 701, 611, 1743, 7673, 7, 3, 56391, 21652, 36, 271, 3495, 5, 2, 11373, 4, 13244, 8, 2, 2157, 350, 4, 328, 4118, 12, 48810, 52, 7, 60, 860, 43, 2, 56, 4393, 5, 2, 89, 4152, 182, 5, 2, 461, 7, 11, 7321, 7730, 86, 7931, 107, 72, 2, 2830, 1165, 5, 10, 151, 4, 2, 272, 1003, 6, 91, 2, 10491, 912, 826, 2, 1750, 889, 43, 6723, 4, 647, 7, 2535, 38, 39222, 2, 357, 398, 1505, 5, 12, 107, 179, 2, 20, 4279, 83, 1163, 692, 10, 7, 3, 889, 24, 11, 141, 118, 50, 6, 28642, 8, 2, 490, 1469, 2, 1039, 98975, 24541, 344, 32, 2074, 11852, 1683, 4, 29, 286, 478, 22, 823, 6, 5222, 2, 1490, 6893, 883, 41, 71, 3254, 38, 100, 1021, 44, 3, 1700, 6, 8768, 12, 8, 3, 108, 11, 146, 12, 1761, 4, 92295, 8, 2641, 5, 83, 49, 3866, 5352], 0)


In [3]:
from functools import partial

def collate_fn(batch_data, pad_val=0, max_seq_len=256):
    """Truncate, pad and tensorize one batch of ``(token_ids, label)`` pairs.

    Args:
        batch_data: Sequence of two-element examples ``(token_ids, label)``
            where ``token_ids`` is a list.
        pad_val: Value appended to sequences shorter than the batch maximum.
        max_seq_len: Sequences are cut to at most this many tokens.

    Returns:
        ``((padded_ids, lengths), labels)`` where each item is the result of
        ``paddle.to_tensor``. ``lengths`` holds the post-truncation lengths.
    """
    clipped, targets = [], []
    for tokens, target in batch_data:
        # Keep only the first max_seq_len tokens of every example.
        clipped.append(tokens[:max_seq_len])
        targets.append(target)

    lengths = [len(tokens) for tokens in clipped]
    # Longest sequence in this batch after truncation (0 for an empty batch).
    longest = max(lengths, default=0)
    # Right-pad every sequence up to the batch maximum.
    padded = [tokens + [pad_val] * (longest - len(tokens)) for tokens in clipped]

    return (paddle.to_tensor(padded), paddle.to_tensor(lengths)), paddle.to_tensor(targets)


In [4]:
# Sanity-check collate_fn on a toy batch with a small truncation length.
max_seq_len = 5
# Two [token_ids, label] examples of different lengths: the first is longer
# than max_seq_len (gets truncated), the second is shorter (gets padded).
batch_data = [[[1, 2, 3, 4, 5, 6], 1], [[2,4,6], 0]]
(seqs, seq_lens), labels = collate_fn(batch_data, pad_val=word2id_dict["[PAD]"], max_seq_len=max_seq_len)
print("seqs: ", seqs)
print("seq_lens: ", seq_lens)
print("labels: ", labels)

seqs:  Tensor(shape=[2, 5], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [[1, 2, 3, 4, 5],
        [2, 4, 6, 0, 0]])
seq_lens:  Tensor(shape=[2], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [5, 3])
labels:  Tensor(shape=[2], dtype=int64, place=Place(gpu:0), stop_gradient=True,
       [1, 0])


In [5]:
# Truncation length and batch size used for the real data loaders.
max_seq_len = 256
batch_size = 128
# Bind the padding id and truncation length so the DataLoader can call the
# collate function with the batch alone.
# NOTE(review): this rebinds the module-level name `collate_fn` to a partial
# of itself, so re-running this cell wraps the previous partial again.
collate_fn = partial(collate_fn, pad_val=word2id_dict["[PAD]"], max_seq_len=max_seq_len)
# Only the training loader shuffles; dev/test keep file order.
train_loader = paddle.io.DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=False, collate_fn=collate_fn)
dev_loader = paddle.io.DataLoader(dev_set, batch_size=batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn)
test_loader = paddle.io.DataLoader(test_set, batch_size=batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn)

In [6]:
class AveragePooling(nn.Layer):
    """Length-aware mean pooling over axis 1 of a padded sequence tensor."""

    def __init__(self):
        super().__init__()

    def forward(self, sequence_output, sequence_length):
        """Average the non-padded positions of every sequence.

        Args:
            sequence_output: Tensor whose axis 1 is the time axis
                (assumed to be (batch, time, hidden) -- TODO confirm).
            sequence_length: Tensor with one valid length per sequence.

        Returns:
            The sum over the valid time steps divided by the sequence length.
        """
        lengths = paddle.cast(sequence_length.unsqueeze(-1), dtype="float32")
        max_len = sequence_output.shape[1]
        # Build positions as float32 so both sides of the comparison share a
        # dtype; comparing the default integer arange with float32 lengths
        # relies on an implicit dtype conversion.
        positions = paddle.arange(max_len, dtype="float32")
        # True for real tokens, False for padding positions.
        mask = paddle.cast(positions < lengths, dtype="float32").unsqueeze(-1)
        # Zero out the padded positions before summing.
        masked_output = paddle.multiply(sequence_output, mask)
        # Divide by the true length rather than the padded length.
        batch_mean_hidden = paddle.divide(paddle.sum(masked_output, axis=1), lengths)
        return batch_mean_hidden

class Model_BiLSTM_FC(nn.Layer):
    """Text classifier: embedding -> forward LSTM -> mean pooling -> linear.

    Args:
        num_embeddings: Vocabulary size.
        input_size: Dimension of the word embeddings.
        hidden_size: Number of LSTM hidden units.
        num_classes: Number of output classes.
    """

    def __init__(self, num_embeddings, input_size, hidden_size, num_classes=2):
        super().__init__()
        # Keep the hyper-parameters on the instance.
        self.num_embeddings = num_embeddings
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        # Sub-layers, created in the same order as before: embedding, LSTM,
        # pooling, output projection.
        self.embedding_layer = nn.Embedding(num_embeddings, input_size, padding_idx=0)
        self.lstm_layer = nn.LSTM(input_size, hidden_size, direction="forward")
        self.average_layer = AveragePooling()
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs):
        """Return classification logits for ``inputs = (input_ids, sequence_length)``."""
        token_ids, lengths = inputs
        # Look up the word embeddings.
        embedded = self.embedding_layer(token_ids)
        # Run the LSTM; the final states are not needed.
        hidden_states, _ = self.lstm_layer(embedded, sequence_length=lengths)
        # Pool the per-step outputs into one vector per sequence.
        pooled = self.average_layer(hidden_states, lengths)
        # Project to class logits.
        return self.output_layer(pooled)

In [7]:
import time
import random
import numpy as np
from nndl import Accuracy, RunnerV3

# Fix all random seeds before building and training the model.
np.random.seed(0)
random.seed(0)
paddle.seed(0)

# Number of training epochs
num_epochs = 3
# Learning rate
learning_rate = 0.001
# Number of embeddings equals the vocabulary size
num_embeddings = len(word2id_dict)
# Dimension of the embedding vectors
input_size = 256
# Dimension of the LSTM hidden state
hidden_size = 256

# Instantiate the single-layer model
model = Model_BiLSTM_FC(num_embeddings, input_size, hidden_size)
# Optimizer
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, beta1=0.9, beta2=0.999, parameters= model.parameters()) 
# Loss function
loss_fn = paddle.nn.CrossEntropyLoss() 
# Evaluation metric
metric = Accuracy()
# Instantiate the Runner
runner = RunnerV3(model, optimizer, loss_fn, metric)
# Train the model and report the elapsed wall-clock time
start_time = time.time()
runner.train(train_loader, dev_loader, num_epochs=num_epochs, eval_steps=10, log_steps=10, save_path="./checkpoints/best_forward.pdparams")
end_time = time.time()
print("time: ", (end_time-start_time))

  from collections import MutableMapping
  from collections import Iterable, Mapping
  from collections import Sized
W0727 00:57:33.701588  9852 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0727 00:57:33.704583  9852 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.
  format(lhs_dtype, rhs_dtype, lhs_dtype))


[Train] epoch: 0/3, step: 0/588, loss: 0.69244
[Train] epoch: 0/3, step: 10/588, loss: 0.68635
[Evaluate]  dev score: 0.53960, dev loss: 0.68583
[Evaluate] best accuracy performence has been updated: 0.00000 --> 0.53960
[Train] epoch: 0/3, step: 20/588, loss: 0.64868
[Evaluate]  dev score: 0.64344, dev loss: 0.63686
[Evaluate] best accuracy performence has been updated: 0.53960 --> 0.64344
[Train] epoch: 0/3, step: 30/588, loss: 0.52741
[Evaluate]  dev score: 0.75936, dev loss: 0.55803
[Evaluate] best accuracy performence has been updated: 0.64344 --> 0.75936
[Train] epoch: 0/3, step: 40/588, loss: 0.50660
[Evaluate]  dev score: 0.79368, dev loss: 0.52721
[Evaluate] best accuracy performence has been updated: 0.75936 --> 0.79368
[Train] epoch: 0/3, step: 50/588, loss: 0.40474
[Evaluate]  dev score: 0.80776, dev loss: 0.42634
[Evaluate] best accuracy performence has been updated: 0.79368 --> 0.80776
[Train] epoch: 0/3, step: 60/588, loss: 0.30223
[Evaluate]  dev score: 0.80560, dev loss

In [8]:
# Evaluate the checkpoint written by the single-layer training cell.
# The path must match the `save_path` passed to `runner.train`
# ("best_forward"); the previous "best_self_forward" name pointed at a
# different checkpoint file.
model_path = "./checkpoints/best_forward.pdparams"
runner.load_model(model_path)
accuracy, _ = runner.evaluate(test_loader)
print(f"Evaluate on test set, Accuracy: {accuracy:.5f}")



Evaluate on test set, Accuracy: 0.82880


In [9]:
class Model_MultiLayer_BiLSTM_FC(nn.Layer):
    """Text classifier with a stacked forward LSTM.

    Pipeline: embedding -> multi-layer LSTM -> mean pooling -> linear.

    Args:
        num_embeddings: Vocabulary size.
        input_size: Dimension of the word embeddings.
        hidden_size: Number of LSTM hidden units.
        num_classes: Number of output classes.
        num_layers: Number of stacked LSTM layers. Defaults to 2, the value
            that was previously hard-coded.
    """

    def __init__(self, num_embeddings, input_size, hidden_size, num_classes=2, num_layers=2):
        super().__init__()
        # Vocabulary size
        self.num_embeddings = num_embeddings
        # Dimension of the word embeddings
        self.input_size = input_size
        # Number of LSTM hidden units
        self.hidden_size = hidden_size
        # Number of classes
        self.num_classes = num_classes
        # Depth of the LSTM stack
        self.num_layers = num_layers
        # Embedding layer
        self.embedding_layer = nn.Embedding(num_embeddings, input_size, padding_idx=0)
        # Stacked LSTM
        self.lstm_layer = nn.LSTM(input_size, hidden_size, num_layers=num_layers, direction="forward")
        # Pooling layer
        self.average_layer = AveragePooling()
        # Output layer
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs):
        """Return classification logits for ``inputs = (input_ids, sequence_length)``."""
        # Split the model input into token ids and sequence lengths
        input_ids, sequence_length = inputs
        # Look up the word embeddings
        inputs_emb = self.embedding_layer(input_ids)
        # Run the LSTM stack; the final states are not needed
        sequence_output, _ = self.lstm_layer(inputs_emb, sequence_length=sequence_length)
        # Pool the per-step outputs into one vector per sequence
        batch_mean_hidden = self.average_layer(sequence_output, sequence_length)
        # Project to class logits
        logits = self.output_layer(batch_mean_hidden)
        return logits

In [10]:
# Reset all random seeds so this run uses the same seeds as the single-layer run.
np.random.seed(0)
random.seed(0)
paddle.seed(0)

# Number of training epochs
num_epochs = 3
# Learning rate
learning_rate = 0.001
# Number of embeddings equals the vocabulary size
num_embeddings = len(word2id_dict)
# Dimension of the embedding vectors
input_size = 256
# Dimension of the LSTM hidden state
hidden_size = 256

# Instantiate the 2-layer model
model = Model_MultiLayer_BiLSTM_FC(num_embeddings, input_size, hidden_size)
# Optimizer
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, beta1=0.9, beta2=0.999, parameters= model.parameters()) 
# Loss function
loss_fn = paddle.nn.CrossEntropyLoss() 
# Evaluation metric
metric = Accuracy()
# Instantiate the Runner
runner = RunnerV3(model, optimizer, loss_fn, metric)
# Train the model and report the elapsed wall-clock time
start_time = time.time()
runner.train(train_loader, dev_loader, num_epochs=num_epochs, eval_steps=10, log_steps=10, save_path="./checkpoints/best_multilayer.pdparams")
end_time = time.time()
print("time: ", (end_time-start_time))

[Train] epoch: 0/3, step: 0/588, loss: 0.69207
[Train] epoch: 0/3, step: 10/588, loss: 0.69213
[Evaluate]  dev score: 0.49472, dev loss: 0.69272
[Evaluate] best accuracy performence has been updated: 0.00000 --> 0.49472
[Train] epoch: 0/3, step: 20/588, loss: 0.68635
[Evaluate]  dev score: 0.50888, dev loss: 0.68027
[Evaluate] best accuracy performence has been updated: 0.49472 --> 0.50888
[Train] epoch: 0/3, step: 30/588, loss: 0.44815
[Evaluate]  dev score: 0.75944, dev loss: 0.51048
[Evaluate] best accuracy performence has been updated: 0.50888 --> 0.75944
[Train] epoch: 0/3, step: 40/588, loss: 0.30420
[Evaluate]  dev score: 0.79784, dev loss: 0.46734
[Evaluate] best accuracy performence has been updated: 0.75944 --> 0.79784
[Train] epoch: 0/3, step: 50/588, loss: 0.35504
[Evaluate]  dev score: 0.78024, dev loss: 0.45525
[Train] epoch: 0/3, step: 60/588, loss: 0.27823
[Evaluate]  dev score: 0.83696, dev loss: 0.37768
[Evaluate] best accuracy performence has been updated: 0.79784 --

In [11]:
# Load the checkpoint saved by the multi-layer training cell and report
# accuracy on the test set.
model_path = "./checkpoints/best_multilayer.pdparams"
runner.load_model(model_path)
accuracy, _ =  runner.evaluate(test_loader)
print(f"Evaluate on test set, Accuracy: {accuracy:.5f}")

Evaluate on test set, Accuracy: 0.85808


观察到单层LSTM在测试集上达到了**0.82880**的准确率，2层的LSTM达到了**0.85808**的准确率，堆叠层数一定程度上提高了模型的表现。

### 2.本节实现了单向的LSTM模型，请思考如何实现双向LSTM，并基于IMDB数据集进行文本分类任务。<span style="color:red">(附加题&加分题)</span>

In [12]:
class Model_BiDirect_BiLSTM_FC(nn.Layer):
    def __init__(self, num_embeddings, input_size, hidden_size, num_classes=2):
        super(Model_BiDirect_BiLSTM_FC, self).__init__()
        # 词典大小
        self.num_embeddings = num_embeddings
        # 单词向量的维度
        self.input_size = input_size
        # LSTM隐藏单元数量
        self.hidden_size = hidden_size
        # 情感分类类别数量
        self.num_classes = num_classes
        # 实例化嵌入层
        self.embedding_layer = nn.Embedding(num_embeddings, input_size, padding_idx=0)
        # 实例化LSTM层
        self.lstm_layer = nn.LSTM(input_size, hidden_size, direction="bidirectional")
        # 实例化聚合层
        self.average_layer = AveragePooling()
        # 实例化输出层
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs):
        # 对模型输入拆分为序列数据和mask
        input_ids, sequence_length = inputs
        # 获取词向量
        inputs_emb = self.embedding_layer(input_ids)
        # 使用lstm处理数据
        sequence_output, _ = self.lstm_layer(inputs_emb, sequence_length=sequence_length)
        # 使用聚合层聚合sequence_output
        batch_mean_hidden = self.average_layer(sequence_output, sequence_length)
        # 输出文本分类logits
        logits = self.output_layer(batch_mean_hidden)
        return logits

In [13]:
np.random.seed(0)
random.seed(0)
paddle.seed(0)

# 指定训练轮次
num_epochs = 3
# 指定学习率
learning_rate = 0.001
# 指定embedding的数量为词表长度
num_embeddings = len(word2id_dict)
# embedding向量的维度
input_size = 256
# LSTM网络隐状态向量的维度
hidden_size = 256

# 实例化模型
model = Model_MultiLayer_BiLSTM_FC(num_embeddings, input_size, hidden_size)
# 指定优化器
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, beta1=0.9, beta2=0.999, parameters= model.parameters()) 
# 指定损失函数
loss_fn = paddle.nn.CrossEntropyLoss() 
# 指定评估指标
metric = Accuracy()
# 实例化Runner
runner = RunnerV3(model, optimizer, loss_fn, metric)
# 模型训练
start_time = time.time()
runner.train(train_loader, dev_loader, num_epochs=num_epochs, eval_steps=10, log_steps=10, save_path="./checkpoints/best_bidirect.pdparams")
end_time = time.time()
print("time: ", (end_time-start_time))

[Train] epoch: 0/3, step: 0/588, loss: 0.69207
[Train] epoch: 0/3, step: 10/588, loss: 0.69213
[Evaluate]  dev score: 0.49472, dev loss: 0.69272
[Evaluate] best accuracy performence has been updated: 0.00000 --> 0.49472
[Train] epoch: 0/3, step: 20/588, loss: 0.68635
[Evaluate]  dev score: 0.50888, dev loss: 0.68027
[Evaluate] best accuracy performence has been updated: 0.49472 --> 0.50888
[Train] epoch: 0/3, step: 30/588, loss: 0.44815
[Evaluate]  dev score: 0.75936, dev loss: 0.51049
[Evaluate] best accuracy performence has been updated: 0.50888 --> 0.75936
[Train] epoch: 0/3, step: 40/588, loss: 0.30420
[Evaluate]  dev score: 0.79784, dev loss: 0.46735
[Evaluate] best accuracy performence has been updated: 0.75936 --> 0.79784
[Train] epoch: 0/3, step: 50/588, loss: 0.35505
[Evaluate]  dev score: 0.78016, dev loss: 0.45528
[Train] epoch: 0/3, step: 60/588, loss: 0.27827
[Evaluate]  dev score: 0.83672, dev loss: 0.37772
[Evaluate] best accuracy performence has been updated: 0.79784 --

In [14]:
# Load the checkpoint saved by the bidirectional training cell and report
# accuracy on the test set.
model_path = "./checkpoints/best_bidirect.pdparams"
runner.load_model(model_path)
accuracy, _ =  runner.evaluate(test_loader)
print(f"Evaluate on test set, Accuracy: {accuracy:.5f}")

Evaluate on test set, Accuracy: 0.85816


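观察到单层LSTM在测试集上达到了**0.82880**的准确率，2层的LSTM达到了**0.85808**的准确率，双向单层LSTM则达到了**0.85816**的准确率，比2层LSTM略高。注意：0.85816 对应的训练日志（In [13] 的输出）与 In [10] 中2层LSTM的训练日志几乎逐行一致，说明该次运行实际训练的并不是双向模型；该数值需要在使用 `Model_BiDirect_BiLSTM_FC` 重新训练后更新，目前不能据此得出双向LSTM优于2层LSTM的结论。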

### 3. <span style="color:red">(附加题&简答题&加分题)</span>
    小明刚刚学习了循环神经网络，觉得这个网络非常有实际用途。正巧这几天天气反复多变，气温忽冷忽热。因此小明想是否可以用LSTM去预测什么时候天气能够持续回暖。
    小明在网上搜索，找到了往前三年的气温数据，气温数据每隔1小时就会被记录一次。 因此小明基于这份数据，使用LSTM进行建模，预测后续的气温情况。
     小明发现通过设置数据步长为2，即使用前1小时的气温预测下1小时的气温，能够获得非常高的准确度。因此小明觉得模型已经训练得非常好了。
    因此，他想把上一个时刻预测出的气温，作为下一个时刻的输入，依次向前预测，直到预测出往后3天的气温。 
    大家觉得，小明能够准确获得第3天的气温吗？为什么？


不一定。
使用前1小时的气温预测下1小时的气温时，训练和评估阶段模型的输入始终是真实观测值（相当于teacher forcing），而且相邻两个小时的气温本身变化很小，所以单步预测准确度高并不能说明模型学到了长期变化规律。若要预测往后3天（共72个时间步）的气温，从当前时刻开始每一步的输入都是上一时间步的预测输出，每一步的误差都会带入后续预测并不断累积（错误传播），所以我认为不太可能准确获得第3天的气温。
