In [1]:
# 定义一个数据加载的函数load_data，用于数据生成和格式转换。
import paddle
import random
import numpy as np

def load_data(num_steps=10):
    heads=[[1,2],[2,4],[3,2],[3,1]]
    labels = [[3], [6], [5], [4]]
    # 装配数据
    samples = []
    for idx, head in enumerate(heads):
        seq = [head+[0]*(num_steps-len(heads[0]))]
        seq = paddle.to_tensor(seq, dtype="int64")
        label = paddle.to_tensor(labels[idx], dtype="float32")
        yield seq, label


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def convert_to_list(value, n, name, dtype=np.int):


In [2]:
# 一、简单RNN网络搭建
# 简单RNN网络的代码实现如下，这里只保留了最后一个时刻的RNN输出向量，用于完成接下来的数字预测实验。

import paddle
import paddle.nn.functional as F

# 声明RNN网络和相关参数
class SelfRNN(paddle.nn.Layer):
    def __init__(self, emb_size, hidden_size):
        super(SelfRNN, self).__init__()
        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.W = paddle.create_parameter(shape=[emb_size, hidden_size], dtype="float32")
        self.U = paddle.create_parameter(shape=[hidden_size, hidden_size], dtype="float32")
        self.b = paddle.create_parameter(shape=[1, hidden_size], dtype="float32")

    # 定义前向计算
    def forward(self, inputs):
        # inputs batch_size x seq_len x emb_dim
        batch_size, seq_len, emb_dim = inputs.shape

        # 初始化向量
        hidden_state = paddle.zeros(shape=[batch_size, self.hidden_size], dtype="float32")
        # 执行RNN计算
        for step in range(seq_len):
            step_input = inputs[:, step, :]
            hidden_state = F.tanh(paddle.matmul(step_input, self.W) + paddle.matmul(hidden_state, self.U) + self.b)
        return hidden_state

In [3]:
# 定义数据预测模型
# 定义一个数字预测模型NumericPrediction，基于RNN网络处理数字序列，并使用最后时刻的状态向量进行数字标签预测。

# 模型定义
class NumericPrediction(paddle.nn.Layer):
    def __init__(self, vocab_size, emb_size, hidden_size, model_type="RNN"):
        super(NumericPrediction, self).__init__()
        self.model_type = model_type
        self.model = SelfRNN(emb_size, hidden_size) if model_type == "RNN" else SelfLSTM(emb_size, hidden_size)
        self.embedding = paddle.nn.Embedding(vocab_size, emb_size)
        self.cls_fc = paddle.nn.Linear(hidden_size, 1)

    def forward(self, inputs):
        inputs_emb = self.embedding(inputs)

        state = self.model(inputs_emb)
        hidden_state = state if self.model_type == "RNN" else state[1]

        logits = self.cls_fc(hidden_state)

        return logits, hidden_state

In [4]:
# 训练配置
# 配置模型参数（如：训练轮次、学习率等）、训练资源、实例化模型并指定优化器。
# 学习率是优化器的一个参数，代表参数更新幅度的大小，即步长。当学习率最优时，模型的有效容量最大，最终能达到的效果最好。
# SGD是比较成熟的优化算法之一，每次训练少量数据，基于这部分数据计算梯度和损失来更新参数。

# 训练配置
paddle.seed(0)
np.random.seed(0)
random.seed(0)

# 设置模型参数
epochs = 5
learning_rate = 0.05
batch_size = 1
num_steps = 10

vocab_size = 10
emb_size = 128
hidden_size = 128

# 指定训练资源
use_gpu = True if paddle.get_device().startswith("gpu") else False
if use_gpu:
    paddle.set_device('gpu:0')

# 实例化模型
model = NumericPrediction(vocab_size, emb_size, hidden_size, model_type="RNN")

# 指定优化器
optimizer = paddle.optimizer.SGD(learning_rate=learning_rate, parameters=model.parameters())

In [5]:
# 模型训练
# 在训练过程中，每轮迭代打印一次训练结果，观察Loss和RNN中参数W、U和b的梯度信息。
# 为了方便观察模型训练效果，计算梯度矩阵的L2范数进行展示，L2范数越大，代表梯度矩阵中的值也越大，越倾向产生梯度爆炸。

from numpy.linalg import norm

def output_grads_l2(model, hidden_state):

    W_grad_l2, U_grad_l2, b_grad_l2 = 0, 0, 0
    for name, param in model.named_parameters(): 
        if name == "model.W":  
            W_grad_l2 = norm(param.grad)
        if name == "model.U": 
            U_grad_l2 = norm(param.grad)
        if name == "model.b": 
            b_grad_l2 = norm(param.grad)

    return W_grad_l2, U_grad_l2, b_grad_l2


# 开始训练
def train(model, logging_steps=1):
    model.train()
    print(type(model.model))
    global_step = 0

    for epoch in range(1, epochs+1):
        total_count = 0
        correct_count = 0
        for step, batch in enumerate(load_data(num_steps=num_steps)):
            global_step += 1
            batch_seq, batch_label = batch
            predicts, hidden_state = model(batch_seq)
            loss = F.mse_loss(predicts, batch_label)

            loss.backward()

            if global_step % logging_steps ==0:
                print("=========epoch: %d, step: %d, loss: %.5f========" % (epoch, step, loss))
                W_grad_l2, U_grad_l2, b_grad_l2 = output_grads_l2(model, hidden_state)
                print("W_grad_l2: %f, U_grad_l2: %f, b_grad_l2: %f " % (W_grad_l2, U_grad_l2, b_grad_l2))

            optimizer.step()
            optimizer.clear_grad()

            with paddle.no_grad():
                # 检验是否预测正确
                predicts = predicts.squeeze(1)
                diff = (predicts - batch_label).abs()
                correct_count += paddle.cast(diff<0.1, "int64").sum().numpy()[0]
                total_count += len(batch_seq)

        if global_step % logging_steps ==0:
            acc = correct_count/total_count
            print("\ncorrect/total:%d/%d, Accuracy: %.5f \n" % (correct_count, total_count, acc))


train(model, logging_steps=1)

<class '__main__.SelfRNN'>
W_grad_l2: 19.658676, U_grad_l2: 47.573059, b_grad_l2: 13.377793 
W_grad_l2: 15.004972, U_grad_l2: 78.789055, b_grad_l2: 9.303792 
W_grad_l2: 263.725128, U_grad_l2: 948.933655, b_grad_l2: 152.599915 
W_grad_l2: 5637.591797, U_grad_l2: 2853.397949, b_grad_l2: 655.646729 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 34280.796875, U_grad_l2: 27940.976562, b_grad_l2: 2501.016113 
W_grad_l2: 130793.921875, U_grad_l2: 8154.219727, b_grad_l2: 720.770386 
W_grad_l2: 7621075.500000, U_grad_l2: 427715.187500, b_grad_l2: 37806.281250 
W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 
W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 
W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 
W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data.dtype == np.object:


In [6]:
# RNN网络优化：梯度截断
# 针对RNN梯度爆炸的情况，可以采用梯度截断的方式缓解，当梯度达到一定阈值时，对其进行截断。
# 一般截断有两种方式：按值截断和按模截断。
# 本实验采用按模截断的方式，使用 ClipGradByGlobalNorm API。在代码实现时，将ClipGradByNorm传入优化器，优化器在反向迭代过程中，每次梯度更新时便可以梯度裁剪。

# 训练配置
paddle.seed(0)
np.random.seed(0)
random.seed(0)

# 设置模型参数
epochs = 500
learning_rate = 0.05
batch_size = 1
num_steps = 10

vocab_size = 10
emb_size = 128
hidden_size = 128

# 指定训练资源
use_gpu = True if paddle.get_device().startswith("gpu") else False
if use_gpu:
    paddle.set_device('gpu:0')

# 实例化模型
model = NumericPrediction(vocab_size, emb_size, hidden_size, model_type="RNN")

# 指定优化器
clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
optimizer = paddle.optimizer.SGD(learning_rate=learning_rate, parameters=model.parameters(), weight_decay=0.,grad_clip=clip)

#训练模型
train(model, logging_steps=20)

<class '__main__.SelfRNN'>
W_grad_l2: 1.107595, U_grad_l2: 4.565491, b_grad_l2: 0.695140 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 1.322490, U_grad_l2: 4.772387, b_grad_l2: 0.771956 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.813176, U_grad_l2: 2.182861, b_grad_l2: 0.439725 

correct/total:2/4, Accuracy: 0.50000 

W_grad_l2: 0.733498, U_grad_l2: 1.932203, b_grad_l2: 0.363847 

correct/total:2/4, Accuracy: 0.50000 

W_grad_l2: 4.601934, U_grad_l2: 14.251498, b_grad_l2: 2.300374 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 1.674462, U_grad_l2: 4.972064, b_grad_l2: 0.842681 

correct/total:2/4, Accuracy: 0.50000 

W_grad_l2: 0.791536, U_grad_l2: 2.609462, b_grad_l2: 0.438870 

correct/total:2/4, Accuracy: 0.50000 

W_grad_l2: 2.804833, U_grad_l2: 9.227398, b_grad_l2: 1.507882 

correct/total:1/4, Accuracy: 0.25000 

W_grad_l2: 3.689042, U_grad_l2: 12.382934, b_grad_l2: 2.011355 

correct/total:1/4, Accuracy: 0.25000 

W_grad_l2: 0.147262, U_grad_l2: 0.521812, b_

In [7]:
# LSTM网络搭建
# LSTM网络的代码实现与RNN结构相似，只是在RNN的基础上增加了隐藏门、输入门、遗忘门的定义和计算。这里依然选择保留序列的最后一个单词位置的输出向量。

# 导入paddle
import paddle
import paddle.nn.functional as F

# 声明LSTM网络和相关参数
class SelfLSTM(paddle.nn.Layer):
    def __init__(self, emb_dim, hidden_size):
        super(SelfLSTM, self).__init__()
        self.emb_dim = emb_dim
        self.hidden_size = hidden_size
        self.w_i = paddle.create_parameter(shape=[emb_dim, hidden_size], dtype="float32")
        self.w_f = paddle.create_parameter(shape=[emb_dim, hidden_size], dtype="float32")
        self.w_o = paddle.create_parameter(shape=[emb_dim, hidden_size], dtype="float32")
        self.w_a = paddle.create_parameter(shape=[emb_dim, hidden_size], dtype="float32")
        self.u_i = paddle.create_parameter(shape=[hidden_size, hidden_size], dtype="float32")
        self.u_f = paddle.create_parameter(shape=[hidden_size, hidden_size], dtype="float32")
        self.u_o = paddle.create_parameter(shape=[hidden_size, hidden_size], dtype="float32")
        self.u_a = paddle.create_parameter(shape=[hidden_size, hidden_size], dtype="float32")
        self.b_i = paddle.create_parameter(shape=[1, hidden_size], dtype="float32")
        self.b_f = paddle.create_parameter(shape=[1, hidden_size], dtype="float32")
        self.b_o = paddle.create_parameter(shape=[1, hidden_size], dtype="float32")
        self.b_a = paddle.create_parameter(shape=[1, hidden_size], dtype="float32")

    # 定义前向计算
    def forward(self, inputs):
        # inputs batch_size x seq_len x emb_dim
        batch_size, seq_len, emb_dim = inputs.shape

        # 初始化状态向量和隐状态向量
        cell_state = paddle.zeros(shape=[batch_size, self.hidden_size], dtype="float32")
        hidden_state = paddle.zeros(shape=[batch_size, self.hidden_size], dtype="float32")

        # 执行LSTM计算，包括：隐藏门、输入门、遗忘门、候选状态向量、状态向量和隐状态向量
        for step in range(seq_len):
            input_step = inputs[:, step, :]
            i = F.sigmoid(paddle.matmul(input_step, self.w_i) + paddle.matmul(hidden_state, self.u_i) + self.b_i)
            f = F.sigmoid(paddle.matmul(input_step, self.w_f) + paddle.matmul(hidden_state, self.u_f) + self.b_f)
            o = F.sigmoid(paddle.matmul(input_step, self.w_o) + paddle.matmul(hidden_state, self.u_o) + self.b_o)
            c_tilde = F.tanh(paddle.matmul(input_step, self.w_a) + paddle.matmul(hidden_state, self.u_a) + self.b_a)
            cell_state = f * cell_state + i * c_tilde
            hidden_state = o * F.tanh(cell_state)

        return cell_state, hidden_state

In [8]:
# 基于LSTM网络进行数字预测
# 实验过程中尽量复用RNN的参数配置，模型实例化方面需要将数字预测模型NumericPrediction中的model_type设置为LSTM，进行模型训练和评估。
# 在训练过程中统计模型预测的准确率，假如模型预测的数值和原本标签数据的差值在[-0.1,0.1]之内，则认为预测正确，否则认为预测错误。

# 训练配置
paddle.seed(0)
np.random.seed(0)
random.seed(0)

# 设置模型参数
epochs = 500
learning_rate = 0.1
batch_size = 1
num_steps = 10

vocab_size = 10
emb_size = 128
hidden_size = 128

# 指定训练资源
use_gpu = True if paddle.get_device().startswith("gpu") else False
if use_gpu:
    paddle.set_device('gpu:0')

# 实例化模型
model = NumericPrediction(vocab_size, emb_size, hidden_size, model_type="LSTM")

# 指定优化器
clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
optimizer = paddle.optimizer.SGD(learning_rate=learning_rate, parameters=model.parameters(), grad_clip=clip)

train(model, logging_steps=80)

<class '__main__.SelfLSTM'>
W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:1/4, Accuracy: 0.25000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:1/4, Accuracy: 0.25000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_grad_l2: 0.000000 

correct/total:0/4, Accuracy: 0.00000 

W_grad_l2: 0.000000, U_grad_l2: 0.000000, b_g