## 第八章 注意力机制

### 1.Transformer层数设置为多层，采用随机初始化位置编码，并且设置位置编码为可训练的方式，并进行文本分类实验，比较实验结果。<span style="color:red">(必修题)</span>

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from data import load_vocab,load_imdb_data
from paddle.optimizer import Adam
from nndl import Accuracy, RunnerV3
import time
from nndl import IMDBDataset
from functools import partial
import paddle
import paddle.nn as nn
import numpy as np
from paddle.io import DataLoader

train_data, dev_data, test_data = load_imdb_data("./dataset") # 加载IMDB数据集和word2id词典
word2id_dict= load_vocab("dataset/vocab.txt") # 加载词典
padding_idx=word2id_dict['[pad]']
batch_size = 128
max_seq_len = 128
train_set = IMDBDataset(train_data, word2id_dict, max_seq_len)
dev_set = IMDBDataset(dev_data, word2id_dict, max_seq_len)
test_set = IMDBDataset(test_data, word2id_dict, max_seq_len)


def collate_fn(batch_data, pad_val=1):
    seqs, labels, lens = [], [], []
    for seq, label in batch_data:
        seqs.append(seq)
        labels.append([label])
        lens.append(len(seq))
    
    max_len = max(lens)
    for i in range(len(seqs)):
        seqs[i] = seqs[i] + [pad_val] * (max_len - len(seqs[i]))
    
    return (paddle.to_tensor(seqs), paddle.to_tensor(lens)),paddle.to_tensor(labels)


train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn)
dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn)

class WordEmbedding(nn.Layer):
    def __init__(self, vocab_size, emb_size, padding_idx=0):
        super(WordEmbedding, self).__init__()
        # Embedding的维度
        self.emb_size = emb_size
        # 使用随机正态（高斯）分布初始化 embedding
        self.word_embedding = nn.Embedding(vocab_size, emb_size,
            padding_idx=padding_idx, weight_attr=paddle.ParamAttr(
                initializer=nn.initializer.Normal(0.0, emb_size ** -0.5) ), )

    def forward(self, word):
        word_emb = self.emb_size ** 0.5 * self.word_embedding(word)
        return word_emb

class SegmentEmbedding(nn.Layer):
    def __init__(self, vocab_size, emb_size):
        super(SegmentEmbedding, self).__init__()
        # Embedding的维度
        self.emb_size = emb_size
        # 分段编码
        self.seg_embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=emb_size
        )

    def forward(self, word):
        seg_embedding = self.seg_embedding(word)
        return seg_embedding

def get_sinusoid_encoding(position_size, hidden_size):
    """位置编码 """

    def cal_angle(pos, hidden_idx):
        # 公式里的 i = hid_idx // 2
        return pos / np.power(10000, 2 * (hidden_idx // 2) / hidden_size)

    def get_posi_angle_vec(pos):
        return [cal_angle(pos, hidden_j) for hidden_j in range(hidden_size)]

    sinusoid = np.array([get_posi_angle_vec(pos_i) for pos_i in range(position_size)])
    # dim 2i  偶数正弦
    # 从0开始，每隔2间隔求正弦值
    sinusoid[:, 0::2] = np.sin(sinusoid[:, 0::2])
    # dim 2i 1  奇数余弦
    # 从1开始，每隔2间隔取余弦
    sinusoid[:, 1::2] = np.cos(sinusoid[:, 1::2])
    # position_size × hidden_size  得到每一个词的位置向量
    return sinusoid.astype("float32")

class PositionalEmbedding(nn.Layer):
    def __init__(self, max_length,emb_size,stop_gradient=True, random_pos=False):
        super(PositionalEmbedding, self).__init__()
        self.emb_size = emb_size
        self.stop_gradient = stop_gradient
        if random_pos:
            self.pos_encoder = nn.Embedding(
                num_embeddings=max_length,
                embedding_dim=self.emb_size,
                weight_attr=paddle.ParamAttr(
                initializer=nn.initializer.Normal(0.0, emb_size ** -0.5)))
        else:
            self.pos_encoder = nn.Embedding(
                num_embeddings=max_length,
                embedding_dim=self.emb_size,
                weight_attr=paddle.ParamAttr(
                    initializer=paddle.nn.initializer.Assign(
                        get_sinusoid_encoding(max_length, self.emb_size))))
    
    def forward(self, pos, stop_gradient = True):
        pos_emb = self.pos_encoder(pos)
        # 关闭位置编码的梯度更新
        pos_emb.stop_gradient = stop_gradient
        return pos_emb


class TransformerEmbeddings(nn.Layer):
    """
    包括输入编码，分段编码，位置编码
    """
    def __init__(
        self,
        vocab_size,
        hidden_size=768,
        hidden_dropout_prob=0.1,
        position_size=512,
        segment_size=256,
        pos_stop_gradient=True,
        random_pos=False
    ):
        super(TransformerEmbeddings, self).__init__()
        # 输入编码向量
        self.word_embeddings = WordEmbedding(vocab_size, hidden_size)
        # 位置编码向量
        self.pos_stop_gradient = pos_stop_gradient
        self.position_embeddings = PositionalEmbedding(position_size, hidden_size, stop_gradient=self.pos_stop_gradient, random_pos=random_pos)
        # 分段编码
        self.segment_embeddings = SegmentEmbedding(segment_size, hidden_size)
        # 层规范化
        self.layer_norm = nn.LayerNorm(hidden_size)
        # Dropout操作
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self, input_ids, segment_ids = None, position_ids = None):
        if position_ids is None:
            # 初始化全1的向量，比如[1,1,1,1]
            ones = paddle.ones_like(input_ids, dtype="int64")
            # 累加输入,求出序列前K个的长度,比如[1,2,3,4]
            seq_length = paddle.cumsum(ones, axis=-1)
            # position id的形式： 比如[0,1,2,3]
            position_ids = seq_length - ones
            position_ids.stop_gradient = self.pos_stop_gradient
        # 输入编码
        input_embedings = self.word_embeddings(input_ids)
        # 分段编码
        segment_embeddings = self.segment_embeddings(segment_ids)
        # 位置编码
        position_embeddings = self.position_embeddings(position_ids)
        # 输入张量, 分段张量，位置张量进行叠加
        # print(input_embedings.shape, segment_embeddings.shape, position_embeddings.shape)
        input_embedings = paddle.transpose(input_embedings, perm=[1, 0, 2])
        position_embeddings = paddle.transpose(position_embeddings, perm=[1, 0, 2])
        embeddings = input_embedings + segment_embeddings + position_embeddings
        embeddings = paddle.transpose(embeddings, perm=[1, 0, 2])

        # 层规范化
        embeddings = self.layer_norm(embeddings)
        # Dropout
        embeddings = self.dropout(embeddings)
        return embeddings



class Model_Transformer_v1(nn.Layer):
    def __init__(
        self,
        vocab_size,
        n_block=1,
        hidden_size=768,
        heads_num=12,
        intermediate_size=3072,
        hidden_dropout=0.1,
        attention_dropout=0.1,
        act_dropout=0,
        position_size=512,
        num_classes=2,
        padding_idx=0,
        pos_stop_gradient=True,
        random_pos=False
    ):
        super(Model_Transformer_v1, self).__init__()
        # 词表大小
        self.vocab_size = vocab_size
        # Transformer的编码器的数目
        self.n_block = n_block
        # 每个词映射成稠密向量的维度
        self.hidden_size = hidden_size
        # 多头注意力的个数
        self.heads_num = heads_num
        # 逐位前馈层的的维度
        self.intermediate_size = intermediate_size
        # Embedding层的 Dropout
        self.hidden_dropout = hidden_dropout
        # 多头注意力的dropout的 dropout参数
        self.attention_dropout = attention_dropout
        # 位置编码的大小 position_size
        self.position_size = position_size
        # 类别数
        self.num_classes = num_classes
        # 逐位前馈层的dropout
        self.act_dropout = act_dropout
        # [PAD]字符的ID
        self.padding_idx = padding_idx
        # 实例化输入编码，分段编码和位置编码
        self.embeddings = TransformerEmbeddings(
            self.vocab_size, self.hidden_size, self.hidden_dropout, self.position_size, pos_stop_gradient=pos_stop_gradient, random_pos=random_pos)
        # 实例化Transformer的编码器
        self.layers = nn.LayerList([])
        for i in range(n_block):
            # 使用框架API
            encoder_layer = nn.TransformerEncoderLayer(hidden_size, 
                                                    heads_num, 
                                                    intermediate_size,
                                                    dropout=hidden_dropout,
                                                    attn_dropout=attention_dropout,
                                                    act_dropout=act_dropout)
            self.layers.append(encoder_layer)
        # 全连接层
        self.dense = nn.Linear(hidden_size, hidden_size)
        # 双曲正切激活函数
        self.activation = nn.Tanh()
        # 最后一层分类器
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, inputs, position_ids=None, attention_mask=None):
        input_ids, segment_ids = inputs
        # 构建Mask矩阵，把Pad的位置即input_ids中为0的位置设置为True,非0的位置设置为False
        if attention_mask is None:
            attention_mask = paddle.unsqueeze(
                (input_ids == self.padding_idx).astype("float32") * -1e9, axis=[1, 2] )
        # 抽取特征向量
        embedding_output = self.embeddings(
            input_ids=input_ids, position_ids=position_ids, segment_ids=segment_ids )
        sequence_output = embedding_output
        self._attention_weights = []
        # Transformer的输出和注意力权重的输出
        for i, encoder_layer in enumerate(self.layers):
            sequence_output = encoder_layer(
                sequence_output, src_mask=attention_mask )
        # 选择第0个位置的向量作为句向量
        first_token_tensor = sequence_output[:, 0]
        # 输出层
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        # 句子级别的输出经过分类器
        logits = self.classifier(pooled_output)
        return logits

paddle.seed(2021)
heads_num = 4
epochs = 3
vocab_size=251890
num_classes= 2
# 注意力多头的数目
# 交叉熵损失
criterion = nn.CrossEntropyLoss()
# 评估的时候采用准确率指标
metric = Accuracy()
# Transformer的分类模型
model1 = Model_Transformer_v1(
    vocab_size=vocab_size,
    n_block=1,
    num_classes=num_classes,
    heads_num=heads_num,
    padding_idx=padding_idx,
    pos_stop_gradient=False,
    random_pos=True
)
# 排除所有的偏置和LayerNorm的参数
decay_params1 = [
    p.name for n, p in model1.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
# 定义 Optimizer
optimizer1 = paddle.optimizer.AdamW(
    learning_rate=5e-5,
    parameters=model1.parameters(),
    weight_decay=0.0,
    apply_decay_param_fun=lambda x: x in decay_params1)

runner1 = RunnerV3(model1, optimizer1, criterion, metric)
save_path1="./checkpoint/model_best1.pdparams"
runner1.train(train_loader, dev_loader, num_epochs=epochs, log_steps=100, eval_steps=500, save_path=save_path1)

# EVALUATE
model_path1 = "checkpoint/model_best1.pdparams"
runner1.load_model(model_path1)
accuracy1, _ =  runner1.evaluate(test_loader)

model2 = Model_Transformer_v1(
    vocab_size=vocab_size,
    n_block=2,
    num_classes=num_classes,
    heads_num=heads_num,
    padding_idx=padding_idx,
    pos_stop_gradient=False,
    random_pos=True
)
# 排除所有的偏置和LayerNorm的参数
decay_params2 = [
    p.name for n, p in model2.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
# 定义 Optimizer
optimizer2 = paddle.optimizer.AdamW(
    learning_rate=5e-5,
    parameters=model2.parameters(),
    weight_decay=0.0,
    apply_decay_param_fun=lambda x: x in decay_params2)

runner2 = RunnerV3(model2, optimizer2, criterion, metric)
save_path2="./checkpoint/model_best2.pdparams"
runner2.train(train_loader, dev_loader, num_epochs=epochs, log_steps=100, eval_steps=500, save_path=save_path2)

model_path2 = "checkpoint/model_best2.pdparams"
runner2.load_model(model_path2)
accuracy2, _ =  runner2.evaluate(test_loader)

print(f"1-layer transformer model with learnable positional embedding: Accuracy: {accuracy1:.5f}")
print(f"2-layer transformer model with learnable positional embedding: Accuracy: {accuracy2:.5f}")

W0727 14:07:20.769184  6196 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0727 14:07:20.803200  6196 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.


[Train] epoch: 0/3, step: 0/588, loss: 0.96230
[Train] epoch: 0/3, step: 100/588, loss: 0.70999
[Train] epoch: 1/3, step: 200/588, loss: 0.54233
[Train] epoch: 1/3, step: 300/588, loss: 0.43434
[Train] epoch: 2/3, step: 400/588, loss: 0.36233
[Train] epoch: 2/3, step: 500/588, loss: 0.18218
[Evaluate]  dev score: 0.74808, dev loss: 0.59430
[Evaluate] best accuracy performence has been updated: 0.00000 --> 0.74808
[Evaluate]  dev score: 0.75896, dev loss: 0.64236
[Evaluate] best accuracy performence has been updated: 0.74808 --> 0.75896
[Train] Training done!
[Train] epoch: 0/3, step: 0/588, loss: 0.83308
[Train] epoch: 0/3, step: 100/588, loss: 0.62887
[Train] epoch: 1/3, step: 200/588, loss: 0.54904
[Train] epoch: 1/3, step: 300/588, loss: 0.35240
[Train] epoch: 2/3, step: 400/588, loss: 0.20508
[Train] epoch: 2/3, step: 500/588, loss: 0.12596
[Evaluate]  dev score: 0.80272, dev loss: 0.52596
[Evaluate] best accuracy performence has been updated: 0.00000 --> 0.80272
[Evaluate]  dev sc

可以观察到，都采用可学习的位置编码时，双层的模型的精度（**0.80312**）显著高于单层模型的精度（**0.75352**）。

### 2.训练时候加入warmup的策略，并适当调整warmup的参数，并与不加warmup的实验进行对比。<span style="color:red">(附加题&加分题)</span>

In [3]:
model3 = Model_Transformer_v1(
    vocab_size=vocab_size,
    n_block=2,
    num_classes=num_classes,
    heads_num=heads_num,
    padding_idx=padding_idx,
    pos_stop_gradient=False,
    random_pos=True
)
# 排除所有的偏置和LayerNorm的参数
decay_params3 = [
    p.name for n, p in model3.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

# 定义 Optimizer
linear_warmup_scheduler = paddle.optimizer.lr.LinearWarmup(
        learning_rate=5e-5, warmup_steps=3, start_lr=0, end_lr=5e-5, verbose=False)
optimizer3 = paddle.optimizer.AdamW(
    learning_rate=linear_warmup_scheduler,
    parameters=model3.parameters(),
    weight_decay=0.0,
    apply_decay_param_fun=lambda x: x in decay_params3)

runner3 = RunnerV3(model3, optimizer3, criterion, metric)
save_path3="./checkpoint/model_best3.pdparams"
runner3.train(train_loader, dev_loader, num_epochs=epochs, log_steps=100, eval_steps=500, save_path=save_path3, scheduler=linear_warmup_scheduler)

model_path3 = "checkpoint/model_best3.pdparams"
runner3.load_model(model_path3)
accuracy3, _ =  runner3.evaluate(test_loader)
print(f"2-layer transformer model with learnable positional embedding and linear warm-up scheduler \n : Accuracy: {accuracy3:.5f}")

[Train] epoch: 0/3, step: 0/588, loss: 0.80914
[Train] epoch: 0/3, step: 100/588, loss: 0.63686
[Train] epoch: 1/3, step: 200/588, loss: 0.50187
[Train] epoch: 1/3, step: 300/588, loss: 0.28986
[Train] epoch: 2/3, step: 400/588, loss: 0.24994
[Train] epoch: 2/3, step: 500/588, loss: 0.11478
[Evaluate]  dev score: 0.80160, dev loss: 0.54093
[Evaluate] best accuracy performence has been updated: 0.00000 --> 0.80160
[Evaluate]  dev score: 0.80904, dev loss: 0.55051
[Evaluate] best accuracy performence has been updated: 0.80160 --> 0.80904
[Train] Training done!
2-layer transformer model with learnable positional embedding and linear warm-up scheduler 
 : Accuracy: 0.80408


适当调整线性学习率热身的参数之后，2层的带可学习位置编码的模型在测试集上的准确率从**0.80312**提高到了**0.80408**，提升不显著。

### 3. <span style="color:red">(附加题&简答题&加分题)</span>

    小明是一位建筑设计师，在工作中经常需要手工的查询一些建筑规范条文，规范条文非常的多，每次都需要手工翻阅那些规范条文，非常的不方便。小李是一位计算机的学生，当得知小明的情况后，想用技术手段帮助小明，最开始使用基于关键词的匹配，但是对字面上相同，语义不同和字面上不一样，但语义相似的情况不好处理，于是想利用神经网络的方法来解决这个问题，小李构建了双向LSTM的网络，并在LSTM上加了点积注意力，对于一个小批次的数据不同长度的句子，加入对齐字符进行了对齐，在计算注意力机制的时候，对于一些对齐的字符也计算注意力，并分配注意力权重，最后将训练好的网络融入到规范条文匹配中，请问上述的做法有什么问题？

问题在于**将用于补齐的字符``[pad]``直接纳入到了注意力机制的计算过程中**。

应当设置mask矩阵在``[pad]``对应的位置上赋给很大的负值，这样在得到的注意力分布中，对齐字符的权重接近于0，才能减少特殊字符对注意力机制的影响。