# Transformer Note
- 代码完全参考[The Annotated Transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html)
- 仅补充部分个人理解与分析

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt

def cloneModel(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``.

    Each copy owns its own parameters, so the clones do not share weights.
    """
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

## 模型本身构成分析
- 定义使用的`LayerNorm`函数，将输入归一化后映射处理
- `mapping`初始化为全1、`bias`初始化为全0，二者均为可学习参数，因此初始状态下该层只做标准化

In [None]:
class LayerNorm(nn.Module):
    """Standardise the last dimension, then apply a learnable scale and shift."""

    def __init__(self, featureNum, eps=1e-6):
        """
        Args:
            featureNum: size of the last dimension of the inputs.
            eps: constant added to the standard deviation before dividing.
        """
        super().__init__()
        self.eps = eps
        # Scale starts at one and shift at zero, so a fresh layer only standardises.
        self.mapping = nn.Parameter(torch.ones(featureNum))
        self.bias = nn.Parameter(torch.zeros(featureNum))

    def forward(self, x):
        """Return ``mapping * (x - mean) / (std + eps) + bias`` over the last axis."""
        centred = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.mapping * centred / spread + self.bias

- 残差连接层：先用`LayerNorm`对输入归一化，再送入指定的网络层`sublayer`（`Attention`或`Feedforward`），经`Dropout`后将结果与原输入相加（整个过程中`Embedding`的维度不发生变化）

In [None]:
class ResidualConnection(nn.Module):
    """Pre-norm residual wrapper computing ``x + dropout(sublayer(norm(x)))``."""

    def __init__(self, featureNum, dropout):
        """
        Args:
            featureNum: size of the last dimension, passed to ``LayerNorm``.
            dropout: dropout probability applied to the sublayer output.
        """
        super().__init__()
        self.norm = LayerNorm(featureNum)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalised input and add the result to ``x``.

        ``sublayer`` must return a tensor that can be summed with ``x``,
        otherwise the residual addition fails.
        """
        normalised = self.norm(x)
        branch = self.dropout(sublayer(normalised))
        return x + branch

- 进行Encode处理，每穿过`Attention`层后进行一次`Feedforward`，采用残差链接的方式
- 传入的`x`同时作为`key`、`query`和`value`；注意三者会经过三个不同的`Linear`层进行映射，以进行`self-attention`的相关计算

In [None]:
class Encoder(nn.Module):
    """Stack of ``N`` cloned encoder layers followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        """
        Args:
            layer: prototype encoder layer; it is deep-copied ``N`` times.
            N: number of stacked layers.
        """
        super(Encoder, self).__init__()
        self.layers = cloneModel(layer, N)
        # The layers in this file expose their width as ``featureNum``; the
        # original code read ``layer.size``, which those layers do not define.
        # ``size`` is still honoured for layers that only provide that name.
        width = getattr(layer, "featureNum", None)
        if width is None:
            width = layer.size
        self.norm = LayerNorm(width)

    def forward(self, x, mask):
        """Run ``x`` through every layer with the same ``mask``, then normalise."""
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each in a residual wrapper."""

    def __init__(self, featureNum, selfAttn, feedforward, dropout):
        """
        Args:
            featureNum: feature width handed to the residual wrappers.
            selfAttn: attention module called as ``selfAttn(q, k, v, mask)``.
            feedforward: module applied after the attention step.
            dropout: dropout probability used inside the residual wrappers.
        """
        super().__init__()
        self.selfAttn = selfAttn
        self.feedforward = feedforward
        # Two residual wrappers: index 0 for attention, index 1 for feed-forward.
        self.sublayer = cloneModel(ResidualConnection(featureNum, dropout), 2)
        self.featureNum = featureNum

    def forward(self, x, mask):
        """Self-attend over ``x`` (query = key = value), then apply the feed-forward."""

        def attend(normed):
            return self.selfAttn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feedforward)

- Mask的创建操作

In [None]:
def subsquentMask(featureNum):
    """Build a mask that hides later positions.

    Returns a tensor of shape ``(1, featureNum, featureNum)`` whose entry
    ``[0, i, j]`` is true when ``j <= i`` and false for the strictly upper
    triangle.
    """
    # Ones strictly above the diagonal mark the entries to block.
    upper = np.triu(np.ones((1, featureNum, featureNum)), k=1)
    blocked = torch.from_numpy(upper.astype('uint8'))
    return blocked == 0

- 每个`Attention`的定义，利用矩阵相乘求取所得的`Attention`会经由`Softmax`函数进行进一步拉伸
- $Attention(Q, K, V) = Softmax(\frac{Q·K^T}{\sqrt{Dim}})·V$

In [None]:
def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors whose last two dimensions are multiplied
            as ``query @ key^T`` and ``weights @ value``.
        mask: optional tensor; positions where ``mask == 0`` get a score of
            ``-1e9`` before the softmax.
        dropout: optional callable applied to the attention weights.

    Returns:
        A tuple ``(weighted values, attention weights)``.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        # A large negative score makes the softmax weight effectively zero.
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights

- 四个`Linear`层中的前三个分别对`key`，`query`，`value`进行线性映射，调整其`Embedding`后计算`Attention`，最后一层`Linear`用于对`Attention`的计算结果进行线性映射

In [None]:
class MultiheadAttention(nn.Module):
    """Multi-head attention with three input projections and one output projection."""

    def __init__(self, headNum, dimension, dropout=0.1):
        """
        Args:
            headNum: number of attention heads.
            dimension: model width; must be divisible by ``headNum``.
            dropout: dropout probability applied to the attention weights.

        Raises:
            ValueError: if ``dimension`` is not a multiple of ``headNum``.
        """
        super(MultiheadAttention, self).__init__()
        if dimension % headNum != 0:
            raise ValueError(
                "dimension (%d) must be divisible by headNum (%d)" % (dimension, headNum)
            )
        # Width of each head.
        self.vocabDim = dimension // headNum
        self.headNum = headNum
        # linears[0..2] project query/key/value, linears[3] projects the merged heads.
        self.linears = cloneModel(nn.Linear(dimension, dimension), 4)
        # Attention weights of the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, merge the heads and project again.

        The inputs are reshaped with ``view(batch, -1, headNum, vocabDim)``, so
        their last dimension is expected to equal ``dimension``.
        """
        if mask is not None:
            # Add a head axis so the same mask broadcasts over every head.
            mask = mask.unsqueeze(1)
        batchSize = query.size(0)
        # Split the projected features into heads and move the head axis in
        # front of the sequence axis: (batch, heads, seq, vocabDim). Without
        # this transpose the attention would be computed across heads instead
        # of across sequence positions.
        query, key, value = [
            linear(x).view(batchSize, -1, self.headNum, self.vocabDim).transpose(1, 2)
            for linear, x in zip(self.linears, (query, key, value))
        ]
        x, self.attn = attention(query, key, value, mask, self.dropout)
        # Merge the heads back into one feature axis of width
        # headNum * vocabDim so the output projection sees ``dimension`` features.
        x = x.transpose(1, 2).contiguous().view(batchSize, -1, self.headNum * self.vocabDim)
        return self.linears[-1](x)

- `Feedforward`层，将所得的`Embedding`进行一次维度变化和还原来扰动和调整输出结果

In [None]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: expand, ReLU, dropout, project back."""

    def __init__(self, dimension, feedforwardDim, dropout=0.1):
        """
        Args:
            dimension: input and output width.
            feedforwardDim: width of the hidden layer.
            dropout: dropout probability applied after the ReLU.
        """
        super().__init__()
        self.linear1 = nn.Linear(dimension, feedforwardDim)
        self.linear2 = nn.Linear(feedforwardDim, dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``linear2(dropout(relu(linear1(x))))``."""
        hidden = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(hidden)

- `Embedding`层，将词映射到指定维度的词向量

In [None]:
class Embedding(nn.Module):
    """Token embedding lookup scaled by ``sqrt(dimension)``."""

    def __init__(self, dimension, vocab):
        """
        Args:
            dimension: width of each embedding vector.
            vocab: number of rows in the embedding table.
        """
        super().__init__()
        self.dimension = dimension
        # A freshly created nn.Embedding table; no pre-trained vectors are loaded here.
        self.embed = nn.Embedding(vocab, dimension)

    def forward(self, x):
        """Look up the embeddings for ``x`` and multiply them by ``sqrt(dimension)``."""
        scale = math.sqrt(self.dimension)
        return self.embed(x) * scale

- 对于位置编码，其要求在指定的`Sequence`长度范围内，能够衡量两两`token`间的位置距离且各`token`的位置表示独特唯一，并且值位于`[-1, 1]`区间之内
- 采用$PosEncode_{(position, 2i)} = \sin(position/10000^{2i/dimension})$与$PosEncode_{(position, 2i+1)} = \cos(position/10000^{2i/dimension})$对位置进行编码计算，其中$i$为`Embedding`中每一对（偶数维、奇数维）的下标

In [None]:
class PosEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout."""

    def __init__(self, dimension, dropout, maxSeqLen=5000):
        """
        Args:
            dimension: width of the encoding (last input dimension).
            dropout: dropout probability applied after adding the encoding.
            maxSeqLen: number of positions precomputed in the table.
        """
        super(PosEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        posEncode = torch.zeros(maxSeqLen, dimension)
        # Explicit float dtype keeps the arithmetic independent of integer promotion.
        position = torch.arange(0, maxSeqLen, dtype=torch.float).unsqueeze(1)
        divTerm = torch.exp(
            torch.arange(0, dimension, 2, dtype=torch.float) * -(math.log(10000.0) / dimension)
        )
        angles = position * divTerm
        # Fill the 2-D table first: even feature columns take sin, odd columns
        # take cos. The batch axis is added afterwards so these slices address
        # the feature axis rather than the sequence axis.
        posEncode[:, 0::2] = torch.sin(angles)
        # With an odd dimension there is one fewer odd column than even columns.
        posEncode[:, 1::2] = torch.cos(angles[:, : dimension // 2])
        # A buffer moves with the module (device, state_dict) but is not trained.
        self.register_buffer('posEncode', posEncode.unsqueeze(0))

    def forward(self, x):
        """Add the first ``x.size(1)`` rows of the table to ``x`` and apply dropout."""
        x = x + self.posEncode[:, :x.size(1)]
        return self.dropout(x)

- `Decoder`部分，用于进行解码工作，将Encode的结果，根据网络记忆内容（能看到之前时刻的，不超过最大Sequence限长的累计输入），根据原先的Mask映射到原本输入上

In [None]:
class Decoder(nn.Module):
    """Stack of ``N`` cloned decoder layers followed by a final ``LayerNorm``."""

    def __init__(self, layer, N):
        """
        Args:
            layer: prototype decoder layer; it is deep-copied ``N`` times.
            N: number of stacked layers.
        """
        super(Decoder, self).__init__()
        self.layers = cloneModel(layer, N)
        # The layers in this file expose their width as ``featureNum``; the
        # original code read ``layer.size``, which those layers do not define.
        # ``size`` is still honoured for layers that only provide that name.
        width = getattr(layer, "featureNum", None)
        if width is None:
            width = layer.size
        self.norm = LayerNorm(width)

    def forward(self, x, memory, srcMask, tarMask):
        """Run ``x`` through every layer with the same memory and masks, then normalise."""
        for layer in self.layers:
            x = layer(x, memory, srcMask, tarMask)
        return self.norm(x)

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, attention over memory, feed-forward."""

    def __init__(self, featureNum, selfAttn, srcAttn, feedforward, dropout):
        """
        Args:
            featureNum: feature width handed to the residual wrappers.
            selfAttn: attention module applied to the decoder input itself.
            srcAttn: attention module whose key and value are ``memory``.
            feedforward: module applied after both attention steps.
            dropout: dropout probability used inside the residual wrappers.
        """
        super().__init__()
        self.featureNum = featureNum
        self.selfAttn = selfAttn
        self.srcAttn = srcAttn
        self.feedforward = feedforward
        # Three residual wrappers, one per sub-step in forward().
        self.sublayer = cloneModel(ResidualConnection(featureNum, dropout), 3)

    def forward(self, x, memory, srcMask, tarMask):
        """Apply the three sub-steps in order and return the result."""

        def attendSelf(normed):
            # Query, key and value all come from the decoder input; tarMask limits it.
            return self.selfAttn(normed, normed, normed, tarMask)

        def attendMemory(normed):
            # The query comes from the decoder state; key and value are `memory`,
            # which is why srcMask is the mask used here.
            return self.srcAttn(normed, memory, memory, srcMask)

        afterSelf = self.sublayer[0](x, attendSelf)
        afterMemory = self.sublayer[1](afterSelf, attendMemory)
        return self.sublayer[2](afterMemory, self.feedforward)

- 整体`Transformer`的训练流程模型

In [None]:
class EncoderDecoder(nn.Module):
    """Container wiring source/target embeddings, encoder, decoder and generator."""

    def __init__(self, encoder, decoder, srcEmbed, tarEmbed, generator):
        """Store the five components; ``generator`` is kept but not called by ``forward``."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.srcEmbed = srcEmbed
        self.tarEmbed = tarEmbed
        self.generator = generator

    def forward(self, src, tar, srcMask, tarMask):
        """Encode ``src``, then decode ``tar`` against the encoded memory."""
        memory = self.encode(src, srcMask)
        return self.decode(memory, srcMask, tar, tarMask)

    def encode(self, src, srcMask):
        """Embed ``src`` and pass it through the encoder."""
        embedded = self.srcEmbed(src)
        return self.encoder(embedded, srcMask)

    def decode(self, memory, srcMask, tar, tarMask):
        """Embed ``tar`` and pass it through the decoder together with ``memory``."""
        embedded = self.tarEmbed(tar)
        return self.decoder(embedded, memory, srcMask, tarMask)

- 最终输出的生成，进行向输入词向量维度的还原映射工作

In [None]:
class Generator(nn.Module):
    """Project decoder features to vocabulary size and return log-probabilities."""

    def __init__(self, dimension, vocab):
        """
        Args:
            dimension: width of the incoming features.
            vocab: number of output classes (size of the last output axis).
        """
        super(Generator, self).__init__()
        self.linear = nn.Linear(dimension, vocab)

    def forward(self, x):
        """Return ``log_softmax`` of the projection over the vocabulary axis."""
        # The vocabulary is the last axis of the linear output. The previous
        # dim=1 normalised over the wrong axis whenever x has more than two
        # dimensions (e.g. batch, sequence, feature).
        return F.log_softmax(self.linear(x), dim=-1)

In [1]:
def buildModel(srcVocab, tarVocab, N=6, dimension=512, feedforwardDim=2048, headNum=8, dropout=0.1):
    """Assemble an ``EncoderDecoder`` model and initialise its weight matrices.

    Args:
        srcVocab: size of the source embedding table.
        tarVocab: size of the target embedding table and of the generator output.
        N: number of encoder layers and of decoder layers.
        dimension: model width.
        feedforwardDim: hidden width of the feed-forward blocks.
        headNum: number of attention heads.
        dropout: dropout probability passed to the feed-forward, position
            encoding and residual layers (the attention module is built with
            its own default).

    Returns:
        The constructed model.
    """
    cp = copy.deepcopy
    # Prototypes; each use below receives its own deep copy so no weights are shared.
    attentions = MultiheadAttention(headNum, dimension)
    feedforward = PositionwiseFeedForward(dimension, feedforwardDim, dropout)
    position = PosEncoding(dimension, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(dimension, cp(attentions), cp(feedforward), dropout), N),
        Decoder(DecoderLayer(dimension, cp(attentions), cp(attentions), cp(feedforward), dropout), N),
        nn.Sequential(Embedding(dimension, srcVocab), cp(position)),
        nn.Sequential(Embedding(dimension, tarVocab), cp(position)),
        Generator(dimension, tarVocab),
    )
    # Xavier-initialise every parameter with more than one dimension (weight
    # matrices); 1-D parameters such as biases keep their default values.
    # The loop previously referenced an undefined name `p` and the deprecated
    # non-underscore initialiser.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model

## 模型训练部分函数分析
- 此处mask对输入信息进行限制，即在按顺序读取的假设前提下，不会知道在当前输入之后的信息
- 进行mask后能够屏蔽后续输入的原因可以[查看此处](https://blog.csdn.net/qq_35169059/article/details/101678207)

In [None]:
class Batch:
    """Bundle one batch of source/target tensors together with their masks."""

    def __init__(self, src, tar=None, padding=0):
        """
        Args:
            src: source tensor; entries equal to ``padding`` are masked out.
            tar: optional target tensor. When given, it is split into the
                decoder input (all but the last column) and the expected
                output (all but the first column).
            padding: value that marks padding positions.
        """
        self.src = src
        self.srcMask = (src != padding).unsqueeze(-2)
        if tar is None:
            return
        # tarX feeds the decoder; tarY is the same sequence shifted by one step.
        self.tarX = tar[:, :-1]
        self.tarY = tar[:, 1:]
        self.tarMask = self.makeStdMask(self.tarX, padding)
        # Number of non-padding entries in the expected output.
        self.ntokens = (self.tarY != padding).data.sum()

    @staticmethod
    def makeStdMask(target, padding):
        """Combine the padding mask of ``target`` with the subsequent-position mask."""
        padMask = (target != padding).unsqueeze(-2)
        causal = Variable(subsquentMask(target.size(-1)).type_as(padMask.data))
        return padMask & causal

- 对采用的`Adam`优化方法在训练时进行进一步的学习率管理，使其更适用于Transformer的训练调优

In [None]:
class NoamOptimizer:
    """Wrap an optimizer and set its learning rate from a warmup/decay schedule."""

    def __init__(self, featureNum, factor, warmup, optimizer):
        """
        Args:
            featureNum: model width used in the schedule.
            factor: constant multiplier of the schedule.
            warmup: number of warmup steps.
            optimizer: wrapped optimizer; must expose ``param_groups`` and ``step()``.
        """
        self.optimizer = optimizer
        self.featureNum = featureNum
        self.factor = factor
        self.warmup = warmup
        self._step = 0
        self._rate = 0

    def step(self):
        """Advance the step counter, write the new rate to every group, then step."""
        self._step += 1
        newRate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = newRate
        self._rate = newRate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the scheduled learning rate for ``step`` (default: current step).

        The rate is ``factor * featureNum**-0.5 * min(step**-0.5, step * warmup**-1.5)``.
        """
        current = self._step if step is None else step
        decay = current ** (-0.5)
        ramp = current * (self.warmup) ** (-1.5)
        return self.factor * (self.featureNum ** (-0.5) * min(decay, ramp))

def getOptimizer(model):
    """Wrap an Adam optimizer over ``model.parameters()`` in a ``NoamOptimizer``.

    The schedule uses the ``dimension`` of ``model.srcEmbed[0]``, a factor of
    2 and 4000 warmup steps; Adam starts at ``lr=0`` because the wrapper sets
    the rate on every step.
    """
    modelDim = model.srcEmbed[0].dimension
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOptimizer(modelDim, 2, 4000, adam)

- 任务特定的`Loss`计算会将`Loss`向后传播，并调用上述封装后的`Adam`优化器进行参数更新

In [None]:
class LossCompute:
    """Callable that turns model output into a loss, backpropagates and steps."""

    def __init__(self, generator, criterion, opt=None):
        """
        Args:
            generator: callable applied to the model output before the loss.
            criterion: callable ``criterion(prediction, target)`` returning the loss.
            opt: optional optimizer wrapper exposing ``step()`` and
                ``optimizer.zero_grad()``; when ``None`` only the backward
                pass is run.
        """
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        """Compute the normalised loss, backpropagate, and optionally update.

        Args:
            x: model output, passed through ``generator``.
            y: targets; flattened to one dimension for the criterion.
            norm: value the loss is divided by before the backward pass.

        Returns:
            The loss value multiplied back by ``norm``.
        """
        x = self.generator(x)
        # Flatten every leading axis so the criterion sees (items, classes) vs (items,).
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)) / norm
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        # .item() reads the scalar; indexing a 0-dim tensor with [0] raises.
        return loss.item() * norm

- 故我们每次训练的`Epoch`执行逻辑如下。

In [None]:
def runEpoch(dataIter, model, lossCompute):
    """Run one pass over ``dataIter`` and return the total loss per token.

    Args:
        dataIter: iterable of batches exposing ``src``, ``tarX``, ``srcMask``,
            ``tarMask``, ``tarY`` and ``ntokens``.
        model: object whose ``forward(src, tarX, srcMask, tarMask)`` is called.
        lossCompute: callable ``lossCompute(output, tarY, ntokens)`` returning
            the batch loss; it is responsible for any output conversion.

    Returns:
        Sum of the batch losses divided by the sum of ``ntokens``.
    """
    lossSum, tokenSum = 0, 0
    for step, batch in enumerate(dataIter):
        prediction = model.forward(batch.src, batch.tarX, batch.srcMask, batch.tarMask)
        batchLoss = lossCompute(prediction, batch.tarY, batch.ntokens)
        lossSum += batchLoss
        tokenSum += batch.ntokens
        # Report progress on steps 1, 101, 201, ...
        if step % 100 == 1:
            print("Epoch Step: %d Loss: %f" %(step, batchLoss / batch.ntokens))
    return lossSum / tokenSum