<a href="https://colab.research.google.com/github/STUPIDTREE/transformer/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

transformer实现

In [24]:
import torch
import torch.nn as nn
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import math
import warnings
from torch.nn.init import xavier_uniform_
from torch.nn.init import xavier_normal_
from torch.nn.init import constant_


# 词嵌入和位置编码


1.   Embeddings实现了把输入的语句转换为嵌入表示
2.   PositionalEncoding实现了三角函数位置编码



In [25]:
class Embeddings(nn.Module):
  def __init__(self, d_model, vocab):
    super().__init__()
    self.lut = nn.Embedding(vocab, d_model)
    self.d_model = d_model

  def forward(self, x):
    return self.lut(x) * math.sqrt(self.d_model)

In [68]:

class PositonalEncoding(nn.Module):
  def __init__(self, max_len=5000, d_model=512, dropout=0.1):
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    P = torch.zeros(1, max_len, d_model)
    pos = torch.arange(0, max_len).unsqueeze(1)
    i = torch.arange(0, d_model, 2)
    div_term = torch.exp(i * -(math.log(10000.0) / d_model))
    P[0, :, 0::2] = torch.sin(pos * div_term)
    P[0, :, 1::2] = torch.cos(pos * div_term)
    # self.P = Parameter(P, requires_grad=False) # 这种写法是作为模型参数但不参与梯度计算，即不变参数
    # 这种写法不出现在state_dict中，做为普通常数张量
    self.register_buffer('P', P)

  def forward(self, x):
    # 默认x形状为 (batch, seq_len, d_model)
    if x.dim() == 2:  # 说明缺少 batch 维度
      x = x.unsqueeze(0)  # 变成 (1, seq_len, d_model)
    # x = x + Variable(self.pe[:, :x.size(1)], requires_grad=False) #Variable为旧版本写法，Variable 现在已经合并到 Tensor，可以直接写成x.detach()
    x = x + self.P[:, :x.size(1), :]
    return self.dropout(x)



测试

In [71]:
x = torch.zeros((2, 4), dtype=torch.long)
emb_layer = Embeddings(8, 10)
emb = emb_layer(x)
print(emb.shape)

torch.Size([2, 4, 8])


In [28]:
pe_layer = PositonalEncoding(10, 8)
x = pe_layer(emb)
print(x.shape)

torch.Size([2, 4, 8])


# 编码器实现



1.   Encoder实现了编码器的多层结构，论文实现为6层；
2.   EncoderLayer实现了编码器每一层内部的两层连接，包含一个MHA层，及一个FFN层；
3.   SublayerConnection实现了MHA层和FFN层的子功能层+add&norm；
4.   MultiHeadAttention是MHA的实现，调用了attention函数，attention实现了QKV的矩阵计算；
5.   PositionwiseFeedForward实现了FFN全连接。



In [29]:
def attention(query, key, value, mask=None, dropout=None):
  d_k = query.size(-1) # 对应词嵌入维度？
  scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
  if mask is not None:
    scores = scores.masked_fill(mask == 0, -1e9)

  p_attn = F.softmax(scores, dim = -1)

  if dropout is not None:
    # 这样写对吗？
    p_attn = dropout(p_attn)

  return torch.matmul(p_attn, value), p_attn


In [30]:

class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, d_model, dropout=0.1):
    super().__init__()
    self.num_heads = num_heads
    self.dropout = dropout

    assert d_model % num_heads == 0
    self.d_k = d_model // num_heads
    self.h = num_heads

    self.linears = nn.ModuleList(nn.Linear(d_model, d_model) for _ in range(4))
    self.attn = None
    self.dropout = nn.Dropout(dropout)

  def forward(self, query, key, value, mask=None):
    if mask is not None:
      mask = mask.unsqueeze(1) # 代表多头中的第n头？

    num_batches = query.size(0)
    # q, k, v经过一次线性变换，重新调整形状，拆分为h个d_k维度的小块(多头注意力实现，和self-attention一样的算力，但效果更好),
    # [batch, seq_len, d_model] -> [batch, seq_len, h, d_k]
    query, key, value = [l(x).view(num_batches, -1, self.h, self.d_k).transpose(1, 2) for l, x in zip(self.linears, (query, key, value))]

    x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
    # [batch, seq_len, h, d_k] -> [batch, seq_len, d_model]
    x = x.transpose(1, 2).contiguous().view(num_batches, -1, self.h * self.d_k)

    return self.linears[-1](x)


In [31]:
mha_layer = MultiHeadAttention(8, 512)
q,k,v = torch.rand((2, 4, 512)), torch.rand((2, 4, 512)), torch.rand((2, 4, 512))
x = mha_layer(q, k, v)
print(x.shape)

torch.Size([2, 4, 512])


In [32]:
def clones(module, N):
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [33]:
class Encoder(nn.Module):
    def __init__(self, layer, N):
      super().__init__()
      self.layers = clones(layer, N)
      self.norm = LayerNorm(layer.d_model)

    def forward(self, x, mask):
      for layer in self.layers:
        x = layer(x,mask)
        return self.norm(x)

In [34]:
class SublayerConnection(nn.Module):
  def __init__(self, size, dropout):
    super().__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, sublayer):
    sublayer_out = sublayer(x)
    return self.norm(x + self.dropout(sublayer_out))


In [35]:
class EncoderLayer(nn.Module):
  def __init__(self, d_model, self_attn, feed_forward, dropout):
    super().__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(d_model, dropout), 2)
    self.d_model = d_model

  def forward(self, x, mask):
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
    z = self.sublayer[1](x, self.feed_forward)
    return z

In [36]:
class PositionwiseFeedForward(nn.Module):
  def __init__(self, d_model, d_ff, dropout=0.1):
    super().__init__()
    self.w1 =nn.Linear(d_model, d_ff)
    self.w2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    return self.w2(self.dropout(F.relu(self.w1(x))))

自定义LayerNorm函数，gamma和beta是可训练的参数。
如果你只是想用标准的层归一化，建议直接用 nn.LayerNorm。但如果你需要：

自定义 gamma 和 beta 的初始化
在 forward 中进行其他特殊操作
使用不同的归一化策略（如批量归一化或组归一化）
那么自定义 LayerNorm 可能更合适。

In [37]:
class LayerNorm(nn.Module):
  def __init__(self, feature_size, eps=1e-6):
    super().__init__()
    self.eps = eps
    # 这里需要重新理解
    self.gamma = nn.Parameter(torch.ones(feature_size))
    self.beta = nn.Parameter(torch.zeros(feature_size))

  def forward(self, x):
    mean = x.mean(-1, keepdim=True)
    std = x.std(-1, keepdim=True)
    return self.gamma * (x - mean) / (std + self.eps) + self.beta


In [38]:
def subsequent_mask(size):
  attn_shape = (1, size, size)
  subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')

  return torch.from_numpy(subsequent_mask) == 0

# 解码器实现
解码器部分结构：


1.   Decoder,实现了解码器的6层组成结构；
2.   DecoderLayer，实现了每一个解码器层的实现，由三部分组成，自注意力mask-MHA，交叉注意力MHA，及全连接层FFN；
3.   Geneator, 解码器最终的输出层，包含一个线性映射层和一个softmax层。

In [39]:
class Decoder(nn.Module):
  def __init__(self, layer, N):
    super().__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.d_model)

  def forward(self, x, memory, src_mask, tgt_mask):
    for layer in self.layers:
      x = layer(x, memory, src_mask, tgt_mask)
    return self.norm(x)

In [40]:
class DecoderLayer(nn.Module):
  def __init__(self, d_model, self_attn, src_attn, feed_forward, dropout):
    super().__init__()
    self.d_model = d_model
    self.self_attn = self_attn
    self.src_attn = src_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(d_model, dropout), 3)

  def forward(self, x, memory, src_mask, tgt_mask):
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
    x = self.sublayer[1](x, lambda x: self.src_attn(x, memory, memory, src_mask))

    return self.sublayer[2](x, self.feed_forward)


In [41]:



class Generator(nn.Module):
  def __init__(self, d_model, vocab):
    super().__init__()
    self.proj = nn.Linear(d_model, vocab)

  def forward(self, x):
    return F.log_softmax(self.proj(x), dim=-1)

# 模型构造

In [42]:
class EncoderDecoder(nn.Module):
  def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embed = src_embed
    self.tgt_embed = tgt_embed
    self.generator = generator

  def forward(self, x, tgt, src_mask, tgt_mask):
    memory = self.encoder(self.src_embed(x), src_mask)
    x = self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
    return self.generator(x)


In [43]:
import copy
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
  dc = copy.deepcopy
  attn = MultiHeadAttention(h, d_model)
  ff = PositionwiseFeedForward(d_model, d_ff, dropout)
  pos = PositonalEncoding(d_model=d_model, dropout=dropout)
  encoder = Encoder(EncoderLayer(d_model, dc(attn), dc(ff), dropout), N)
  decoder = Decoder(DecoderLayer(d_model, dc(attn), dc(attn), dc(ff), dropout), N)
  src_embed = nn.Sequential(Embeddings(d_model, src_vocab), dc(pos))
  tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab), dc(pos))
  generator = Generator(d_model, tgt_vocab)
  model = EncoderDecoder(
      encoder, decoder, src_embed, tgt_embed, generator
  )

  for p in model.parameters():
    if p.dim() > 1:
      nn.init.xavier_uniform_(p)
  return model


# 数据集生成

In [44]:
import numpy as np
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]    # decoder的输入（即期望输出除了最后一个token以外的部分)
            self.trg_y = trg[:, 1:]   # decoder的期望输出（trg基础上再删去句子起始符）
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """
        Create a mask to hide padding and future words.
        padd 和 future words 均在mask中用0表示
        """
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)
        return tgt_mask


# Synthetic Data
def data_gen(V, slen, batch, nbatches, device):
    """
    Generate random data for a src-tgt copy task.
    V: 词典数量，取值范围[0, V-1]，约定0作为特殊符号使用代表padding
    slen: 生成的序列数据的长度
    batch: batch_size
    nbatches: number of batches to generate
    """
    for i in range(nbatches):

        data = torch.from_numpy(np.random.randint(2, V, size=(batch, slen)))
        # 约定输出为输入除去序列第一个元素，即向后平移一位进行输出，同时输出数据要在第一个时间步添加一个起始符
        # 因此，加入输入数据为  [3, 4, 2, 6, 4, 5]
        # ground truth输出为 [1, 4, 2, 6, 4, 5]
        tgt_data = data.clone()
        tgt_data[:, 0] = 1   # 将序列的第一个时间步置为1(即约定的起始符)，即可完成GT数据的构造
        src = data.clone().detach().to(data.device)
        tgt = tgt_data.clone().detach().to(data.device)
        if device == "cuda":
            src = src.cuda()
            tgt = tgt.cuda()
        yield Batch(src, tgt, 0)


# test data_gen
data_iter = data_gen(V=5, slen=10, batch=2, nbatches=10, device="cpu")
for i, batch in enumerate(data_iter):
    print("\nbatch.src")
    print(batch.src.shape)
    print(batch.src)
    print("\nbatch.trg")
    print(batch.trg.shape)
    print(batch.trg)
    print("\nbatch.trg_y")
    print(batch.trg_y.shape)
    print(batch.trg_y)
    print("\nbatch.src_mask")
    print(batch.src_mask.shape)
    print(batch.src_mask)
    print("\nbatch.trg_mask")
    print(batch.trg_mask.shape)
    print(batch.trg_mask)
    break
#raise RuntimeError()


batch.src
torch.Size([2, 10])
tensor([[2, 3, 4, 2, 4, 3, 3, 4, 4, 2],
        [3, 2, 4, 2, 3, 3, 3, 4, 4, 4]])

batch.trg
torch.Size([2, 9])
tensor([[1, 3, 4, 2, 4, 3, 3, 4, 4],
        [1, 2, 4, 2, 3, 3, 3, 4, 4]])

batch.trg_y
torch.Size([2, 9])
tensor([[3, 4, 2, 4, 3, 3, 4, 4, 2],
        [2, 4, 2, 3, 3, 3, 4, 4, 4]])

batch.src_mask
torch.Size([2, 1, 10])
tensor([[[True, True, True, True, True, True, True, True, True, True]],

        [[True, True, True, True, True, True, True, True, True, True]]])

batch.trg_mask
torch.Size([2, 9, 9])
tensor([[[ True, False, False, False, False, False, False, False, False],
         [ True,  True, False, False, False, False, False, False, False],
         [ True,  True,  True, False, False, False, False, False, False],
         [ True,  True,  True,  True, False, False, False, False, False],
         [ True,  True,  True,  True,  True, False, False, False, False],
         [ True,  True,  True,  True,  True,  True, False, False, False],
         

# 训练

In [45]:
def run_epoch(data_iter, model, loss_compute, device=None):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg,
                            batch.src_mask, batch.trg_mask)
        # print("out.shape = ", out.shape)
        # print("batch.trg_y.shape = ", batch.trg_y.shape)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                    (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens


class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, true_dist.detach())


class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        """
        norm: loss的归一化系数，用batch中所有有效token数即可
        """
        # print("x.shape = ", x.shape)
        # print("y.shape = ", y.shape)
        # x = self.generator(x)
        x_ = x.contiguous().view(-1, x.size(-1))
        y_ = y.contiguous().view(-1)
        loss = self.criterion(x_, y_)
        loss /= norm
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.zero_grad()
        return loss.item() * norm

In [70]:
import time
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# -----------------------------------
# A Easy Example
# -----------------------------------
# Train the simple copy task.
device = "cuda"
nrof_epochs = 40 # 40
batch_size = 32 # 32
V = 11    # 词典的数量
sequence_len = 15  # 生成的序列数据的长度
nrof_batch_train_epoch = 30    # 训练时每个epoch多少个batch
nrof_batch_valid_epoch = 10    # 验证时每个epoch多少个batch
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2)
#optimizer = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9)
if device == "cuda":
    model.cuda()

for epoch in range(nrof_epochs):
    print(f"\nepoch {epoch}")
    print("train...")
    model.train()
    data_iter = data_gen(V, sequence_len, batch_size, nrof_batch_train_epoch, device)
    loss_compute = SimpleLossCompute(model.generator, criterion, optimizer)
    train_mean_loss = run_epoch(data_iter, model, loss_compute, device)
    print("valid...")
    model.eval()
    valid_data_iter = data_gen(V, sequence_len, batch_size, nrof_batch_valid_epoch, device)
    valid_loss_compute = SimpleLossCompute(model.generator, criterion, None)
    valid_mean_loss = run_epoch(valid_data_iter, model, valid_loss_compute, device)
    print(f"valid loss: {valid_mean_loss}")


# greedy decode
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    embed = model.src_embed(src)
    memory = model.encoder(embed, src_mask)
    # ys代表目前已生成的序列，最初为仅包含一个起始符的序列，不断将预测结果追加到序列最后
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for i in range(max_len-1):
        ys_embed = model.tgt_embed(ys)
        out = model.decoder(ys_embed, memory, src_mask,
                           subsequent_mask(ys.size(1)).type_as(src.data))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        # next_word = next_word.item()
        # ys = torch.cat([ys, torch.ones(1, 1, dtype=torch.long, device=src.device).fill_(next_word)], dim=1)
        next_word = next_word.data[0]
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys

print("greedy decode")
model.eval()
src = torch.LongTensor([1, 2, 3, 4, 4, 6, 7, 8, 10, 10]).cuda()
src_mask = torch.ones(1, 1, 10).cuda()
pred_result = greedy_decode(model, src, src_mask, max_len=10, start_symbol=1)
print(pred_result[:, 1:])


epoch 0
train...
Epoch Step: 1 Loss: 2.583433 Tokens per Sec: 8827.085938
valid...
Epoch Step: 1 Loss: 1.940773 Tokens per Sec: 11004.223633
valid loss: 1.9794069528579712

epoch 1
train...
Epoch Step: 1 Loss: 3.508346 Tokens per Sec: 11272.271484
valid...
Epoch Step: 1 Loss: 2.257387 Tokens per Sec: 12345.225586
valid loss: 2.215728998184204

epoch 2
train...
Epoch Step: 1 Loss: 3.143415 Tokens per Sec: 8437.614258
valid...
Epoch Step: 1 Loss: 2.389766 Tokens per Sec: 9935.535156
valid loss: 2.458388328552246

epoch 3
train...
Epoch Step: 1 Loss: 2.960128 Tokens per Sec: 9040.274414
valid...
Epoch Step: 1 Loss: 2.104912 Tokens per Sec: 11155.229492
valid loss: 2.0838000774383545

epoch 4
train...
Epoch Step: 1 Loss: 2.663283 Tokens per Sec: 10862.366211
valid...
Epoch Step: 1 Loss: 2.073424 Tokens per Sec: 12312.019531
valid loss: 2.0655457973480225

epoch 5
train...
Epoch Step: 1 Loss: 2.663819 Tokens per Sec: 11064.809570
valid...
Epoch Step: 1 Loss: 2.392645 Tokens per Sec: 12363.