![Transformer 架构图](https://www.runoob.com/wp-content/uploads/2025/03/Transformer_full_architecture.png)

```
词嵌入（✳）
位置编码（✳）
---
归一化操作
前馈神经网络（增强非线性变换）
残差连接
---
注意力机制
多头自注意力
掩码注意力
交叉注意力
```

In [11]:
import math

# 导入库文件
import torch
from torch import Tensor
from torch import nn as nn
import torch.nn.functional as F

In [12]:
# Embedding (词向量编码)
class TokenEmbedding(nn.Embedding):
    """Token-id to dense-vector lookup table built on ``nn.Embedding``.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_modl: Width of each embedding vector. The spelling is kept as-is
            so existing keyword-argument callers keep working.
        padding_idx: Index forwarded to ``nn.Embedding`` as ``padding_idx``.
            Defaults to 1, the value that used to be hard-coded here.
    """
    def __init__(self, vocab_size, d_modl, padding_idx=1):
        super(TokenEmbedding, self).__init__(vocab_size, d_modl, padding_idx=padding_idx)

# PositionalEmbedding (位置编码)
class PositionalEmbedding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional encoding table.

    The table has shape ``(max_len, d_model)``; even columns hold
    ``sin(pos / 10000^(2i/d_model))`` and odd columns hold the matching
    cosine.

    Args:
        d_model: Width of each positional vector (odd values are supported).
        max_len: Number of positions precomputed in the table.
        device: Device on which the table is initially created.
    """
    def __init__(self, d_model, max_len, device):
        super(PositionalEmbedding, self).__init__()
        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        angle = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros(max_len, d_model, device=device)
        encoding[:, 0::2] = torch.sin(angle)
        # With an odd d_model there is one fewer odd column than even
        # columns, so trim the angles to avoid a shape mismatch.
        encoding[:, 1::2] = torch.cos(angle[:, : d_model // 2])

        # A buffer follows the module through .to()/.cuda()/.double(), is
        # never a trainable parameter, and (persistent=False) adds no key to
        # state_dict, so existing checkpoints still load.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the first ``seq_len`` rows of the table.

        Args:
            x: 2-D tensor ``(batch_size, seq_len)``; only its size is read.

        Returns:
            Tensor of shape ``(seq_len, d_model)``.
        """
        _, seq_len = x.size()
        return self.encoding[:seq_len, :]
 
# Combined embedding (token embedding + positional encoding)
class TransformerEmbedding(nn.Module):
    """Sum of token embedding and positional encoding, followed by dropout.

    Args:
        vocab_size: Vocabulary size passed to the token embedding.
        d_model: Embedding width shared by both embeddings.
        max_len: Number of positions passed to the positional embedding.
        drop_prob: Dropout probability applied to the summed embedding.
        device: Device passed to the positional embedding.
    """
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEmbedding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Embed the token ids in ``x`` and add positional information."""
        # The positional table is broadcast over the batch dimension.
        combined = self.tok_emb(x) + self.pos_emb(x)
        return self.drop_out(combined)

In [13]:
# 多头注意力机制
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: Width of the input and output features.
        n_head: Number of heads; each head works on ``d_model // n_head``
            features.
    """
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.d_model = d_model
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_combine = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Query tensor ``(batch, q_len, d_model)``.
            k: Key tensor ``(batch, k_len, d_model)``.
            v: Value tensor ``(batch, k_len, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch, n_head, q_len, k_len)``. Positions where
                ``mask == 0`` get their score replaced by -1000 before the
                softmax.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch, q_len, dimension = q.shape
        n_d = self.d_model // self.n_head
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)

        # Keys/values may have a different sequence length than the queries
        # (cross-attention), so each tensor is reshaped with its own length.
        q = q.view(batch, q_len, self.n_head, n_d).permute(0, 2, 1, 3)
        k = k.view(batch, k.shape[1], self.n_head, n_d).permute(0, 2, 1, 3)
        v = v.view(batch, v.shape[1], self.n_head, n_d).permute(0, 2, 1, 3)

        score = q @ k.transpose(2, 3) / math.sqrt(n_d)
        if mask is not None:
            score = score.masked_fill(mask == 0, -1000)
        score = self.softmax(score) @ v

        # Merge the heads back into a single feature dimension.
        score = score.permute(0, 2, 1, 3).contiguous().view(batch, q_len, dimension)
        output = self.w_combine(score)
        return output
# Smoke test: run self-attention on a random batch and inspect the result.
# Random input of shape (128, 32, 512); the last dimension matches d_model=512.
xi = torch.rand(128,32,512)
attention=MultiHeadAttention(d_model=512,n_head=8)
# Self-attention: the same tensor is used as query, key and value.
out = attention(xi,xi,xi)
print(out)
print(out.shape)

tensor([[[ 0.0114,  0.2448,  0.0425,  ..., -0.1600, -0.2929, -0.2216],
         [ 0.0111,  0.2451,  0.0417,  ..., -0.1595, -0.2934, -0.2221],
         [ 0.0115,  0.2451,  0.0424,  ..., -0.1594, -0.2940, -0.2220],
         ...,
         [ 0.0123,  0.2452,  0.0424,  ..., -0.1596, -0.2934, -0.2226],
         [ 0.0119,  0.2453,  0.0412,  ..., -0.1602, -0.2935, -0.2224],
         [ 0.0116,  0.2447,  0.0415,  ..., -0.1601, -0.2932, -0.2219]],

        [[-0.0033,  0.2351,  0.0411,  ..., -0.1433, -0.2982, -0.2241],
         [-0.0020,  0.2354,  0.0396,  ..., -0.1439, -0.2978, -0.2235],
         [-0.0018,  0.2351,  0.0393,  ..., -0.1436, -0.2980, -0.2227],
         ...,
         [-0.0031,  0.2352,  0.0401,  ..., -0.1433, -0.2982, -0.2246],
         [-0.0023,  0.2348,  0.0401,  ..., -0.1430, -0.2971, -0.2240],
         [-0.0032,  0.2338,  0.0399,  ..., -0.1426, -0.2983, -0.2242]],

        [[ 0.0116,  0.2411,  0.0261,  ..., -0.1296, -0.2872, -0.2079],
         [ 0.0118,  0.2402,  0.0249,  ..., -0

In [14]:
# LayerNorm (layer normalization)
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale/shift.

    Args:
        d_model: Size of the last dimension being normalized.
        esp: Small constant added to the variance before the square root.
    """
    def __init__(self, d_model, esp=1e-12):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.esp = esp

    def forward(self, x):
        """Normalize ``x`` along its last axis, then scale and shift."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Biased (population) variance, i.e. divided by N rather than N - 1.
        variance = x.var(dim=-1, unbiased=False, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.esp)
        return self.gamma * normalized + self.beta

In [15]:
# 前馈神经网络
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature width.
        hidden: Width of the intermediate layer.
        dropout: Dropout probability applied after the activation.
    """
    def __init__(self, d_model, hidden, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, hidden)
        self.fc2 = nn.Linear(hidden, d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``hidden`` features, apply ReLU and dropout, project back."""
        activated = self.relu(self.fc1(x))
        return self.fc2(self.dropout(activated))

In [16]:
# 编码层
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        d_model: Feature width of the block.
        FF_hidden: Hidden width of the feed-forward sub-layer.
        n_head: Number of attention heads.
        dropout: Dropout probability used throughout the block.
    """
    def __init__(self, d_model, FF_hidden, n_head, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, n_head)
        self.norm1 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.feedforward = FeedForward(d_model, FF_hidden, dropout)
        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention."""
        # Self-attention sub-layer: x serves as query, key and value.
        _x = x
        x = self.attention(x, x, x, mask)
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        # Feed-forward sub-layer with its own residual connection.
        _x = x
        x = self.feedforward(x)
        x = self.dropout2(x)
        x = self.norm2(x + _x)
        return x

In [17]:
# 完整编码器
class Encoder(nn.Module):
    """Embedding followed by a stack of ``n_layer`` encoder layers.

    Args:
        vocab_size: Source vocabulary size.
        max_len: Number of positions in the positional encoding table.
        d_model: Feature width used throughout the stack.
        FF_hidden: Hidden width of each feed-forward sub-layer.
        n_head: Number of attention heads per layer.
        n_layer: Number of stacked encoder layers.
        device: Device passed to the embedding.
        dropout: Dropout probability for the embedding and every layer.
    """
    def __init__(self, vocab_size, max_len, d_model, FF_hidden, n_head, n_layer, device, dropout=0.1):
        super().__init__()
        self.embedding = TransformerEmbedding(
            vocab_size=vocab_size,
            d_model=d_model,
            max_len=max_len,
            drop_prob=dropout,
            device=device,
        )
        blocks = []
        for _ in range(n_layer):
            blocks.append(EncoderLayer(d_model, FF_hidden, n_head, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, s_mask):
        """Embed ``x`` and pass it through every layer with ``s_mask``."""
        hidden = self.embedding(x)
        for block in self.layers:
            hidden = block(hidden, s_mask)
        return hidden

In [18]:
# 解码器层
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        d_model: Feature width of the block.
        ffn_hidden: Hidden width of the feed-forward sub-layer.
        n_head: Number of attention heads.
        drop_prob: Dropout probability used throughout the block.
    """
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, n_head)
        self.norm1 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)
        self.cross_attention = MultiHeadAttention(d_model, n_head)
        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)
        self.ffn = FeedForward(d_model, ffn_hidden, drop_prob)
        self.norm3 = LayerNorm(d_model)
        self.dropout3 = nn.Dropout(drop_prob)

    def forward(self, dec, enc, t_mask, s_mask):
        """Run the block.

        Args:
            dec: Decoder-side activations (query for both attentions).
            enc: Encoder output used as key and value in cross-attention.
            t_mask: Mask forwarded to the self-attention.
            s_mask: Mask forwarded to the cross-attention.
        """
        # Self-attention over the decoder sequence.
        _x = dec
        x = self.attention(dec, dec, dec, t_mask)
        x = self.dropout1(x)
        x = self.norm1(x + _x)

        # Cross-attention: decoder queries attend over the encoder output.
        _x = x
        x = self.cross_attention(x, enc, enc, s_mask)
        x = self.dropout2(x)
        x = self.norm2(x + _x)

        # Feed-forward sub-layer. The residual must be the input of this
        # sub-layer (output of norm2), not the stale pre-cross-attention value.
        _x = x
        x = self.ffn(x)
        x = self.dropout3(x)
        x = self.norm3(x + _x)
        return x

In [19]:
# 完整解码器
class Decoder(nn.Module):
    """Embedding, a stack of decoder layers and a final linear projection.

    Args:
        dec_voc_size: Target vocabulary size (also the output width).
        max_len: Number of positions in the positional encoding table.
        d_model: Feature width used throughout the stack.
        ffn_hidden: Hidden width of each feed-forward sub-layer.
        n_head: Number of attention heads per layer.
        n_layer: Number of stacked decoder layers.
        drop_prob: Dropout probability for the embedding and every layer.
        device: Device passed to the embedding.
    """
    def __init__(self,dec_voc_size,max_len,d_model,ffn_hidden,n_head,n_layer,drop_prob,device):
        super().__init__()
        self.embedding = TransformerEmbedding(
            vocab_size=dec_voc_size,
            d_model=d_model,
            max_len=max_len,
            drop_prob=drop_prob,
            device=device,
        )
        blocks = []
        for _ in range(n_layer):
            blocks.append(DecoderLayer(d_model, ffn_hidden, n_head, drop_prob))
        self.layers = nn.ModuleList(blocks)
        self.fc = nn.Linear(d_model, dec_voc_size)

    def forward(self,dec,enc,t_mask,s_mask):
        """Decode ``dec`` against ``enc`` and project to vocabulary size."""
        hidden = self.embedding(dec)
        for block in self.layers:
            hidden = block(hidden, enc, t_mask, s_mask)
        # Last dimension becomes dec_voc_size.
        return self.fc(hidden)