### 多头注意力机制的实现
$$
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$
代码实现


In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F

In [2]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into
    ``num_heads`` heads of size ``d_model // num_heads``, attended
    independently with ``softmax(Q K^T / sqrt(d_k)) V``, concatenated and
    passed through a final linear layer.
    """

    def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
        """Build the projection layers.

        Args:
            d_model: Size of the model (embedding) dimension.
            num_heads: Number of attention heads; must divide ``d_model``.
            dropout: Dropout probability applied to the attention weights.

        Raises:
            ValueError: If ``d_model`` is not divisible by ``num_heads``.
        """
        # ``super`` must be *called*; ``super.__init__()`` raises TypeError
        # and leaves nn.Module uninitialised.
        super().__init__()

        if num_heads <= 0 or d_model % num_heads != 0:
            # Fail early: otherwise the per-head ``view`` in ``forward``
            # fails later with a much less helpful shape error.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by "
                f"num_heads ({num_heads})"
            )

        self.num_heads = num_heads  # number of attention heads
        self.d_model = d_model  # model dimension
        self.depth = d_model // num_heads  # dimension of each head

        # Input projections for queries, keys and values.
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)
        self.Wv = nn.Linear(d_model, d_model)

        # Output projection applied after the heads are concatenated.
        self.fc = nn.Linear(d_model, d_model)

        # Dropout applied to the attention weights.
        self.dropout = nn.Dropout(p=dropout)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Compute ``softmax(Q K^T / sqrt(depth)) V``.

        Args:
            Q: Query tensor whose last dimension is ``depth``.
            K: Key tensor whose last dimension is ``depth``.
            V: Value tensor; its second-to-last dimension matches ``K``'s.
            mask: Optional tensor broadcastable to the score matrix. It is
                applied additively as ``mask * -1e9``, so entries equal to
                1 mark positions to hide and entries equal to 0 are kept.

        Returns:
            A tuple ``(output, attention)``: the weighted sum of values and
            the attention weights (after dropout).
        """
        # A plain Python float is enough for the scale factor; there is no
        # need to allocate a tensor on every call.
        scores = torch.matmul(Q, K.transpose(-1, -2)) / (self.depth ** 0.5)

        if mask is not None:
            # Push masked logits to a large negative value so that softmax
            # assigns them (numerically) zero probability. Out-of-place add
            # avoids in-place dtype/broadcast restrictions.
            scores = scores + mask * -1e9

        attention = F.softmax(scores, dim=-1)
        attention = self.dropout(attention)

        # Weight the value vectors by the attention distribution.
        output = torch.matmul(attention, V)
        return output, attention

    def forward(self, Q, K, V, mask=None):
        """Apply multi-head attention.

        Args:
            Q: Queries, reshaped here as ``(batch, seq_q, d_model)``.
            K: Keys, reshaped here as ``(batch, seq_k, d_model)``.
            V: Values, reshaped here as ``(batch, seq_k, d_model)``.
            mask: Optional additive-mask indicator (1 = masked), passed
                through to ``scaled_dot_product_attention``; it must be
                broadcastable to ``(batch, num_heads, seq_q, seq_k)``.

        Returns:
            A tuple ``(output, attention)`` where ``output`` has shape
            ``(batch, seq_q, d_model)`` and ``attention`` has shape
            ``(batch, num_heads, seq_q, seq_k)``.
        """
        batch_size = Q.size(0)

        # Project, then split the last dimension into heads:
        # (batch, seq, d_model) -> (batch, num_heads, seq, depth).
        Q = self._split_heads(self.Wq(Q), batch_size)
        K = self._split_heads(self.Wk(K), batch_size)
        V = self._split_heads(self.Wv(V), batch_size)

        attended, attention = self.scaled_dot_product_attention(
            Q, K, V, mask=mask
        )

        # Merge the heads back: (batch, num_heads, seq, depth)
        # -> (batch, seq, d_model). ``contiguous`` is required before
        # ``view`` because ``transpose`` returns a non-contiguous tensor.
        concat = (
            attended.transpose(1, 2)
            .contiguous()
            .view(batch_size, -1, self.d_model)
        )

        # Final linear projection.
        output = self.fc(concat)
        return output, attention

    def _split_heads(self, x, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, depth)``."""
        return x.view(batch_size, -1, self.num_heads, self.depth).transpose(1, 2)