### transformer
Transformer 是一种 seq2seq（序列到序列）模型，它根据不同的输入序列生成相应的输出序列。

此外，它的核心是注意力机制。注意力机制可以让模型关注输入序列中的不同位置，从而更好地理解输入序列的语义。

注意力机制的本质是通过可学习的权重计算函数，动态分配关注度，让模型聚焦于对当前任务更重要的信息，而非单一的、固定形式的函数。


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected by their own linear layer,
    split into ``heads`` slices of width ``d_model // heads``, attended
    independently per head, concatenated back to ``d_model`` features and
    passed through a final output projection.

    Args:
        heads: Number of attention heads. Must divide ``d_model`` evenly.
        d_model: Feature size of the inputs and of the output.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``heads``.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        # Without this check ``d_model // heads`` silently truncates and the
        # head split in ``forward`` either fails with an opaque shape error
        # or mixes features across tokens.
        if d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by heads ({heads})"
            )
        self.d_model = d_model
        self.d_k = d_model // heads  # per-head feature width
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def attention(self, q, k, v, d_k, mask=None, dropout=None):
        """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

        Args:
            q: Query tensor; its last dimension must match that of ``k``.
            k: Key tensor; its last two dimensions are swapped before the
                product with ``q``.
            v: Value tensor, weighted by the attention distribution.
            d_k: Scaling denominator; scores are divided by ``sqrt(d_k)``.
            mask: Optional tensor; positions where ``mask == 0`` receive no
                attention. A mask with fewer dimensions than the score
                tensor gets an extra axis inserted at position 1 so it
                broadcasts over heads; a mask that already has the same
                rank as the scores is used as-is.
            dropout: Optional callable applied to the attention weights.

        Returns:
            The attention-weighted combination of ``v``.
        """
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Only add the head axis when it is missing; unsqueezing a mask
            # that already matches the score rank would broadcast the scores
            # to a higher rank and corrupt the output shape.
            if mask.dim() < scores.dim():
                mask = mask.unsqueeze(1)
            # Use the dtype's own minimum instead of a fixed -1e9, which is
            # not representable in float16.
            fill_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, fill_value)
        weights = F.softmax(scores, dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        return torch.matmul(weights, v)

    def forward(self, q, k, v, mask=None):
        """Run multi-head attention.

        Args:
            q: Query tensor whose first dimension is the batch size and
                whose last dimension is ``d_model``.
            k: Key tensor with last dimension ``d_model``.
            v: Value tensor with last dimension ``d_model``.
            mask: Optional mask forwarded to :meth:`attention`.

        Returns:
            Tensor of shape ``(batch, -1, d_model)`` produced by the output
            projection.
        """
        bs = q.size(0)
        # Project, then reshape to (batch, heads, seq, d_k) so each head
        # attends over its own feature slice.
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        attended = self.attention(q, k, v, self.d_k, mask, self.dropout)
        # Merge heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        concat = attended.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)