### Attention

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{Q K^{\top}}{\sqrt{d_k}}\right) V$$

In [1]:
import math
import torch
import torch.nn as nn

### 第一个：最基本

In [2]:
### self_attention_v1

class SelfAttentionV1(nn.Module):
    """Minimal single-head self-attention with separate Q, K and V projections.

    Implements softmax(Q K^T / sqrt(hidden_dim)) V, where Q, K and V are each
    produced from the same input by their own linear layer.
    """

    def __init__(self, hidden_dim: int = 728):
        super().__init__()
        self.hidden_dim = hidden_dim

        # One hidden_dim -> hidden_dim linear map per role.
        self.query_proj = nn.Linear(hidden_dim, hidden_dim)
        self.key_proj = nn.Linear(hidden_dim, hidden_dim)
        self.value_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        """Attend over ``x``; the demo passes (batch_size, seq_len, hidden_dim).

        Prints the attention weights as a side effect and returns a tensor
        with the same shape as ``x``.
        """
        queries = self.query_proj(x)
        keys = self.key_proj(x)
        values = self.value_proj(x)

        # Pairwise query/key scores: (batch_size, seq_len, seq_len).
        scale = math.sqrt(self.hidden_dim)
        scores = (queries @ keys.transpose(-2, -1)) / scale

        # Normalise over the key axis so every query row sums to 1.
        attention_weight = scores.softmax(dim=-1)
        print(attention_weight)

        # Weighted sum of the values: back to (batch_size, seq_len, hidden_dim).
        return attention_weight @ values

# Random demo batch: 3 sequences of 2 tokens, 4 features per token.
X = torch.rand((3, 2, 4))

net_att_net = SelfAttentionV1(hidden_dim=4)
# Calling the module instance invokes forward() automatically.
net_att_net(X)

tensor([[[0.4944, 0.5056],
         [0.4882, 0.5118]],

        [[0.5005, 0.4995],
         [0.5007, 0.4993]],

        [[0.4920, 0.5080],
         [0.4928, 0.5072]]], grad_fn=<SoftmaxBackward0>)


tensor([[[0.4451, 0.4225, 0.3530, 1.3451],
         [0.4440, 0.4240, 0.3538, 1.3457]],

        [[0.2375, 0.3486, 0.2302, 1.1640],
         [0.2374, 0.3486, 0.2301, 1.1639]],

        [[0.2624, 0.1932, 0.2074, 1.0387],
         [0.2619, 0.1932, 0.2071, 1.0385]]], grad_fn=<UnsafeViewBackward0>)

### 第二个：效率优化

In [3]:
### 小网络的优化的self_attention

class SelfAttentionV2(nn.Module):
    """Single-head self-attention using one fused Q/K/V projection.

    A single linear layer of width ``3 * hidden_dim`` replaces three separate
    projections; its output is cut into Q, K and V along the last dimension.
    """

    def __init__(self, hidden_dim: int = 728):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Fused projection: one matmul yields Q, K and V side by side.
        self.proj = nn.Linear(hidden_dim, hidden_dim * 3)

    def forward(self, x):
        """Attend over ``x``, print the attention weights, return the result."""
        # The fused output has 3 * hidden_dim features, so three equal chunks
        # along the last axis are each [..., hidden_dim]: Q, K and V in order.
        queries, keys, values = self.proj(x).chunk(3, dim=-1)

        scores = (queries @ keys.transpose(-2, -1)) / math.sqrt(self.hidden_dim)
        attn_weight = scores.softmax(dim=-1)
        print(attn_weight)

        return torch.matmul(attn_weight, values)

# Random demo batch: 3 sequences of 2 tokens, 4 features per token.
X = torch.rand((3, 2, 4))

net_att_net = SelfAttentionV2(hidden_dim=4)
# Calling the module instance invokes forward() automatically.
net_att_net(X)

tensor([[[0.5722, 0.4278],
         [0.5571, 0.4429]],

        [[0.5515, 0.4485],
         [0.5443, 0.4557]],

        [[0.4959, 0.5041],
         [0.4671, 0.5329]]], grad_fn=<SoftmaxBackward0>)


tensor([[[ 0.6875,  0.0884,  0.2283,  0.0983],
         [ 0.6863,  0.0922,  0.2344,  0.0992]],

        [[ 0.6300,  0.2246,  0.4552,  0.0222],
         [ 0.6283,  0.2223,  0.4568,  0.0220]],

        [[ 0.5063,  0.1050,  0.2852, -0.0276],
         [ 0.5021,  0.1134,  0.2738, -0.0379]]], grad_fn=<UnsafeViewBackward0>)

### 第三个：加入细节

In [4]:
# 1. dropout 位置
# 2. attention_mask：每个句子长度不一，要加入pad
# 3. output 矩阵projection

class SelfAttentionV3(nn.Module):
    """Single-head self-attention with masking, dropout and an output projection.

    Q, K and V come from one fused linear layer. Attention weights can be
    restricted with a mask, are regularised with dropout, and the attended
    values pass through a final linear layer.

    Args:
        hidden_dim: Feature size of the input; also the size of Q, K and V.
        dropout_rate: Drop probability applied to the attention weights.
    """

    def __init__(self, hidden_dim: int, dropout_rate: float = 0.1):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.proj = nn.Linear(hidden_dim, hidden_dim * 3)
        self.attention_dropout = nn.Dropout(dropout_rate)

        # Optional in single-head attention; kept to mirror the usual layout.
        self.output_proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x, attention_mask=None):
        """Apply masked self-attention to ``x``.

        Args:
            x: Input whose last dimension is ``hidden_dim``; the demo uses
                (batch_size, seq_len, hidden_dim).
            attention_mask: Optional tensor broadcastable to the score matrix
                (batch_size, seq_len, seq_len in the demo). Positions equal
                to 0 are excluded from attention.

        Returns:
            Tensor with the same shape as ``x``. A query row whose keys are
            all masked receives zero attention weights (so its output is the
            output projection's bias) instead of NaN.

        Side effects:
            Prints the post-dropout attention weights.
        """
        QKV = self.proj(x)
        Q, K, V = torch.split(QKV, self.hidden_dim, dim=-1)

        attn_weight = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.hidden_dim)

        fully_masked = None
        if attention_mask is not None:
            blocked = attention_mask == 0
            # A row filled entirely with -inf makes softmax return NaN, which
            # then poisons the output and every gradient. Leave such rows
            # untouched here and zero their weights after the softmax.
            fully_masked = blocked.all(dim=-1, keepdim=True)
            attn_weight = attn_weight.masked_fill(
                blocked & ~fully_masked, float("-inf")
            )

        attn_weight = torch.softmax(attn_weight, dim=-1)
        if fully_masked is not None:
            attn_weight = attn_weight.masked_fill(fully_masked, 0.0)

        # Dropout on the weights: randomly ignore some attended positions.
        attn_weight = self.attention_dropout(attn_weight)
        print(attn_weight)

        attn_result = attn_weight @ V

        output = self.output_proj(attn_result)
        return output
    

X = torch.rand((3, 4, 2))
# The attention layer wants a (batch_size, seq_len, seq_len) mask, but a
# padding mask is naturally only (batch_size, seq_len).
mask = torch.tensor(
    [
        [1, 1, 1, 0],  # first sentence has length 3
        [1, 1, 0, 0],  # second sentence has length 2
        [1, 0, 0, 0],  # third sentence has length 1
    ]
)
# Insert a query axis, then tile it: repeat(1, seq_len, 1) leaves dim 0 and
# dim 2 alone and copies dim 1 seq_len times -> (batch_size, seq_len, seq_len).
mask = mask[:, None, :].repeat(1, X.size(1), 1)
print(f"repeat mask shape: {mask.shape}")

net_att_net = SelfAttentionV3(hidden_dim=2)
# Calling the module instance invokes forward() automatically.
net_att_net(X, mask)

repeat mask shape: torch.Size([3, 4, 4])
tensor([[[0.3502, 0.0000, 0.0000, 0.0000],
         [0.3440, 0.3924, 0.3747, 0.0000],
         [0.3467, 0.4061, 0.3583, 0.0000],
         [0.3523, 0.3831, 0.3757, 0.0000]],

        [[0.5272, 0.5839, 0.0000, 0.0000],
         [0.5229, 0.5882, 0.0000, 0.0000],
         [0.5282, 0.5829, 0.0000, 0.0000],
         [0.5480, 0.5631, 0.0000, 0.0000]],

        [[1.1111, 0.0000, 0.0000, 0.0000],
         [1.1111, 0.0000, 0.0000, 0.0000],
         [1.1111, 0.0000, 0.0000, 0.0000],
         [1.1111, 0.0000, 0.0000, 0.0000]]], grad_fn=<MulBackward0>)


tensor([[[0.6183, 0.0695],
         [0.5161, 0.8021],
         [0.5184, 0.7953],
         [0.5156, 0.8029]],

        [[0.4883, 0.9489],
         [0.4885, 0.9484],
         [0.4882, 0.9490],
         [0.4871, 0.9512]],

        [[0.4920, 0.9602],
         [0.4920, 0.9602],
         [0.4920, 0.9602],
         [0.4920, 0.9602]]], grad_fn=<ViewBackward0>)

### 第4个：面试写法

In [5]:
class SelfAttentionV4(nn.Module):
    """Compact single-head self-attention with masking and attention dropout.

    Uses separate query, key and value projections and computes
    softmax(Q K^T / sqrt(dim)) V, with optional masking of the scores.

    Args:
        dim: Feature size of the input; also the size of Q, K and V.
        dropout_rate: Drop probability applied to the attention weights.
    """

    def __init__(self, dim: int, dropout_rate: float = 0.1) -> None:
        super().__init__()
        self.dim = dim

        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)

        self.attn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, attn_mask=None):
        """Apply masked self-attention to ``x``.

        Args:
            x: Input whose last dimension is ``dim``; the demo uses
                (batch_size, seq_len, dim).
            attn_mask: Optional tensor broadcastable to the score matrix
                (batch_size, seq_len, seq_len in the demo). Positions equal
                to 0 are excluded from attention.

        Returns:
            Tensor with the same shape as ``x``. A query row whose keys are
            all masked yields an all-zero output row instead of NaN.
        """
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)

        attn_weight = Q @ K.transpose(-2, -1) / math.sqrt(self.dim)

        fully_masked = None
        if attn_mask is not None:
            blocked = attn_mask == 0
            # A row filled entirely with -inf makes softmax return NaN, which
            # then poisons the output and every gradient. Leave such rows
            # untouched here and zero their weights after the softmax.
            fully_masked = blocked.all(dim=-1, keepdim=True)
            attn_weight = attn_weight.masked_fill(
                blocked & ~fully_masked, float("-inf")
            )

        attn_weight = torch.softmax(attn_weight, dim=-1)
        if fully_masked is not None:
            attn_weight = attn_weight.masked_fill(fully_masked, 0.0)

        attn_weight = self.attn_dropout(attn_weight)
        output = attn_weight @ V
        return output


X = torch.rand((3, 4, 2))
# The attention layer wants a (batch_size, seq_len, seq_len) mask, but a
# padding mask is naturally only (batch_size, seq_len).
mask = torch.tensor(
    [
        [1, 1, 1, 0],  # first sentence has length 3
        [1, 1, 0, 0],  # second sentence has length 2
        [1, 0, 0, 0],  # third sentence has length 1
    ]
)
# Insert a query axis, then tile it: repeat(1, seq_len, 1) leaves dim 0 and
# dim 2 alone and copies dim 1 seq_len times -> (batch_size, seq_len, seq_len).
mask = mask[:, None, :].repeat(1, X.size(1), 1)
print(f"repeat mask shape: {mask.shape}")

net_att_net = SelfAttentionV4(dim=2)
# Calling the module instance invokes forward() automatically.
net_att_net(X, mask)


repeat mask shape: torch.Size([3, 4, 4])


tensor([[[-1.2154,  0.8047],
         [-1.2120,  0.8057],
         [-1.2121,  0.8053],
         [-0.8528,  0.5364]],

        [[-1.0856,  0.7826],
         [-1.0956,  0.7827],
         [-1.0888,  0.7826],
         [-1.0962,  0.7827]],

        [[-1.0918,  0.8578],
         [-1.0918,  0.8578],
         [-1.0918,  0.8578],
         [-1.0918,  0.8578]]], grad_fn=<UnsafeViewBackward0>)