In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

# 关于word embedding，以序列建模为例
# 考虑source sentence 和 target sentence（离散）
# 构建序列，序列的字符以索引的形式表示
batch_size = 2

# 单词表大小
max_num_src_words = 8
max_num_tgt_words = 8

# 句子的最大长度
max_src_seq_len = 5
max_tgt_seq_len = 5

# 句子长度
# src_len = torch.randint(2, 5, (batch_size,))  # 随机生成句子长度
# tgt_len = torch.randint(2, 5, (batch_size,))
src_len = torch.Tensor([2, 4]).to(torch.int32)  
tgt_len = torch.Tensor([4, 3]).to(torch.int32)

# Source sentence & target sentence

In [8]:
# 单词索引(token ID)构成的源句子和目标句子，构建batch，并且做了padding，默认值为0
## 使用多行代码实现
src_seq_list = []  # 用于存储处理后的序列

for L in src_len:
    # 生成一个形状为 (L,) 的随机整数张量，表示长度为 L 的输入序列
    rand_seq = torch.randint(1, max_num_src_words, (L,))

    # 使用 F.pad 在右侧填充 0，使序列长度变为 max_src_seq_len
    padded_seq = F.pad(rand_seq, (0, max_src_seq_len - L))

    # 在第一维（批次维度）上增加一个维度，使其变为 (1, max_src_seq_len)
    expanded_seq = torch.unsqueeze(padded_seq, 0)

    # 将处理后的序列添加到列表中
    src_seq_list.append(expanded_seq)

# 通过 torch.cat 在批次维度 (dim=0) 上拼接所有序列
src_seq = torch.cat(src_seq_list, dim=0)  # 最终形状为 (batch_size, max_src_seq_len)

## 只使用一行代码实现
# src_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_src_words, (L,)), (0, max_src_seq_len-L)), 0) for L in src_len])
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_tgt_words, (L,)), (0, max_tgt_seq_len-L)), 0) for L in tgt_len])

src_seq, tgt_seq

(tensor([[5, 2, 0, 0, 0],
         [5, 1, 1, 2, 0]]),
 tensor([[7, 3, 7, 6, 0],
         [2, 2, 4, 0, 0]]))

# Word embedding

In [9]:
# 词向量维度
model_dim = 8

# 构造embedding（调用Pytorch的embedding api）

# embedding的词表
src_embedding_table = nn.Embedding(max_num_src_words+1, model_dim)  # padding的字符占用了0这一行的索引，所以要“+1”
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1, model_dim)
src_embedding_table.weight

Parameter containing:
tensor([[-0.2204, -0.1895,  0.1097,  0.3113, -0.2367,  1.5414,  0.5483,  0.1560],
        [ 0.7961, -0.7196, -2.5690,  0.3586, -0.3528,  0.8761,  0.1312, -2.3167],
        [-0.1066,  0.3241,  0.1484, -0.7014, -1.3806, -0.6581, -0.1047, -1.5250],
        [ 1.0106, -0.6918, -1.2055,  0.9626,  1.6071,  1.1457, -0.7686,  0.1197],
        [ 1.6573, -1.3230, -0.3683, -0.1587,  0.1823, -0.7442,  1.7799, -0.3599],
        [ 2.1197,  1.5243,  0.1713, -0.6293,  0.8753, -0.4871, -1.0168,  1.6978],
        [-0.1117,  0.8207,  0.5764,  0.4387,  0.7001,  1.7748, -1.8028,  1.0397],
        [-0.6761,  0.2510, -0.4549, -1.1422, -0.9927, -1.1721,  1.1605,  0.3645],
        [-0.9596, -0.9147,  0.7747,  1.3046, -1.1128, -0.3725, -0.9399,  0.1322]],
       requires_grad=True)

In [13]:
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

# src
src_embedding_table.weight, src_seq, src_embedding

(Parameter containing:
 tensor([[-0.2204, -0.1895,  0.1097,  0.3113, -0.2367,  1.5414,  0.5483,  0.1560],
         [ 0.7961, -0.7196, -2.5690,  0.3586, -0.3528,  0.8761,  0.1312, -2.3167],
         [-0.1066,  0.3241,  0.1484, -0.7014, -1.3806, -0.6581, -0.1047, -1.5250],
         [ 1.0106, -0.6918, -1.2055,  0.9626,  1.6071,  1.1457, -0.7686,  0.1197],
         [ 1.6573, -1.3230, -0.3683, -0.1587,  0.1823, -0.7442,  1.7799, -0.3599],
         [ 2.1197,  1.5243,  0.1713, -0.6293,  0.8753, -0.4871, -1.0168,  1.6978],
         [-0.1117,  0.8207,  0.5764,  0.4387,  0.7001,  1.7748, -1.8028,  1.0397],
         [-0.6761,  0.2510, -0.4549, -1.1422, -0.9927, -1.1721,  1.1605,  0.3645],
         [-0.9596, -0.9147,  0.7747,  1.3046, -1.1128, -0.3725, -0.9399,  0.1322]],
        requires_grad=True),
 tensor([[5, 2, 0, 0, 0],
         [5, 1, 1, 2, 0]]),
 tensor([[[ 2.1197,  1.5243,  0.1713, -0.6293,  0.8753, -0.4871, -1.0168,
            1.6978],
          [-0.1066,  0.3241,  0.1484, -0.7014, -1.3

In [14]:
tgt_embedding_table.weight, tgt_seq, tgt_embedding

(Parameter containing:
 tensor([[-1.3400,  0.0219,  1.1850,  0.1048,  1.1570, -1.3731, -1.1395,  0.1921],
         [-1.0655, -1.2237, -1.2906, -0.2561, -0.6014,  0.5297, -0.2648,  0.1721],
         [-0.0311,  0.8696, -0.0472, -1.0195, -1.0602,  0.2994, -0.5326, -0.7094],
         [-0.8571,  1.7570,  0.8558,  0.3231, -0.3227, -2.3728,  0.0254,  0.2394],
         [-1.1436, -0.5389,  0.0222, -1.5995,  0.2235,  0.4767,  0.1759,  0.9656],
         [ 0.8851,  0.4446,  1.0952,  0.5664,  0.3982,  1.9940,  1.9620,  2.0426],
         [-0.6210,  0.9583, -0.7212, -0.4852,  1.8842, -0.0626, -1.4300, -0.4557],
         [-0.1287,  0.1977,  0.2854,  1.2646,  1.5819,  0.3283, -3.1076, -1.4709],
         [-0.2448, -0.3207, -0.6010,  0.1534,  0.0171, -0.8729, -1.1606,  0.4831]],
        requires_grad=True),
 tensor([[7, 3, 7, 6, 0],
         [2, 2, 4, 0, 0]]),
 tensor([[[-0.1287,  0.1977,  0.2854,  1.2646,  1.5819,  0.3283, -3.1076,
           -1.4709],
          [-0.8571,  1.7570,  0.8558,  0.3231, -0.3

# Position embedding

In [17]:
# 构造position embedding

# 定义训练编码的最大长度
max_postion_len = 5

# 在position embedding的公式中：pos表示行，i表示列
# 目标：构造出pos矩阵（每行数字相同）和i矩阵（每列数字相同）
pos_mat = torch.arange(max_postion_len).reshape((-1, 1))
pos_mat

tensor([[0],
        [1],
        [2],
        [3],
        [4]])

In [18]:
i_mat = torch.arange(0, 8, 2).reshape((1, -1)) / model_dim
i_mat

tensor([[0.0000, 0.2500, 0.5000, 0.7500]])

In [19]:
i_mat = torch.pow(10000, torch.arange(0, 8, 2).reshape((1, -1)) / model_dim)
i_mat

tensor([[   1.,   10.,  100., 1000.]])

In [20]:
pe_embedding_table = torch.zeros(max_postion_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)
pe_embedding_table 

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
          9.9995e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
          9.9980e-01,  2.0000e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
          9.9955e-01,  3.0000e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
          9.9920e-01,  4.0000e-03,  9.9999e-01]])

In [22]:
pe_embedding = nn.Embedding(max_postion_len, model_dim)

# 将pe_embedding_table赋值给pe_embedding.weight，并设置requires_grad为False，表示pe_embedding.weight的值不允许被梯度更新
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)  
pe_embedding_table, pe_embedding.weight

(tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00],
         [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
           9.9920e-01,  4.0000e-03,  9.9999e-01]]),
 Parameter containing:
 tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2

In [26]:
# 得到源序列的位置索引(注意：这里不能使用单词索引)
src_pos = [torch.arange(max(src_len)) for _ in src_len]
tgt_pos = [torch.arange(max(tgt_len)) for _ in tgt_len]
src_pos, tgt_pos

([tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])],
 [tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])])

In [29]:
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)), 0) for _ in src_len]).to(torch.int32)
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)), 0) for _ in tgt_len]).to(torch.int32)

src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)
src_pe_embedding, tgt_pe_embedding

(tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
            1.0000e+00,  0.0000e+00,  1.0000e+00],
          [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
            9.9995e-01,  1.0000e-03,  1.0000e+00],
          [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
            9.9980e-01,  2.0000e-03,  1.0000e+00],
          [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
            9.9955e-01,  3.0000e-03,  1.0000e+00]],
 
         [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
            1.0000e+00,  0.0000e+00,  1.0000e+00],
          [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
            9.9995e-01,  1.0000e-03,  1.0000e+00],
          [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
            9.9980e-01,  2.0000e-03,  1.0000e+00],
          [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
            9.9955e-01,  3.0000e

# Encoder self-attention mask

In [33]:
# softmax演示: scaled的重要性
alpha1, alpha2 = 0.1, 10
score = torch.randn(5)  # score表示 Q * K 的结果
prob1 = F.softmax(score*alpha1, dim=-1)
prob2 = F.softmax(score*alpha2, dim=-1)
prob1, prob2

(tensor([0.1929, 0.2202, 0.1919, 0.1891, 0.2059]),
 tensor([1.7575e-06, 9.9876e-01, 1.0515e-06, 2.4779e-07, 1.2404e-03]))

In [None]:
def softmax_func(score):
    return F.softmax(score)

# jacobian
jaco_mat1 = torch.autograd.functional.jacobian(softmax_func, score*alpha1)
jaco_mat2 = torch.autograd.functional.jacobian(softmax_func, score*alpha2)
jaco_mat1, jaco_mat2

  return F.softmax(score)


(tensor([[ 0.1557, -0.0425, -0.0370, -0.0365, -0.0397],
         [-0.0425,  0.1717, -0.0422, -0.0416, -0.0453],
         [-0.0370, -0.0422,  0.1551, -0.0363, -0.0395],
         [-0.0365, -0.0416, -0.0363,  0.1534, -0.0389],
         [-0.0397, -0.0453, -0.0395, -0.0389,  0.1635]]),
 tensor([[ 1.7575e-06, -1.7553e-06, -1.8481e-12, -4.3548e-13, -2.1801e-09],
         [-1.7553e-06,  1.2419e-03, -1.0502e-06, -2.4748e-07, -1.2389e-03],
         [-1.8481e-12, -1.0502e-06,  1.0515e-06, -2.6056e-13, -1.3044e-09],
         [-4.3548e-13, -2.4748e-07, -2.6056e-13,  2.4779e-07, -3.0736e-10],
         [-2.1801e-09, -1.2389e-03, -1.3044e-09, -3.0736e-10,  1.2389e-03]]))

In [42]:
# 构造encoder self-attention mask
# mask的shape：[batch_size, max_src_len, max_src_len]，值为1或-inf
valid_encoder_pos = [torch.ones(L) for L in src_len]
valid_encoder_pos

[tensor([1., 1.]), tensor([1., 1., 1., 1.])]

In [43]:
# padding
valid_encoder_pos = [F.pad(torch.ones(L), (0, max(src_len)-L)) for L in src_len]
valid_encoder_pos

[tensor([1., 1., 0., 0.]), tensor([1., 1., 1., 1.])]

In [44]:
# 扩维（0 -> 1）
valid_encoder_pos = [torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len)-L)), 0) for L in src_len]
valid_encoder_pos

[tensor([[1., 1., 0., 0.]]), tensor([[1., 1., 1., 1.]])]

In [48]:
# 继续扩维
valid_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len)-L)), 0) \
                                               for L in src_len]), 2)
valid_encoder_pos.shape

torch.Size([2, 4, 1])

In [50]:
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1, 2))
valid_encoder_pos_matrix.shape, valid_encoder_pos_matrix

(torch.Size([2, 4, 4]),
 tensor([[[1., 1., 0., 0.],
          [1., 1., 0., 0.],
          [0., 0., 0., 0.],
          [0., 0., 0., 0.]],
 
         [[1., 1., 1., 1.],
          [1., 1., 1., 1.],
          [1., 1., 1., 1.],
          [1., 1., 1., 1.]]]))

In [51]:
invalid_encoder_pos_matrix = 1 - valid_encoder_pos_matrix
invalid_encoder_pos_matrix

tensor([[[0., 0., 1., 1.],
         [0., 0., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

In [52]:
# True表示需要mask，False表示不需要mask
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)
mask_encoder_self_attention

tensor([[[False, False,  True,  True],
         [False, False,  True,  True],
         [ True,  True,  True,  True],
         [ True,  True,  True,  True]],

        [[False, False, False, False],
         [False, False, False, False],
         [False, False, False, False],
         [False, False, False, False]]])

In [57]:
score = torch.randn(batch_size, max(src_len), max(src_len))
print(score.shape, mask_encoder_self_attention.shape)

torch.Size([2, 4, 4]) torch.Size([2, 4, 4])


In [59]:
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, dim=-1)
src_len, score, masked_score, prob

(tensor([2, 4], dtype=torch.int32),
 tensor([[[ 0.0997,  1.0616,  1.7244, -0.0042],
          [-0.4897,  1.4075, -0.1138,  0.0997],
          [ 0.1757,  1.2486, -0.6869, -0.2622],
          [-0.2621,  1.4763,  1.1949,  0.5869]],
 
         [[ 1.5380, -1.2645,  0.3520,  0.4939],
          [-0.4976, -0.1894,  0.4387,  0.3130],
          [-1.3573,  0.2773,  1.2513,  1.7238],
          [ 0.7528, -0.0915,  0.0259,  2.3903]]]),
 tensor([[[ 9.9718e-02,  1.0616e+00, -1.0000e+09, -1.0000e+09],
          [-4.8972e-01,  1.4075e+00, -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
          [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],
 
         [[ 1.5380e+00, -1.2645e+00,  3.5200e-01,  4.9389e-01],
          [-4.9756e-01, -1.8943e-01,  4.3875e-01,  3.1299e-01],
          [-1.3573e+00,  2.7730e-01,  1.2513e+00,  1.7238e+00],
          [ 7.5282e-01, -9.1533e-02,  2.5895e-02,  2.3903e+00]]]),
 tensor([[[0.2765, 0.7235, 0.0000, 0.0000],
          [0

# Intra-attention mask

# Decoder self-attetion mask

# Multi-head self-attention