In [37]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

# wordembedding 以序列建模为例
batch_size = 2

# 单词表大小
max_num_src_words = 8
max_num_tgt_words = 8
model_dim = 8

# 序列的最大长度
max_src_seq_len = 5
max_tgt_seq_len = 5
max_position_len = 5

src_len = torch.Tensor([2, 4]).to(torch.int32)  # 源序列长度
tgt_len = torch.Tensor([4, 3]).to(torch.int32)  # 目标序列长度

# 以单词索引构成的源句子和目标句子，构建batch，并且做了padding，默认值为0
src_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_src_words, (L,)), (0, max(src_len)-L)), 0) for L in src_len])
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_tgt_words, (L,)), (0, max(tgt_len)-L)), 0) for L in tgt_len])

# 构造word embedding 第0行：padding的embedding
src_embedding_table = nn.Embedding(max_num_src_words+1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1, model_dim)
src_embedding = src_embedding_table(src_seq) 
tgt_embedding = tgt_embedding_table(tgt_seq)

# 构造position embedding
pos_mat = torch.arange(max_position_len).reshape((-1, 1))
i_mat = torch.pow(10000, torch.arange(0, model_dim, 2).reshape((1, -1))/model_dim)
pe_embedding_table = torch.zeros((max_position_len, model_dim))
pe_embedding_table[:, 0::2] = torch.sin(pos_mat / i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat / i_mat)

# 构造位置embedding
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

# 生成位置索引
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)), 0) for _ in src_len]).to(torch.int32) 
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)), 0) for _ in tgt_len]).to(torch.int32)

# 构造每个位置的位置embedding
src_pe_embedding = pe_embedding(src_pos)  
tgt_pe_embedding = pe_embedding(tgt_pos)

# 构造encoder的self-attention mask，保证注意力不会分配给padding的部分
# mask的shape：[batch_size, max_src_len, max_src_len]，值为1或-inf
vaild_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len)-L)), 0) for L in src_len]), 2)
vaild_encoder_pos_matrix = torch.bmm(vaild_encoder_pos, vaild_encoder_pos.transpose(1, 2))
invalid_encoder_pos_matrix = 1 - vaild_encoder_pos_matrix
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)

# 模拟encoder的self-attention score
score = torch.rand((batch_size, max(src_len), max(src_len)))

masked_score = score.masked_fill(mask_encoder_self_attention, -np.inf)
prob = F.softmax(masked_score, -1)

print(src_len)
print(score)
print(masked_score)
print(prob)



tensor([2, 4], dtype=torch.int32)
tensor([[[0.4958, 0.4058, 0.1182, 0.5791],
         [0.0496, 0.2688, 0.8308, 0.2501],
         [0.8285, 0.4376, 0.9953, 0.0588],
         [0.2969, 0.7258, 0.6915, 0.2288]],

        [[0.1320, 0.4623, 0.4608, 0.3693],
         [0.0656, 0.2772, 0.3697, 0.7058],
         [0.5381, 0.8132, 0.0411, 0.1388],
         [0.9907, 0.0395, 0.6499, 0.4306]]])
tensor([[[0.4958, 0.4058,   -inf,   -inf],
         [0.0496, 0.2688,   -inf,   -inf],
         [  -inf,   -inf,   -inf,   -inf],
         [  -inf,   -inf,   -inf,   -inf]],

        [[0.1320, 0.4623, 0.4608, 0.3693],
         [0.0656, 0.2772, 0.3697, 0.7058],
         [0.5381, 0.8132, 0.0411, 0.1388],
         [0.9907, 0.0395, 0.6499, 0.4306]]])
tensor([[[0.5225, 0.4775, 0.0000, 0.0000],
         [0.4454, 0.5546, 0.0000, 0.0000],
         [   nan,    nan,    nan,    nan],
         [   nan,    nan,    nan,    nan]],

        [[0.1981, 0.2756, 0.2752, 0.2511],
         [0.1822, 0.2252, 0.2470, 0.3456],
         [