In [90]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import random

# 关于 word-embedding，以序列建模为例
# 考虑 source-sentence 和 target-sentence （暂时考虑离散结果）
# 首先构建序列，序列的字符以其在词表中的索引的形式表示出来，首先构建source序列和target序列

# seed = 666
# torch.manual_seed(seed)


# Vocabulary sizes: number of distinct word indices for source and target.
max_num_src_words = 8
max_num_tgt_words = 8


# Feature (embedding) size; 512 in the original API.
model_dim = 8


# Maximum sequence lengths / number of positions.
# NOTE: the batches built below are padded to the longest sentence in the
# batch, not to max_src_seq_len / max_tgt_seq_len.
max_src_seq_len = 5
max_tgt_seq_len = 5
max_position_len = 5

batch_size = 2  # number of sentences per batch
# Random lengths could be drawn instead, e.g. torch.randint(2, 5, (batch_size,)).
src_len = torch.Tensor([2, 4]).to(torch.int32)  # hand-picked source sentence lengths
tgt_len = torch.Tensor([4, 3]).to(torch.int32)  # hand-picked target sentence lengths


def _random_padded_batch(lengths, vocab_size):
    """Build a batch of random word-index sentences, right-padded with 0.

    Args:
        lengths: 1-D integer tensor holding the length of every sentence.
        vocab_size: exclusive upper bound for the sampled word indices;
            indices are drawn from [1, vocab_size) so that 0 only ever
            appears as padding.

    Returns:
        Integer tensor of shape (len(lengths), max(lengths)).
    """
    # Plain Python ints are used for sizes/pad widths so the calls do not
    # depend on PyTorch accepting 0-dim tensors in those positions.
    sizes = lengths.tolist()
    width = max(sizes)
    rows = [
        F.pad(torch.randint(1, vocab_size, (n,)), (0, width - n)).unsqueeze(0)
        for n in sizes
    ]
    # Each row is (1, width); concatenating along dim 0 yields the batch.
    return torch.cat(rows)


# Source batch: sentences of word indices, e.g.
# tensor([[3, 5, 0, 0], [3, 6, 4, 1]])
src_seq = _random_padded_batch(src_len, max_num_src_words)

# Target batch, padded to the longest *target* sentence (previously the
# longest source sentence was used by mistake), e.g.
# tensor([[1, 6, 1, 4], [1, 5, 4, 0]])
tgt_seq = _random_padded_batch(tgt_len, max_num_tgt_words)

# print(src_seq)
# print(tgt_seq)


# Word-embedding lookup tables for the source and target vocabularies.
# One extra row is allocated because index 0 is used as the padding value.
src_embedding_table = nn.Embedding(
    num_embeddings=max_num_src_words + 1,
    embedding_dim=model_dim,
)
tgt_embedding_table = nn.Embedding(
    num_embeddings=max_num_tgt_words + 1,
    embedding_dim=model_dim,
)

# Replace every word index in the batches by its model_dim-sized vector.
src_embedding, tgt_embedding = (
    src_embedding_table(src_seq),
    tgt_embedding_table(tgt_seq),
)


# Sinusoidal position embedding.
# position_matrix is a column of positions, i_matrix a row of the
# denominators 10000^(2i / model_dim); dividing them broadcasts to one
# angle per (position, frequency) pair.
position_matrix = torch.arange(max_position_len).unsqueeze(1)
i_matrix = torch.pow(10000, torch.arange(0, model_dim, 2).unsqueeze(0) / model_dim)
pe_angles = position_matrix / i_matrix

pe_embedding_table = torch.zeros(max_position_len, model_dim)
pe_embedding_table[:, 0::2] = torch.sin(pe_angles)  # even columns hold the sine terms
pe_embedding_table[:, 1::2] = torch.cos(pe_angles)  # odd columns hold the cosine terms

# Wrap the fixed table in nn.Embedding so positions can be looked up by
# index; the weights are frozen (requires_grad=False).
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)

# One row of position indices 0..max_len-1 per sentence in the batch.
src_position = (
    torch.arange(max(src_len)).unsqueeze(0).repeat(len(src_len), 1).to(torch.int32)
)
tgt_position = (
    torch.arange(max(tgt_len)).unsqueeze(0).repeat(len(tgt_len), 1).to(torch.int32)
)

# Look up the positional vectors for every position of every sentence.
src_pe_embedding = pe_embedding(src_position)
tgt_pe_embedding = pe_embedding(tgt_position)

# print(src_pe_embedding)  # 这结果的size是2*4*8 意味着batch_size是2 sequence_length是4 dim是8
# print(tgt_pe_embedding)

# Softmax scaling demo: feed the same random scores through softmax after
# multiplying them by a small and by a large factor.
alpha1 = 0.1
alpha2 = 10  # original note: possible effect of fine-tuning
# Stand-in for the Q·K dot-product scores inside Attention(Q, K, V).
score = torch.randn(5)
prob1, prob2 = (torch.softmax(score * scale, dim=-1) for scale in (alpha1, alpha2))
def soft_max(score):
    """Return the softmax of ``score`` taken over its last dimension.

    The dimension is passed explicitly: calling ``F.softmax`` without
    ``dim`` is deprecated, emits a warning and silently picks dim 0 for
    3-D inputs. For the 1-D score vectors used in this demo the result is
    unchanged.

    Args:
        score: tensor of raw (unnormalised) scores.

    Returns:
        Tensor of the same shape whose entries along the last dimension
        sum to 1.
    """
    return F.softmax(score, dim=-1)
# Inspect the Jacobian of softmax for both scale factors
# jacobian1 = torch.autograd.functional.jacobian(soft_max, score*alpha1)
# jacobian2 = torch.autograd.functional.jacobian(soft_max, score*alpha2)
# print(jacobian1)  # alpha1 = 0.1: gradients stay stable, no vanishing
# print(jacobian2)  # many entries collapse to ~0: vanishing gradients
# print(prob1)  # tensor([0.1489, 0.1668, 0.0710, 0.4117, 0.2015])  a larger probability means the two words are more similar
# print(prob2)  # tensor([4.7223e-08, 2.4097e-08, 3.3442e-04, 4.3119e-05, 9.9962e-01])

# Encoder self-attention mask, shape [batch_size, max_src_len, max_src_len].
# The mask is boolean: True marks a (query, key) pair that involves padding.

# Step 1: one validity column per sentence - 1.0 for real tokens, 0.0 for
# padding - giving shape [batch_size, max_src_len, 1].
valid_encoder_position = torch.stack(
    [F.pad(torch.ones(L), (0, max(src_len) - L)) for L in src_len]
).unsqueeze(2)

# Step 2: batched outer product of each column with itself; an entry is 1
# only when both the query and the key position are real tokens.
valid_encoder_position_matrix = (
    valid_encoder_position @ valid_encoder_position.transpose(1, 2)
)

# Step 3: flip valid -> invalid and cast to bool to obtain the mask.
invalid_encoder_position_matrix = 1 - valid_encoder_position_matrix
mask_encoder_self_attention = invalid_encoder_position_matrix.bool()

# Simulated attention scores.
score = torch.randn(batch_size, max(src_len), max(src_len))
# Masked entries get a huge negative score so softmax sends them to ~0.
# Rows that are entirely masked (padding queries) end up uniform.
masked_score = score.masked_fill(mask=mask_encoder_self_attention, value=-1e10)
prob = torch.softmax(masked_score, dim=-1)

for shown in (src_len, score, masked_score, prob):
    print(shown)

tensor([2, 4], dtype=torch.int32)
tensor([[[ 1.5234, -0.6785, -0.7149,  0.0164],
         [ 0.1760, -0.2667, -0.1889,  1.5221],
         [ 0.4881,  0.8749, -0.9304,  0.3813],
         [-0.5346,  0.8688,  0.9247, -0.3984]],

        [[ 0.0555,  2.4331,  1.4201,  1.6171],
         [-0.6059,  0.4174,  1.1163, -0.5960],
         [-0.1111,  0.7496,  1.1567, -2.0045],
         [ 1.0634,  1.4220,  0.0287,  0.5448]]])
tensor([[[ 1.5234e+00, -6.7847e-01, -1.0000e+10, -1.0000e+10],
         [ 1.7597e-01, -2.6667e-01, -1.0000e+10, -1.0000e+10],
         [-1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10],
         [-1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10]],

        [[ 5.5481e-02,  2.4331e+00,  1.4201e+00,  1.6171e+00],
         [-6.0591e-01,  4.1739e-01,  1.1163e+00, -5.9597e-01],
         [-1.1109e-01,  7.4956e-01,  1.1567e+00, -2.0045e+00],
         [ 1.0634e+00,  1.4220e+00,  2.8717e-02,  5.4480e-01]]])
tensor([[[0.9004, 0.0996, 0.0000, 0.0000],
         [0.6089, 0.3911, 0.0000, 0