In [3]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


In [None]:
import torch
import numpy
import torch.nn as nn
import torch.nn.functional as F

# 关于word embedding，以序列建模为例
# 考虑source sentence 和 target sentence
# 构建序列，序列的字符以其在词表中的索引的形式表示

# 先把目标序列和源序列的长度假设一个值，然后再根据这个长度再去随机生成单词的索引
batch_size = 2

# 单词表大小
max_num_src_words = 8
max_num_tgt_words = 8
model_dim = 8 # 原论文中是512

# 序列的最大长度
max_src_seq_len = 5
max_tgt_seq_len = 5

#src_len = torch.randint(2, 5, (batch_size, ))
#tgt_len = torch.randint(2, 5, (batch_size, ))

src_len = torch.Tensor([2, 4]).to(torch.int32)
tgt_len = torch.Tensor([4, 3]).to(torch.int32)

# 单词索引(Token ID)构成源句子和目标句子，并且做了padding，默认值为0
src_seq = [torch.randint(1, max_num_src_words, (L,)) for L in src_len]
src_seq_pad = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_src_words, (L,)), (0, max_src_seq_len -L)), 0) for L in src_len], dim=0) 
tgt_seq = [torch.randint(1, max_num_tgt_words, (L,)) for L in tgt_len]
tgt_seq_pad = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_tgt_words, (L,)), (0, max_tgt_seq_len -L)), 0) for L in tgt_len], dim=0)

print("Before padding:", '\n', src_seq)
print("After padding: ", '\n', src_seq_pad)

print("-"*100)

print("Before padding:", '\n', tgt_seq)
print("After padding:", '\n', tgt_seq_pad)

print("-"*100)


# 构造embedding
"""为什么要+1：
  在创建嵌入表时，通常需要为每个可能的索引（包括0）提供一个嵌入向量。
  这里的 max_num_src_words 表示源句子中可能出现的最大不同单词的数量。
  如果我们只创建 max_num_src_words 个嵌入向量，那么我们就没有为索引0提供嵌入向量。
  在许多情况下，索引0用于表示特殊的“填充”（padding）标记，这是在处理变长序列时常用的技术。"""
src_embedding_table = nn.Embedding(max_num_src_words+1, model_dim) 
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1, model_dim)
print(src_embedding_table.weight)
print("The size of src_embedding_table.weight is: ", src_embedding_table.weight)
# https://www.bilibili.com/video/BV1cP4y1V7GF?t=2112.4


Before padding: 
 [tensor([6, 3]), tensor([3, 2, 1, 7])]
After padding:  
 tensor([[4, 4, 0, 0, 0],
        [2, 4, 5, 7, 0]])
----------------------------------------------------------------------------------------------------
Before padding: 
 [tensor([3, 4, 5, 1]), tensor([7, 1, 5])]
After padding: 
 tensor([[4, 5, 7, 6, 0],
        [5, 1, 3, 0, 0]])
----------------------------------------------------------------------------------------------------
Parameter containing:
tensor([[ 0.3933, -1.3552,  0.3834, -0.2992, -0.2086,  1.1328, -0.6057, -0.0127],
        [ 0.8701, -0.0343, -0.5782, -0.0905,  2.0679, -1.6769, -0.9496,  0.4667],
        [ 0.3211,  0.5684, -1.4334,  0.3410, -0.5727,  0.5528,  0.2761, -0.9537],
        [-3.3242, -0.6343,  0.8966, -0.1813, -2.4476,  1.9957, -0.8261, -0.8904],
        [ 0.0540, -1.0303,  1.0239,  0.5747,  0.3174, -1.3341,  1.1541, -1.6357],
        [ 0.1257,  1.6404, -1.4771,  0.6476,  0.6719,  0.5688,  0.9016,  1.1626],
        [ 0.9428, -0.2309, -0.

In [None]:
# torch.nn.Embedding()

import torch
import torch.nn as nn

embedding = nn.Embedding(10, 3) # 定义一个embedding模块，包含了一个长度为10的张量，每个张量的大小是3
# print(embedding)
input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
e = embedding(input)
print(e) # e.shape: [2, 4, 3], 在经过nn.embedding后，从[2, 4]维度变换为[2, 4, 3]，其实就是[2, 4]中的每个值作为索引去nn.embedding中取对应的权重
print("The shape of e is: ", e.shape)
print(e[1][3]) # [0.3204,  1.6376, -0.4442]，相应的3即为embedding后的权重
print("-"*100)
print(embedding.weight) 
print("The shape of embedding.weight is: ", embedding.weight.shape)

tensor([[[-1.4551,  0.3129,  1.1398],
         [-2.1301,  0.4180,  0.4204],
         [-0.1716, -1.8632,  0.3244],
         [-0.5084, -1.6552, -1.8416]],

        [[-0.1716, -1.8632,  0.3244],
         [ 0.6709,  0.4043, -2.2828],
         [-2.1301,  0.4180,  0.4204],
         [ 0.3204,  1.6376, -0.4442]]], grad_fn=<EmbeddingBackward0>)
The shape of e is:  torch.Size([2, 4, 3])
tensor([ 0.3204,  1.6376, -0.4442], grad_fn=<SelectBackward0>)
----------------------------------------------------------------------------------------------------
Parameter containing:
tensor([[-1.7047,  0.8644, -0.4895],
        [-1.4551,  0.3129,  1.1398],
        [-2.1301,  0.4180,  0.4204],
        [ 0.6709,  0.4043, -2.2828],
        [-0.1716, -1.8632,  0.3244],
        [-0.5084, -1.6552, -1.8416],
        [-1.5301,  0.2444, -0.6057],
        [ 2.0089,  1.4314,  0.9333],
        [-2.0367,  0.4912,  0.7898],
        [ 0.3204,  1.6376, -0.4442]], requires_grad=True)
The shape of embedding.weight is:  torch.Si

In [None]:
# encoder self-attention

import torch

seq_len = 5
batch_size = 2
attention_mask = torch.zeros((batch_size, seq_len)) # 全0矩阵，表示没有padding
# print(attention_mask)

attention_mask[:, 3:] = 1 # 将需要padding的位置设置成1
# print(attention_mask)

extended_attention_mask = attention_mask.unsqueeze(1)

print(extended_attention_mask)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])
tensor([[0., 0., 0., 1., 1.],
        [0., 0., 0., 1., 1.]])
tensor([[[0., 0., 0., 1., 1.]],

        [[0., 0., 0., 1., 1.]]])


In [None]:
#