In [2]:
import torch
import torch.nn as nn
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# end-to-end on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

## 一、词元嵌入模型

### 模型实现

In [21]:
## Maps a sequence of input token IDs to their embedding vectors.
class TokenEmbedding(nn.Embedding):
    """Token lookup table: a thin ``nn.Embedding`` subclass.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        d_model: Dimensionality of each token vector.
        device: Device on which the embedding weight is created.
        padding_idx: Index forwarded to ``nn.Embedding`` as its padding index.
    """

    def __init__(self, vocab_size, d_model, device="cpu", padding_idx=1):
        # Everything is delegated to nn.Embedding; this class only renames
        # the constructor arguments and supplies defaults.
        super().__init__(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
            padding_idx=padding_idx,
            device=device,
        )

In [22]:
## Demo sizes: vocabulary, vector width, tokens per sentence, sentences per batch.
vocab_size = 100
d_model = 64
n_tokens = 10
n_sentences = 3

token_embedding = TokenEmbedding(vocab_size, d_model, device=device)

In [23]:
## Random token IDs in [0, vocab_size), one row per sentence.
input_ids = torch.randint(
    low=0,
    high=vocab_size,
    size=(n_sentences, n_tokens),
    device=device,
)
input_ids

tensor([[10, 90, 40, 27, 24, 38, 47,  9, 61, 55],
        [ 7,  6, 37,  4,  8,  2, 90, 78, 34, 86],
        [92, 18, 34,  1, 50, 99, 24, 84, 45, 63]], device='cuda:0')

In [6]:
## Look the IDs up without recording gradients, then show the result's shape.
with torch.no_grad():
    embedding_matrix = token_embedding(input_ids)
embedding_matrix.shape

torch.Size([3, 10, 64])

### 嵌入的细节

In [7]:
import torch.nn.functional as F

vocab_size = 100
d_model = 64
n_tokens = 10
n_sentences = 1
## token_embedding.weight.size() = torch.Size([100, 64])

## Three random token IDs, flattened to a vector: input_ids.size() = torch.Size([3])
input_ids = torch.randint(0, vocab_size, size=(1, 3), device=device).flatten()

## One one-hot row per token ID, cast to float so it can be matrix-multiplied
## with the embedding weight.
one_hot = F.one_hot(input_ids, num_classes=vocab_size).float()
print(one_hot.size())

## Multiplying the one-hot rows by the weight matrix selects the same rows as
## the embedding lookup. torch.equal returns a plain Python bool.
torch.equal(one_hot @ token_embedding.weight, token_embedding(input_ids))

torch.Size([3, 100])


True

 ## 二、位置嵌入模型

### 模型实现

In [26]:
class PositionEncoding(nn.Module):
    """Fixed sinusoidal position-encoding table.

    A ``(max_length, d_model)`` table is built once at construction time:
    even columns hold ``sin(pos / 10000 ** (2i / d_model))`` and odd columns
    hold the matching ``cos`` term. ``forward`` returns the first
    ``seq_size`` rows of that table.

    Args:
        d_model: Width of each position vector. Odd widths are supported.
        max_length: Number of positions to precompute.
        device: Device on which the table is created.
    """

    def __init__(self, d_model, max_length, device='cpu'):
        super(PositionEncoding, self).__init__()

        # Positions as a column vector: (max_length, 1).
        pos = torch.arange(0, max_length, device=device).float().unsqueeze(dim=1)

        # Even dimension indices 0, 2, 4, ...: one frequency per sin/cos pair.
        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # Shared by the sin and cos halves: (max_length, ceil(d_model / 2)).
        angles = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros(max_length, d_model, device=device)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so the surplus cos term is dropped instead of raising a shape error.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a non-persistent buffer: it follows .to()/.cuda() with
        # the module, carries no gradient, and stays out of state_dict so
        # checkpoints saved before this change still load.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """Return the encoding rows for the sequence length of ``x``.

        Args:
            x: A 2-D tensor unpacked as ``(batch_size, seq_size)``; only its
                size is read, never its values.

        Returns:
            Tensor of shape ``(seq_size, d_model)``.

        Raises:
            ValueError: If ``seq_size`` exceeds the precomputed ``max_length``.
        """
        _, seq_size = x.size()

        table_length = self.encoding.size(0)
        if seq_size > table_length:
            # Slicing would silently return fewer rows than requested.
            raise ValueError(
                f"sequence length {seq_size} exceeds max_length {table_length}"
            )

        return self.encoding[:seq_size, :]

In [20]:
## Build a 128-position table of 128-wide encodings.
d_model = 128
max_length = 128
position_embedding = PositionEncoding(d_model, max_length, device)

## Only the size of x is used by the encoder; the IDs themselves are random.
vocab_size = 100
n_tokens = 10
n_sentences = 1
x = torch.randint(
    low=0,
    high=vocab_size,
    size=(n_sentences, n_tokens),
    device=device,
)

position_encoding = position_embedding(x)
position_encoding.shape

torch.Size([10, 128])

## 三、Transformer的编码器

In [27]:

class TransformerEmbedding(nn.Module):
    """Token embedding plus position encoding, followed by dropout.

    Args:
        vocab_size: Vocabulary size passed to ``TokenEmbedding``.
        d_model: Vector width passed to both sub-modules.
        max_length: Table length passed to ``PositionEncoding``.
        drop_prob: Probability used by the ``nn.Dropout`` layer.
        device: Device passed to both sub-modules.
        padding_idx: Padding index passed to ``TokenEmbedding``.
            NOTE(review): the default here is -1 while ``TokenEmbedding``
            itself defaults to 1 -- confirm which one is intended.
    """

    def __init__(self, vocab_size, d_model, max_length, drop_prob=0.1,
                 device="cpu", padding_idx=-1):
        super().__init__()

        # Sub-modules are created in the same order as before so parameter
        # initialisation and module registration are unchanged.
        self.token_embedding = TokenEmbedding(
            vocab_size,
            d_model,
            device=device,
            padding_idx=padding_idx,
        )
        self.pos_encoding = PositionEncoding(d_model, max_length, device)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, input_ids):
        """Embed ``input_ids``, add the position encoding, and apply dropout."""
        # The position table is added to the token embeddings by broadcasting.
        summed = self.token_embedding(input_ids) + self.pos_encoding(input_ids)
        return self.dropout(summed)

In [29]:
## Demo sizes for the combined embedding layer.
vocab_size = 100
d_model = 128
max_length = 100
n_tokens = 32
n_sentences = 3

## drop_prob and padding_idx are left at the class defaults.
tf_emb = TransformerEmbedding(
    vocab_size=vocab_size, d_model=d_model, max_length=max_length, device=device
)

In [30]:
input_ids = torch.randint(0, vocab_size, size=(n_sentences, n_tokens), device=device)