词嵌入大综合embedding with position

In [6]:
# A more realistic embedding setup: a large vocabulary and a
# 256-dimensional embedding vector per token.

import torch

vocab_size = 50257 # vocabulary size (presumably the size of the "gpt2" tokenizer vocabulary -- confirm)
output_dim = 256 # embedding size (length of each token's vector)

# Fix the random seed so that the randomly initialised Embedding weights,
# and therefore the results of this experiment, are reproducible.
torch.manual_seed(123)
# Lookup table with one output_dim-sized vector per token id in [0, vocab_size).
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [7]:
# 之前实现的data_sampling：

import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset for next-token prediction (PyTorch ``Dataset`` subclass).

    The text is tokenized once and cut into windows of ``max_length`` token
    ids whose starting points are ``stride`` tokens apart. Each sample pairs
    a window with the same window shifted one token to the right, so
    ``target[k]`` is the token that follows ``input[k]`` in the text.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(txt)`` that returns a sequence
            of token ids.
        max_length: Number of token ids in every window.
        stride: Distance, in tokens, between the starts of two consecutive
            windows.

    A text that encodes to ``max_length`` tokens or fewer yields an empty
    dataset.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        encoded = tokenizer.encode(txt)

        # Stopping before len(encoded) - max_length guarantees that every
        # target window, which reaches one token further than its input,
        # still has the full max_length tokens.
        window_starts = range(0, len(encoded) - max_length, stride)

        # Convert each window to a tensor up front so that indexing returns
        # objects PyTorch can batch directly.
        self.input_ids = [
            torch.tensor(encoded[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        pair = (self.input_ids[idx], self.target_ids[idx])
        return pair

def create_dataloader_v1(txt, bath_size = 4,
        max_length = 256, stride = 128, shuffle=True, drop_last=True,
        *, batch_size=None):
    """Build a ``DataLoader`` of sliding-window (input, target) token batches.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the resulting dataset is then served by a PyTorch
    ``DataLoader``.

    Args:
        txt: Raw text to tokenize.
        bath_size: Number of samples per batch. The name is a historical
            misspelling of ``batch_size`` and is kept so that existing
            callers keep working.
        max_length: Number of token ids in every sample window.
        stride: Distance, in tokens, between the starts of consecutive
            windows.
        shuffle: Forwarded to ``DataLoader``; whether samples are reshuffled.
        drop_last: Forwarded to ``DataLoader``; whether a trailing batch
            smaller than the batch size is discarded.
        batch_size: Keyword-only, correctly spelled alias for ``bath_size``.
            When it is not ``None`` it takes precedence over ``bath_size``.

    Returns:
        A ``DataLoader`` over the ``GPTDatasetV1`` built from ``txt``.
    """
    # Prefer the correctly spelled keyword when the caller supplied it.
    effective_batch_size = bath_size if batch_size is None else batch_size

    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset, batch_size=effective_batch_size, shuffle=shuffle,
        drop_last=drop_last)
    return dataloader

In [8]:
# Tokenizer using tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
# Relative path to the text used as sample data.
file_path = '../input/the-verdict.txt'
# Read the whole file into memory as a single UTF-8 decoded string.
with open(file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

In [9]:
# Build a loader that yields batches of 8 samples, each a window of 4 token
# ids. Because stride equals max_length, consecutive windows do not overlap.
# NOTE: `bath_size` (sic) is the keyword exactly as spelled in the
# create_dataloader_v1 signature.
dataloader = create_dataloader_v1(raw_text, bath_size=8,
        max_length=4, stride=4)
# Fetch only the first batch for inspection. create_dataloader_v1 shuffles by
# default, so this batch is a random selection of windows from the text.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("inputs:\n", inputs)
print("inputs shape:\n", inputs.shape)
# Each target row is the matching input row shifted one token to the right.
print("targets:\n", targets)

inputs:
 tensor([[31640,    12,    67, 20811],
        [   11,   262,  7888,  7586],
        [  616, 20348,   287,  5963],
        [  257, 29844,   286, 12749],
        [   11,  2087,   329,   616],
        [26394,    12,   301,   971],
        [22474,    62,  4964,   502],
        [ 1517,   339,   550,  1760]])
inputs shape:
 torch.Size([8, 4])
targets:
 tensor([[   12,    67, 20811,     1],
        [  262,  7888,  7586,  9813],
        [20348,   287,  5963,    11],
        [29844,   286, 12749,    11],
        [ 2087,   329,   616, 35957],
        [   12,   301,   971,    13],
        [   62,  4964,   502,    11],
        [  339,   550,  1760,   438]])


使用 token_embedding_layer 将这些 token id 嵌入为 256 维的向量

In [10]:
# Look up the embedding vector of every token id in the batch; the result
# keeps the shape of `inputs` and gains a trailing dimension of size output_dim.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


位置编码

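token 嵌入的形状为 8×4×256（batch 大小 × 序列长度 × 嵌入维度）。位置嵌入层只需为序列中的每个位置生成一个 256 维向量，因此其输出形状为 4×256（context_length × output_dim）；与 token 嵌入相加时会沿 batch 维度自动广播。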

In [12]:
# Number of positions to embed; equals the max_length=4 window size passed
# to create_dataloader_v1 above.
context_length = 4
# One output_dim-sized vector per position index in [0, context_length).
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# torch.arange(4) = [0, 1, 2, 3]
# Looking up every position index once gives one row per position.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings)
print(pos_embeddings.shape)

tensor([[ 1.8302, -0.9019, -0.7203,  ...,  0.1260,  0.2663, -2.1815],
        [ 1.3574, -1.8919,  0.2680,  ..., -1.0383, -0.5890,  1.1063],
        [-0.2980, -0.1013,  2.1897,  ..., -1.3644,  0.7289, -0.5926],
        [ 0.9600, -1.0767,  0.3717,  ..., -0.0075,  0.7468, -0.7247]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([4, 256])


In [15]:
# Add the positional embeddings to the token embeddings.
# pos_embeddings has no batch dimension, so the addition broadcasts it across
# every sample in the batch (see the shapes printed below).
input_embeddings = token_embeddings + pos_embeddings
print(token_embeddings.shape)
print(pos_embeddings.shape)
print(input_embeddings.shape)

torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])
