# Positional embeddings：词位置编码

词嵌入大综合：将词嵌入（token embeddings）与位置嵌入（positional embeddings）相加，得到带有位置信息的输入嵌入（embedding with position）。

## 前置 - 读取输入

In [2]:
import tiktoken

# Load tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
# Read the whole input file into memory as one UTF-8 decoded string.
file_path = '../../input/the-verdict.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

## 数据采样：Token IDs 转 Token embeddings

* 使用 tiktoken 直接将 raw_text 编码为 token_id
* 使用词嵌入 将 token_id 转换为 token_embeddings

### 代码 - 已实现的数据采样

使用 tiktoken 将 Input 转换为 Token IDs

In [3]:
# 之前实现的 data_sampling：

import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

# Subclass of PyTorch's Dataset
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id chunks.

    The text is tokenized once with ``tokenizer.encode``. A window of
    ``max_length`` token ids is then moved over the result in steps of
    ``stride``; each target chunk is the matching input chunk shifted one
    token to the right.

    Args:
        txt: Text passed to ``tokenizer.encode``.
        tokenizer: Object exposing ``encode(txt)`` that returns a sequence
            of token ids.
        max_length: Number of token ids in every input/target chunk.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        ids = tokenizer.encode(txt)
        # The last window start is bounded so the target, which reaches one
        # token further than the input, still has max_length ids.
        window_starts = range(0, len(ids) - max_length, stride)
        # Chunks are stored as PyTorch tensors so a DataLoader can batch them.
        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of stored (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_chunk, target_chunk)`` tensors at ``idx``."""
        pair = (self.input_ids[idx], self.target_ids[idx])
        return pair

def create_dataloader_v1(txt, bath_size = 4,
        max_length = 256, stride = 128, shuffle=True, drop_last=True,
        *, batch_size=None):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    Args:
        txt: Raw text; tokenized with tiktoken's "gpt2" encoding.
        bath_size: Samples per batch. The name is a misspelling of
            ``batch_size`` that is kept so existing callers keep working.
        max_length: Number of token ids in each input/target chunk.
        stride: Distance between the starts of consecutive chunks.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        batch_size: Keyword-only, correctly spelled alias for ``bath_size``.
            When given (not ``None``) it takes precedence over ``bath_size``.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    # Fall back to the legacy misspelled argument when the alias is not used.
    if batch_size is None:
        batch_size = bath_size
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    return dataloader

### 执行数据采样

In [4]:
# Build a loader with 8 samples per batch. Because max_length == stride == 4,
# consecutive 4-token input windows do not overlap.
# NOTE(review): shuffle defaults to True in create_dataloader_v1 and no seed is
# set before this point in the visible cells, so the batch printed below
# presumably differs from run to run.
dataloader = create_dataloader_v1(raw_text, bath_size=8,
        max_length=4, stride=4)
# Fetch only the first batch for inspection.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("inputs:\n", inputs)
print("inputs shape:\n", inputs.shape)
print("targets:\n", targets)

inputs:
 tensor([[  673,  1908,   329,   345],
        [ 5779,    11,   314,  1816],
        [40559, 11959,  1636,    11],
        [  262,  5739,  1444,   510],
        [   13,   764,   764,   764],
        [  290,  8104,   465,  1021],
        [  465, 13476,    11,   339],
        [  760,   526,   198,   198]])
inputs shape:
 torch.Size([8, 4])
targets:
 tensor([[ 1908,   329,   345,  1701],
        [   11,   314,  1816,   572],
        [11959,  1636,    11,   508],
        [ 5739,  1444,   510,   477],
        [  764,   764,   764,  3894],
        [ 8104,   465,  1021,   319],
        [13476,    11,   339,   550],
        [  526,   198,   198,  3347]])


### 词嵌入：特征可计算

使用 token_embedding_layer 将这些 token id 嵌入为 256 维的向量

In [5]:
# A more realistic embedding configuration

import torch

vocab_size = 50257 # vocabulary size (number of rows in the embedding table)
output_dim = 256 # size of each embedding vector

# Fix the random seed so the randomly initialised embedding weights are reproducible
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [6]:
# Look up one output_dim-sized vector for every token id in the batch.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


## 词位置编码：Positional embeddings

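token_embeddings 的张量维度为 8\*4\*256（batch 大小 × 上下文长度 × 嵌入维度）。据此创建一个嵌入维度同为 256 的位置嵌入层：它为 4 个位置各生成一个 256 维向量（维度为 4\*256），与 token_embeddings 相加时会在 batch 维度上自动广播。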

In [7]:
# Number of positions to embed; 4 matches the max_length=4 chunks sampled above.
context_length = 4
# One embedding row per position index, with the same width (output_dim) as the token embeddings.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# torch.arange(4) = [0, 1, 2, 3]
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings)
print(pos_embeddings.shape)

tensor([[ 0.5423, -0.1224, -1.4150,  ...,  0.2515, -2.3067,  0.8155],
        [-0.3973, -1.2575, -1.9800,  ..., -0.1207,  0.3075, -0.6422],
        [ 0.1840,  1.1128,  1.0052,  ...,  0.2081,  0.5531, -1.1619],
        [ 1.4155,  0.6599,  0.3760,  ...,  0.7034, -0.6108,  0.1080]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([4, 256])


In [8]:
# Add the positional embeddings to the token embeddings.
# As the printed shapes show, the (4, 256) positional tensor is broadcast
# over the batch dimension of the (8, 4, 256) token tensor.
input_embeddings = token_embeddings + pos_embeddings
print(token_embeddings.shape)
print(pos_embeddings.shape)
print(input_embeddings.shape)

torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])
