##单词嵌入，作为GPT的数据处理关键技术，本质是将文字非数值数据转换成张量。

In [2]:
with open("The_Verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()
print("The number of character:",len(raw_text))


The number of character: 21941


##制作token
第一步：分词

In [4]:
import re
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:10])

['The', 'Verdict', 'Edith', 'Wharton', '1908', 'Exported', 'from', 'Wikisource', 'on', 'February']


第二步：制做token ID 

In [6]:
vocab = {token: index for index , token in enumerate(preprocessed)} 
print(len(vocab))
for i, item in enumerate(vocab.items()): 
    print(item) 
    if i > 10: break

1235
('The', 4897)
('Verdict', 1)
('Edith', 2)
('Wharton', 3)
('1908', 4)
('Exported', 5)
('from', 4810)
('Wikisource', 4862)
('on', 4651)
('February', 9)
('14', 10)
(',', 4878)


不仅需要encode还需要decode，所以还需要一个id转换成token的函数

In [8]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab                                                   #A
        self.int_to_str = {i:s for s,i in vocab.items()}                          #B

    def encode(self, text):                                                       #C
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):                                                        #D
        text = " ".join([self.int_to_str[i] for i in ids])
        ##"".jion 函数会在拼接时加入空格所以需要使用sub()去除空格
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)                           #E
        return text


In [9]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)
text = tokenizer.decode(ids)
print(text)

[4702, 4422, 4880, 4881, 4871, 3883, 4619, 4521, 4878, 4334, 4236, 4878, 4702, 4491, 4896, 2936, 4396, 4739, 1981, 2193, 4896]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


我们需要添加两个特殊的token 
<|unk|>：该单词不在词汇表中
<|endoftext|>：用来连接不同来源的文本

In [11]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(len(vocab))
for i, item in enumerate(list(vocab.items())[-5:]):
  	print(item)


1237
('younger', 1232)
('your', 1233)
('yourself', 1234)
('<|endoftext|>', 1235)
('<|unk|>', 1236)


In [12]:
# Listing 2.4 A simple text tokenizer that handles unknown words
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int                    #A
                        else "<|unk|>" for item in preprocessed]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])

        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)                    #B
        return text

In [13]:
tokenizer = SimpleTokenizerV2(vocab)
text = "hello world, this is a simple tokenizer"
ids = tokenizer.encode(text)
print(ids)
text = tokenizer.decode(ids)
print(text)

[1236, 1236, 6, 1098, 661, 164, 1236, 1236]
<|unk|> <|unk|>, this is a <|unk|> <|unk|>


更加复杂的编码方式，Byte pair encoding是用于GPT2,GPT3早期的编码方案,BPE说白了就是将单词拆开为高频字母对，这样就可以提高encoding的单词表，在训练的时候即使遇到陌生词也会根据单词构造例如词根进行猜测词义

In [23]:
#!pip install tiktoken
import tiktoken ##version:0.9.0

In [16]:
tokenizer = tiktoken.get_encoding("gpt2")##this encoding is samilar V2
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(ids)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [17]:
text = tokenizer.decode(ids)
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


滑动窗口取样 这个技术就是在生成的token的训练数据集上提取input-target pair 的技术

In [19]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5560


In [20]:
enc_sample = enc_text[50:]
context_size = 4                    #A
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

x: [7026, 15632, 438, 2016]
y:      [15632, 438, 2016, 257]
[7026] ----> 15632
[7026, 15632] ----> 438
[7026, 15632, 438] ----> 2016
[7026, 15632, 438, 2016] ----> 257
 cheap ---->  genius
 cheap genius ----> --
 cheap genius-- ----> though
 cheap genius--though ---->  a


接下来我们需要补充一点pytorch的相关知识，tensor和dataset数据类型
dataset 是用来储存特征-标签的数据类型
dataloader 是用来将dataset制作成batch用于模型训练

In [22]:
#!pip3 install torch torchvision torchaudio



In [5]:
import torch
from torch.utils.data import Dataset,DataLoader
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):              
        #max_length 是每次读取文本的最大长度
        #stride 是步长也就是滑动窗口的大小
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)                                #A

        for i in range(0, len(token_ids) - max_length, stride):          #B
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):                                                     #C
        return len(self.input_ids)

    def __getitem__(self, idx):                                            #D
    	return self.input_ids[idx], self.target_ids[idx]
#A 将整个文本进行分词
#B 使用滑动窗口将书籍分块为最大长度的重叠序列。
#C 返回数据集的总行数
#D 从数据集中返回指定行

In [17]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")                       #A
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)      #B
    dataloader = DataLoader(
        dataset,
      	batch_size=batch_size,
      	shuffle=shuffle,
      	drop_last=drop_last,                                        #C
      	num_workers=0                                               #D
    )

    return dataloader

#A 初始化分词器
#B 创建GPTDatasetV1类
#C drop_last=True会在最后一批次小于指定的batch_size时丢弃该批次，以防止训练期间的损失峰值
#D 用于预处理的CPU进程数量

In [25]:
with open("The_Verdict.txt", "r", encoding="utf-8") as f:
		raw_text = f.read()
dataloader = create_dataloader_v1(
  	raw_text, batch_size=4, max_length=4, stride=1, shuffle=False)
first_batch = next(iter(dataloader))
print(first_batch)

[tensor([[  464,  4643, 11600,   628],
        [ 4643, 11600,   628,   198],
        [11600,   628,   198,   197],
        [  628,   198,   197,   197]]), tensor([[ 4643, 11600,   628,   198],
        [11600,   628,   198,   197],
        [  628,   198,   197,   197],
        [  198,   197,   197,   197]])]


我们在上面已将单词转换为一个个的token ids，但是在传输给下一层神经网络之前，我们需要讲token id转换为嵌入向量，这也就是Embedding层需要做的事。原理其实也非常简单就是嵌入也有个单词表，每个单词对应一个n维的向量（n是由用户指定的），嵌入层会根据token id生成指定的顺序的向量。将token转换为向量的目的：
1.向量可以理解近义词和相似词
2.向量可以考虑句子和段落的上下文关系
3.因为LLM是神经网络，我们需要向量数值计算梯度以及通过反向传播算法来更新梯度

In [13]:
#假设输入的句子token ids
input_ids = torch.tensor([2, 3, 5, 1])
#指定的单词表大小即一共有6个单词的单词表
vocab_size = 6
#指定每个单词的向量维度
output_dim = 3
torch.manual_seed(123)
#创建嵌入层
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
#输出的便是向量单词表
print(embedding_layer.weight)
#应用嵌入层生成该句子的向量
print(embedding_layer(input_ids))


Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [31]:
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[  464,  4643, 11600,   628],
        [  198,   197,   197,   197],
        [  197,   197,  7407,   342],
        [  854, 41328,   628,   628],
        [  198,   198,  1129,  2919],
        [  628,   628,   198,   198],
        [ 3109,  9213,   422, 11145],
        [  271,  1668,   319,  3945]])

Inputs shape:
 torch.Size([8, 4])


In [35]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [37]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([4, 256])
torch.Size([8, 4, 256])


向量需要添加位置编码，这对后来的上下文理解非常重要。主要有以下位置编码：
1.相对位置
2.绝对位置
以下是个练习使用BERT对一串文本进行预处理

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.2-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   --------- ------------------------------ 2.4/10.0 MB 12.2 MB/s eta 0:00:01
   ------------------ --------------------- 4.7/10.0 MB 11.9 MB/s eta 0:00:01
   ---------------------------- ----------- 7.1/10.0 MB 11.8 MB/s eta 0:00:01
   -------------------------------------- - 9.7/10.0 MB 11.8 MB/s eta 0:00:01
   ---------------------------------------- 10.0/10.0 MB 11.7 MB/s eta 0:00:00
Downl

In [15]:
text = "Holle world, nice to meet you!"
import torch
from transformers import BertTokenizer, BertModel
#加载BERT 预训练的tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#加载bert的预训练模型因为bert的嵌入层集成在model中
model = BertModel.from_pretrained('bert-base-uncased')
# 将文本转为 token ids
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# 将 token ids 转换为张量，并增加 batch 维度
input_ids = torch.tensor([token_ids])

# 打印 tokens 和 token ids
print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")
# 获取 BERT 模型的输出（嵌入向量）
with torch.no_grad():
    outputs = model(input_ids)
# 获取 token 嵌入
token_embeddings = outputs[0]
print(f"Token Embeddings Shape: {token_embeddings.shape}")

# 获取位置嵌入
embedding_layer = model.embeddings
position_embeddings = embedding_layer.position_embeddings(torch.arange(input_ids.size(1), dtype=torch.long).unsqueeze(0))
input_embeddings = token_embeddings + position_embeddings
print(f"Input Embeddings Shape: {input_embeddings.shape}")

Tokens: ['ho', '##lle', 'world', ',', 'nice', 'to', 'meet', 'you', '!']
Token IDs: [7570, 6216, 2088, 1010, 3835, 2000, 3113, 2017, 999]
Token Embeddings Shape: torch.Size([1, 9, 768])
Input Embeddings Shape: torch.Size([1, 9, 768])
