In [1]:
# 使用sys.path添加上级目录
import sys
import os
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch05", "09_extending-tokenizers")
print(file_path)
sys.path.append(file_path)

import torch
if torch.cuda.is_available():
   device = torch.device("cuda")
elif torch.backends.mps.is_available():
   device = torch.device("mps")
else:
   device = torch.device("cpu")

/Users/young/project/llmProject/LLMs-from-scratch-CN/ch05/09_extending-tokenizers


In [2]:
import tiktoken

base_tokenizer = tiktoken.get_encoding("gpt2")
sample_text = "Hello, MyNewToken_1 is a new token. <|endoftext|>"

token_ids = base_tokenizer.encode(sample_text, allowed_special={"<|endoftext|>"})
print(token_ids)

[15496, 11, 2011, 3791, 30642, 62, 16, 318, 257, 649, 11241, 13, 220, 50256]


In [3]:
for token_id in token_ids:
    print(f"{token_id} -> '{base_tokenizer.decode([token_id])}'")

15496 -> 'Hello'
11 -> ','
2011 -> ' My'
3791 -> 'New'
30642 -> 'Token'
62 -> '_'
16 -> '1'
318 -> ' is'
257 -> ' a'
649 -> ' new'
11241 -> ' token'
13 -> '.'
220 -> ' '
50256 -> '<|endoftext|>'


# 1. 添加特殊的token

In [4]:
custom_tokens = ["MyNewToken_1", "MyNewToken_2"]
custom_token_ids = {
    token: base_tokenizer.n_vocab + i for i, token in enumerate(custom_tokens)
}

In [6]:
extended_tokenizer = tiktoken.Encoding(
    name="gpt2_custom",
    pat_str=base_tokenizer._pat_str,
    mergeable_ranks=base_tokenizer._mergeable_ranks,
    special_tokens={**base_tokenizer._special_tokens, **custom_token_ids}
)

In [7]:
special_tokens_set = set(custom_tokens) | {"<|endoftext|>"}

token_ids = extended_tokenizer.encode(
    "Sample text with MyNewToken_1 and MyNewToken_2. <|endoftext|>",
    allowed_special=special_tokens_set
)

In [8]:
for token_id in token_ids:
    print(f"{token_id} -> '{extended_tokenizer.decode([token_id])}'")

36674 -> 'Sample'
2420 -> ' text'
351 -> ' with'
220 -> ' '
50257 -> 'MyNewToken_1'
290 -> ' and'
220 -> ' '
50258 -> 'MyNewToken_2'
13 -> '.'
220 -> ' '
50256 -> '<|endoftext|>'


# 2. 更新预训练的LLM

## 2.1 加载预训练的GPT

In [9]:
from gpt_download import download_and_load_gpt2

settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")



File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [10]:
# Relative import from the gpt_download.py contained in this folder
from previous_chapters import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

# Define model configurations in a dictionary for compactness
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Copy the base configuration and update with specific model settings
model_name = "gpt2-small (124M)"  # Example model name
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

gpt = GPTModel(NEW_CONFIG)
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

## 2.2 使用预训练过的GPT

In [11]:
sample_text = "Sample text with MyNewToken_1 and MyNewToken_2. <|endoftext|>"

original_token_ids = base_tokenizer.encode(
    sample_text, allowed_special={"<|endoftext|>"}
)

In [12]:
new_token_ids = extended_tokenizer.encode(
    "Sample text with MyNewToken_1 and MyNewToken_2. <|endoftext|>",
    allowed_special=special_tokens_set
)

In [13]:
import torch

# 由于未更新emb，后导致报错
with torch.no_grad():
    out = gpt(torch.tensor([original_token_ids]))

print(out)

tensor([[[ 0.5708, -0.6516,  0.0559,  ..., -1.0257,  0.2292,  0.1084],
         [ 0.4205,  0.1292,  0.0429,  ...,  0.8149, -0.1634, -0.2479],
         [ 0.0087,  0.3251, -1.1767,  ..., -1.9761, -0.9104, -0.1551],
         ...,
         [ 0.9823, -0.0937, -0.7484,  ...,  0.3498,  0.0624,  0.2992],
         [ 0.0432,  0.1954, -0.3456,  ..., -0.7580,  0.3674, -0.0850],
         [ 0.5725,  0.5290,  0.2465,  ..., -0.5597,  0.6190, -0.0398]]])


## 2.3 更新嵌入层

In [14]:
gpt.tok_emb

Embedding(50257, 768)

In [15]:
num_tokens, emb_size = gpt.tok_emb.weight.shape
new_num_tokens = num_tokens + 2

# 创建新的emb层
new_embedding = torch.nn.Embedding(new_num_tokens, emb_size)

# 初始化新的emb层
new_embedding.weight.data[:num_tokens] = gpt.tok_emb.weight.data

# 更新模型的emb层
gpt.tok_emb = new_embedding

print(gpt.tok_emb)

Embedding(50259, 768)


## 2.4 更新输出层

In [16]:
gpt.out_head

Linear(in_features=768, out_features=50257, bias=False)

In [18]:
original_out_features, original_in_features = gpt.out_head.weight.shape

# 更新输出层的维度
new_out_features = original_out_features + 2

# 创建新的输出层
new_linear = torch.nn.Linear(original_in_features, new_out_features)

# 初始化新的输出层
with torch.no_grad():
    new_linear.weight[:original_out_features] = gpt.out_head.weight
    if gpt.out_head.bias is not None:
        new_linear.bias[:original_out_features] = gpt.out_head.bias

# 替换输出层
gpt.out_head = new_linear

print(gpt.out_head)

Linear(in_features=768, out_features=50259, bias=True)


In [20]:
with torch.no_grad():
    output = gpt(torch.tensor([original_token_ids]))
print(output)

tensor([[[ 0.5592, -0.6727,  0.0343,  ...,  0.0925, -0.2522, -0.6455],
         [ 0.4088,  0.1081,  0.0213,  ..., -0.2637, -0.3821, -0.3910],
         [-0.0029,  0.3040, -1.1983,  ..., -0.1709, -1.3955, -0.1974],
         ...,
         [ 0.9707, -0.1148, -0.7700,  ...,  0.2833, -1.1440, -0.3543],
         [ 0.0316,  0.1743, -0.3673,  ..., -0.1009, -0.3407, -0.1852],
         [ 0.5609,  0.5079,  0.2249,  ..., -0.0556, -1.3220, -0.6972]]])


In [21]:
# 测试更新后的模型
with torch.no_grad():
    output = gpt(torch.tensor([new_token_ids]))
print(output)

tensor([[[ 0.5592, -0.6727,  0.0343,  ...,  0.0925, -0.2522, -0.6455],
         [ 0.4088,  0.1081,  0.0213,  ..., -0.2637, -0.3821, -0.3910],
         [-0.0029,  0.3040, -1.1983,  ..., -0.1709, -1.3955, -0.1974],
         ...,
         [ 0.3398, -0.1918, -1.0059,  ...,  0.3005, -0.3921, -0.5502],
         [ 0.5133, -0.1249, -0.1879,  ..., -0.5016, -0.0565, -0.5247],
         [ 0.3893, -0.0723,  0.0928,  ...,  0.6912, -0.6672, -0.6765]]])


## 关于权重共享

In [22]:
gpt.out_head.weight = gpt.tok_emb.weight

In [24]:
with torch.no_grad():
    output = gpt(torch.tensor([new_token_ids]))
output

tensor([[[-66.2201, -22.1279,  63.5223,  ...,  25.7582, -50.2610, -13.6285],
         [-11.4081,  45.6967, -16.4285,  ...,   5.6111,   4.0119,  28.4471],
         [  1.1481,   1.0497,  11.1086,  ...,  46.5169, -36.9159, -42.0516],
         ...,
         [-33.4828,   9.1330,  48.9171,  ...,  -7.4979,   6.5314, -33.6417],
         [-29.0724,  16.2150,  13.3827,  ...,  43.9154,  41.5060,   7.2719],
         [  1.7947,  -7.3529,   6.0034,  ..., 441.6958, -24.7384, -50.4073]]])