# 比较不同字节对编码（BPE）的实现和效果

In [23]:
import numpy as np


In [28]:
# Make the bonus byte-pair-encoder directory importable by adding it to sys.path.
import sys
import os

# Resolve the target directory relative to the current working directory:
# go two levels up from the cwd, then down into ch02/02_bonus_bytepair-encoder.
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch02", "02_bonus_bytepair-encoder")

# Guard the append so re-running this cell does not pile up duplicate entries.
if file_path not in sys.path:
    sys.path.append(file_path)


### 使用tiktoken的BPE

In [1]:
from importlib.metadata import version

# Look up the installed tiktoken release, then report it.
tiktoken_version = version("tiktoken")
print("tiktoken version:", tiktoken_version)

tiktoken version: 0.9.0


In [2]:
import tiktoken

# Sample sentence encoded by the tokenizers below.
text = "Hello, world. Is this-- a test?"

# GPT-2 encoding as provided by tiktoken.
tik_tokenizer = tiktoken.get_encoding("gpt2")



In [11]:
# Encode the sample text with tiktoken. The special-token name must be spelled
# exactly "<|endoftext|>"; a misspelled entry in `allowed_special` matches no
# special token, so the real token would stay disallowed.
integers = tik_tokenizer.encode(
    text,
    allowed_special={"<|endoftext|>"}
)

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [36]:
# Decode each token ID on its own to show which text piece it maps to.
for token_id in integers:
    token_text = tik_tokenizer.decode([token_id])
    print(f"{token_id}:\t '{token_text}'")

15496:	 'Hello'
11:	 ','
995:	 ' world'
13:	 '.'
1148:	 ' Is'
428:	 ' this'
438:	 '--'
257:	 ' a'
1332:	 ' test'
30:	 '?'


In [6]:
# Print the `n_vocab` attribute of the tiktoken GPT-2 encoding (its vocabulary size).
print(tik_tokenizer.n_vocab)

50257


### 使用GPT-2中的原始BPE实现

In [29]:
from bpe_openai_gpt2 import get_encoder, download_vocab

In [30]:
# Fetch the GPT-2 vocabulary files; the progress output lists encoder.json and vocab.bpe.
# NOTE(review): presumably needs network access and writes under the working directory — confirm.
download_vocab()

Fetching encoder.json: 1.04Mit [00:01, 696kit/s]                                                    
Fetching vocab.bpe: 457kit [00:00, 498kit/s]                                                        


In [31]:
# Build the original GPT-2 BPE tokenizer via the helper imported from bpe_openai_gpt2.
# NOTE(review): presumably reads the files fetched by download_vocab() from ./gpt2_model — confirm.
orig_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")

In [32]:
# Encode the same sample text with the original implementation.
# Note: this rebinds `integers`, replacing the tiktoken result from earlier.
integers = orig_tokenizer.encode(text)

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [41]:
# Decode the full ID list back into a single string to check the round trip.
strings = orig_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


In [35]:
# Decode each token ID individually with the original GPT-2 tokenizer (this
# section's subject) rather than tiktoken. A dedicated name is used for the
# per-token text so the full decoded sentence held in `strings` is not
# overwritten by the loop.
for token_id in integers:
    token_text = orig_tokenizer.decode([token_id])
    print(f"{token_id}:\t '{token_text}'")

15496:	 'Hello'
11:	 ','
995:	 ' world'
13:	 '.'
1148:	 ' Is'
428:	 ' this'
438:	 '--'
257:	 ' a'
1332:	 ' test'
30:	 '?'


### 通过Hugging Face transformers使用BPE

In [38]:
import transformers

# Display the installed transformers version as the cell's output.
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.49.0'

In [39]:
from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer through the Hugging Face API.
# NOTE(review): presumably downloads files on first use — confirm.
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [42]:
# Tokenize the original sample sentence directly. Using `text` keeps this cell
# independent of whatever value `strings` holds after the preceding cells, so
# the result is the same under Restart & Run All.
hf_tokenizer(text)["input_ids"]

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

## 快速性能测试

In [43]:
# Read the whole sample file into memory for the timing runs.
with open('the-verdict.txt', mode='r', encoding='utf-8') as verdict_file:
    raw_text = verdict_file.read()

In [44]:
# Time the original GPT-2 BPE implementation on the full text.
%timeit orig_tokenizer.encode(raw_text)

4.07 ms ± 49.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [46]:
# Time tiktoken on the same text.
%timeit tik_tokenizer.encode(raw_text)

1.11 ms ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [47]:
# Time the Hugging Face tokenizer on the same text (no length arguments given).
%timeit hf_tokenizer(raw_text)["input_ids"]

Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors


11.3 ms ± 100 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [48]:
# Repeat the Hugging Face timing with max_length=5145, the token count reported in the
# warning from the previous run, and truncation enabled.
# NOTE(review): presumably intended to avoid the sequence-length warning — confirm.
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"]

11.1 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
