In [1]:
from importlib.metadata import version

# Look up the installed tiktoken release first, then report it, so the
# recorded outputs below can be tied to a specific library version.
tiktoken_version = version("tiktoken")
print(f"tiktoken version:{tiktoken_version}")

tiktoken version:0.7.0


In [2]:
import tiktoken

# Sample sentence used for the tokenizer comparison in the cells below.
text = "Hello, world. Is this-- a test?"

# tiktoken's encoding registered under the name "gpt2".
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Encode the sample sentence into token ids with tiktoken. "<|endoftext|>" is
# the only special token the encoder is permitted to match in the input.
permitted_special = {"<|endoftext|>"}
integers = tokenizer.encode(text, allowed_special=permitted_special)
print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [4]:
# Decode the token ids back into text; the recorded output reproduces the
# sample sentence, i.e. the encode/decode round-trip is lossless here.
strings = tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


In [5]:
# Vocabulary size reported by the tiktoken "gpt2" encoding.
print(tokenizer.n_vocab)

50257


# 使用GPT-2中使用的原始BPE实现

In [6]:
from bpe_openai_gpt2 import get_encoder, download_vocab

In [9]:
# Fetch the vocabulary files needed by the original GPT-2 BPE code; the
# recorded run shows encoder.json and vocab.bpe being downloaded.
# NOTE(review): presumably saved under ./gpt2_model, where the encoder is
# loaded from later -- confirm against bpe_openai_gpt2.
download_vocab()

Fetching encoder.json: 1.04Mit [00:01, 804kit/s]                                                    
Fetching vocab.bpe: 457kit [00:00, 499kit/s]                                                        


In [10]:
# Build the encoder from the project-local bpe_openai_gpt2 module.
# NOTE(review): presumably reads encoder.json / vocab.bpe from
# ./gpt2_model (models_dir + model_name) -- confirm against get_encoder.
origin_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")

In [11]:
# Encode the same sample sentence with the bpe_openai_gpt2 encoder. This
# rebinds `integers`; the recorded ids are identical to the tiktoken result.
integers = origin_tokenizer.encode(text)

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [13]:
# Decode with the bpe_openai_gpt2 encoder; this rebinds `strings`, and the
# recorded output again reproduces the sample sentence.
strings = origin_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


# 通过Hugging Face的Transformers使用BPE

In [14]:
import transformers

# Display the installed transformers version (a bare last expression is
# rendered as the cell's output).
transformers.__version__

'4.41.2'

In [16]:
from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer from Hugging Face. In the recorded run
# this fetched tokenizer_config.json, vocab.json, merges.txt, tokenizer.json
# and config.json.
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [17]:
# Encode the original sample sentence (`text`) so that all three tokenizers
# are compared on exactly the same input. The previous input, `strings`, is
# the result of an earlier decode and only equals `text` if that round-trip
# was lossless, which made this comparison depend on another cell's output.
hf_tokenizer(text)["input_ids"]

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

In [18]:
# Read the whole benchmark text into memory as a single UTF-8 string.
verdict_path = '../../../Data/the-verdict.txt'
with open(verdict_path, mode='r', encoding='utf-8') as verdict_file:
    raw_text = verdict_file.read()

In [19]:
# Benchmark: bpe_openai_gpt2 encoder on the full text.
%timeit origin_tokenizer.encode(raw_text)

12.5 ms ± 440 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
# Benchmark: tiktoken "gpt2" encoding on the full text.
%timeit tokenizer.encode(raw_text)

2.9 ms ± 64 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [23]:
# Benchmark: Hugging Face GPT2Tokenizer via its `encode` method on the full text.
%timeit hf_tokenizer.encode(raw_text)

22.8 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
# Benchmark: Hugging Face GPT2Tokenizer called directly with truncation enabled.
# NOTE(review): max_length=5145 is presumably the token count of raw_text, so
# nothing is actually truncated -- confirm, and consider deriving it from the
# data instead of hard-coding it.
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)['input_ids']

26.3 ms ± 3.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
