In [2]:
# pip install tiktoken

In [3]:
# Import the submodule explicitly: a bare `import importlib` does not guarantee
# that `importlib.metadata` is loaded, so the attribute access below could fail
# with AttributeError on a fresh kernel.
import importlib.metadata

import tiktoken

# Report the installed tiktoken version so the outputs below can be reproduced.
print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.5.2


In [4]:
# Load tiktoken's "gpt2" encoding; the cells below use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
# Sample sentence containing the <|endoftext|> marker and a made-up word.
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."

# Pass <|endoftext|> via allowed_special so it is accepted as a special token.
integers = tokenizer.encode(
    text,
    allowed_special={"<|endoftext|>"},
)
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [6]:
# Round-trip check: turn the token IDs back into text and show the result.
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


Two noteworthy observations:
- The `<|endoftext|>` token is assigned a relatively large token ID, namely 50256.
  - The BPE tokenizer used to train GPT-2, GPT-3, and the original ChatGPT model has a total vocabulary size of 50,257, so `<|endoftext|>` holds the largest token ID.
- The BPE tokenizer above encodes and decodes unknown words, such as "someunknownPlace", correctly.

In [7]:
# Encode a made-up string to see how an unfamiliar word is split into token IDs.
tokenizer.encode("Akwirw ier")

[33901, 86, 343, 86, 220, 959]

In [8]:
# Decode each token ID from the previous cell on its own to reveal the
# subword piece it stands for (one piece per output line).
for token_id in (33901, 86, 343, 86, 220, 959):
    print(tokenizer.decode([token_id]))

Ak
w
ir
w
 
ier


In [9]:
# Encoding followed by decoding gives back the original string.
tokenizer.decode(tokenizer.encode("Akwirw ier"))

'Akwirw ier'

BPE (byte pair encoding)
- It builds its vocabulary by iteratively merging frequent characters into subwords and frequent subwords into words.
- For example, BPE starts by adding all individual single characters to its vocabulary ("a", "b", ...).
  - In the next stage, it merges character combinations that frequently occur together into subwords. For example, "d" and "e" may be merged into the subword "de", which is common in many English words like "define", "depend", "made", and "hidden".
- The merges are determined by a frequency cutoff.

# Data sampling with a sliding window

In [10]:
# Read the whole text file into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Tokenize the full text and report how many token IDs it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [11]:
# Skip the first 50 token IDs; the examples below work on the remainder.
enc_sample = enc_text[50:]

In [12]:
# Number of tokens in each input window.
context_size = 4

# x is the first window of tokens; y is the same window shifted one position
# to the right, i.e. the next token for every position in x.
x = enc_sample[0:context_size]
y = enc_sample[1 : 1 + context_size]
print(f"x: {x}", f"y:      {y}", sep="\n")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [13]:
# Grow the context one token at a time; the token right after the context
# is the value to be predicted.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [14]:
# Same context/target pairs as before, but decoded back to text for readability.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(
        tokenizer.decode(context),
        "---->",
        tokenizer.decode([desired]),
    )

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a
