In [8]:
from collections import Counter, deque
import os, json
import regex as re

In [2]:
# Map each sample character (Latin a-z plus eight Greek letters) to its Unicode
# code point. A comprehension keeps the loop variable out of the notebook's
# global namespace.
str_to_uni = {
    letter: ord(letter)
    for letter in "abcdefghijklmnopqrstuvwxyzαβγδεζηθ"
}

In [3]:
# Sample paragraph used to compare the number of characters with the number of
# UTF-8 bytes; the non-ASCII punctuation takes more than one byte per character.
text = """Most devs love jumping straight into code. I used to do that too. You open your editor, spin up a Next.js app, and you’re off.
Feels good — until it doesn’t. Because after a few weeks, you realize your files are everywhere. APIs live in one folder,
components in another, and no one knows what’s going on. That’s when I started doing something different. I call it Vibe Coding."""
# Byte-level "tokens": one entry per UTF-8 byte of the text.
tokens = text.encode("utf-8")

# Single print call; sep="\n" puts every piece on its own line.
print(
    "-"*10,
    f"Original text length = {len(text)}",
    "-"*10,
    f"Tokenized length = {len(tokens)}",
    "-"*10,
    sep="\n",
)

----------
Original text length = 377
----------
Tokenized length = 387
----------


In [4]:
def get_combinations(tokens):
    """Count every pair of adjacent tokens.

    Args:
        tokens: A sliceable sequence of tokens (for example a list of ints or a
            ``bytes`` object).

    Returns:
        A dict mapping each ``(left, right)`` pair to the number of times it
        occurs, ordered from most to least frequent. Pairs with equal counts
        keep the order in which they first appear in ``tokens``.
    """
    pair_counts = Counter(zip(tokens, tokens[1:]))
    # most_common() orders by count, descending, and keeps ties in first-seen order.
    return dict(pair_counts.most_common())

In [5]:
# Count adjacent byte pairs in the sample text, then display the most frequent pair.
counts = get_combinations(tokens)
max(counts, key=lambda pair: counts[pair])

(115, 32)

In [6]:
def merge(tokens, pair, index):
    """Replace each occurrence of ``pair`` in ``tokens`` with ``index``.

    The sequence is scanned left to right and matches do not overlap: once two
    tokens have been merged, scanning resumes after them.

    Args:
        tokens: Indexable sequence of tokens.
        pair: Two-item sequence ``(first, second)`` to look for.
        index: Token that replaces every matched pair.

    Returns:
        A new list; ``tokens`` itself is not modified.
    """
    merged = []
    total = len(tokens)
    cursor = 0
    while cursor < total:
        has_next = cursor + 1 < total
        if has_next and tokens[cursor] == pair[0] and tokens[cursor + 1] == pair[1]:
            # Both halves of the pair are consumed and replaced by one token.
            merged.append(index)
            step = 2
        else:
            merged.append(tokens[cursor])
            step = 1
        cursor += step
    return merged

In [7]:
# Quick check: only the leading (1, 2) is replaced by the new token 6.
merge(tokens=[1, 2, 2, 3, 4, 5], pair=(1, 2), index=6)

[6, 2, 3, 4, 5]

In [14]:
# Load the training corpus. The encoding is stated explicitly so the file is
# read the same way regardless of the platform's default locale encoding.
with open('./shakespeare.txt', 'r', encoding='utf-8') as file:
    text = file.read()
# Start from raw UTF-8 bytes: iterating over bytes yields one int (0-255) per byte.
tokens = list(text.encode("utf-8"))

In [16]:
# Train the byte-pair merges: repeatedly replace the most frequent adjacent pair
# with a brand-new token id.
num_merges = 100
ids = list(tokens)  # work on a copy so the original byte tokens stay untouched

merges = {}  # (left, right) -> new token id, kept in the order the merges were learned
for i in range(num_merges):
    combinations = get_combinations(ids)
    if not combinations:
        # Fewer than two tokens remain, so there is no pair left to merge.
        break
    top_pair = max(combinations, key=combinations.get)
    # New ids start at 256 because 0-255 are already used by the raw byte values.
    idx = 256 + i
    print(f"Merging pair {top_pair} into new token {idx}")
    ids = merge(ids, top_pair, idx)
    merges[top_pair] = idx

Merging pair f(32, 32) into new token 256
Merging pair f(256, 256) into new token 257
Merging pair f(101, 32) into new token 258
Merging pair f(116, 104) into new token 259
Merging pair f(10, 257) into new token 260
Merging pair f(116, 32) into new token 261
Merging pair f(115, 32) into new token 262
Merging pair f(44, 32) into new token 263
Merging pair f(100, 32) into new token 264
Merging pair f(111, 117) into new token 265
Merging pair f(101, 114) into new token 266
Merging pair f(105, 110) into new token 267
Merging pair f(97, 110) into new token 268
Merging pair f(121, 32) into new token 269
Merging pair f(46, 32) into new token 270
Merging pair f(111, 114) into new token 271
Merging pair f(10, 256) into new token 272
Merging pair f(111, 32) into new token 273
Merging pair f(101, 110) into new token 274
Merging pair f(97, 114) into new token 275
Merging pair f(32, 259) into new token 276
Merging pair f(108, 108) into new token 277
Merging pair f(111, 110) into new token 278
Mergi

In [24]:
# Report how much shorter the token sequence became after applying the merges.
print(
    f"Initial tokens length = {len(tokens)}",
    f"New tokens length = {len(ids)}",
    f"Compression ratio = {len(tokens)/len(ids):.2f} X",
    sep="\n",
)

Initial tokens length = 5436475
New tokens length = 3175423
Compression ratio = 1.71 X


In [59]:
# Token id -> byte string. Ids 0-255 are the single bytes themselves.
vocab = {byte_value: bytes((byte_value,)) for byte_value in range(256)}
# Merged tokens are the concatenation of their two parts. Iterating `merges` in
# insertion order means both parts are already in `vocab` when they are needed.
for (p0, p1), idx in merges.items():
    vocab[idx] = b"".join((vocab[p0], vocab[p1]))
    
def decode(input: list[int]) -> str:
    """Turn a sequence of token ids back into text.

    Each id is looked up in the module-level ``vocab`` table; the resulting byte
    strings are concatenated and decoded as UTF-8.

    Args:
        input: Token ids to decode.

    Returns:
        The decoded string. Bytes that do not form valid UTF-8 are replaced
        instead of raising, because of ``errors="replace"``.

    Raises:
        KeyError: If an id is not present in ``vocab``.
    """
    pieces = [vocab[token_id] for token_id in input]
    return b"".join(pieces).decode("utf-8", errors="replace")
    
def encode(input: str) -> list[int]:
    """Convert text into token ids using the learned merges.

    The text is first split into its UTF-8 bytes, then every entry of the
    module-level ``merges`` table is applied in the table's iteration order.

    Args:
        input: Text to tokenize.

    Returns:
        The list of token ids after all merges have been applied.
    """
    ids = list(input.encode("utf-8"))
    for (left, right), new_id in merges.items():
        ids = merge(ids, (left, right), new_id)
    return ids
    

In [None]:
# Round trip: encoding and then decoding should give back the original sentence.
encoded_tokens = encode(input="This is a test")
decoded_tokens = decode(input=encoded_tokens)
print(decoded_tokens)

  This is a test  


In [None]:
# Load the vocabulary JSON. The file handle gets its own name: binding it to
# `vocab` would overwrite the byte-level `vocab` table that decode() reads and
# leave a closed file object in its place.
with open("./gpt2_vocab.json", "r", encoding="utf-8") as vocab_file:
    encoder = json.load(vocab_file)
# Display the entry stored under the end-of-text key.
encoder["<|endoftext|>"]

You can find more information about GPT-2 and the other tokenizer variants in [tiktoken's OpenAI tokenizer definitions](https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py).

In [7]:
# Pre-tokenization split pattern. Its alternatives cover, in order: contraction
# suffixes ('s, 'll, ...), runs of letters with an optional single leading
# non-letter/non-digit character, runs of at most three digits, runs of
# punctuation, and several whitespace cases.
# It uses \p{L}/\p{N} Unicode property classes and possessive quantifiers
# (`++`, `?+`, `*+`), so it is written for the `regex` package that this
# notebook imports as `re`.
# NOTE(review): this looks like the cl100k_base pattern from tiktoken's
# openai_public.py rather than the GPT-2 one — confirm against that file.
tokenizer_regex = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s"""

In [10]:
# Split a sample sentence into chunks with the tokenizer pattern.
re.compile(tokenizer_regex).findall("Hello world, today we are going to learn about OpenAI gpt tokenizers")

['Hello',
 ' world',
 ',',
 ' today',
 ' we',
 ' are',
 ' going',
 ' to',
 ' learn',
 ' about',
 ' OpenAI',
 ' gpt',
 ' tokenizers']