In [1]:
from collections import Counter, deque
import os, json
from vbtokenizer import SimpleTokenizer
import regex as re

In [2]:
# Map each sample character (Latin a-z plus a few Greek letters) to its Unicode code point.
str_to_uni = {letter: ord(letter) for letter in "abcdefghijklmnopqrstuvwxyzαβγδεζηθ"}

In [3]:
# Sample paragraph; it contains a few non-ASCII characters (curly apostrophes, an em dash),
# so its UTF-8 byte length is larger than its character count.
text = """Most devs love jumping straight into code. I used to do that too. You open your editor, spin up a Next.js app, and you’re off.
Feels good — until it doesn’t. Because after a few weeks, you realize your files are everywhere. APIs live in one folder,
components in another, and no one knows what’s going on. That’s when I started doing something different. I call it Vibe Coding."""
# Raw UTF-8 bytes: the starting "tokens" before any merging.
tokens = text.encode("utf-8")

separator = "-"*10
print(separator)
print("Original text length =",len(text))
print(separator)
print("Tokenized length =",len(tokens))
print(separator)

----------
Original text length = 377
----------
Tokenized length = 387
----------


In [4]:
def get_combinations(tokens):
    """Count how often each adjacent pair of tokens occurs.

    Args:
        tokens: Sliceable sequence of token ids (for example a list of ints or bytes).

    Returns:
        A dict mapping ``(left, right)`` pairs to their occurrence count, ordered
        from most to least frequent. Pairs with equal counts keep the order in
        which they were first seen.
    """
    pair_counts = Counter(zip(tokens, tokens[1:]))
    # most_common() sorts by count, descending, with a stable sort, so ties stay
    # in first-seen order.
    return dict(pair_counts.most_common())

In [5]:
# Count adjacent byte pairs in the sample text and display the most frequent pair.
counts = get_combinations(tokens)
max(counts.items(), key=lambda item: item[1])[0]

(115, 32)

In [6]:
def merge(tokens, pair, index):
    """Replace each non-overlapping occurrence of `pair` in `tokens` with `index`.

    The scan runs left to right, so for a run such as ``[a, a, a]`` with pair
    ``(a, a)`` the first two elements are merged and the third is kept.

    Args:
        tokens: Indexable sequence of token ids (a list of ints or bytes).
        pair: Two-element sequence holding the adjacent ids to look for.
        index: Id written in place of every matched pair.

    Returns:
        A new list; `tokens` itself is not modified.
    """
    merged = []
    last = len(tokens) - 1
    pos = 0
    while pos <= last:
        # A match needs a right-hand neighbour, so the final element can never start one.
        is_match = pos < last and tokens[pos] == pair[0] and tokens[pos + 1] == pair[1]
        merged.append(index if is_match else tokens[pos])
        pos += 2 if is_match else 1
    return merged

In [7]:
# Sanity check: only the leading (1, 2) is replaced by 6; the second 2 is not preceded by a 1.
merge([1,2,2,3,4,5], (1,2), 6)

[6, 2, 3, 4, 5]

In [8]:
# Load the training corpus. The encoding is given explicitly so the text is decoded the
# same way on every platform; without it open() falls back to the locale default, which
# may not be UTF-8 even though the text is re-encoded as UTF-8 right below.
with open('./shakespeare.txt', 'r', encoding="utf-8") as file:
    shakespeare_text = file.read()
# Iterating over bytes already yields ints in 0..255, so list() gives the initial token ids.
tokens = list(shakespeare_text.encode("utf-8"))

In [9]:
# Byte-pair-encoding training loop: repeatedly replace the most frequent adjacent pair
# with a fresh token id.
num_merges = 2
ids = list(tokens)  # work on a copy so the original byte list stays available for comparison

merges = {}  # (left_id, right_id) -> new token id, in the order the merges were learned
for i in range(num_merges):
    combinations = get_combinations(ids)
    if not combinations:
        # Fewer than two tokens left: nothing to merge, and max() would raise on an empty dict.
        break
    top_pair = max(combinations, key=combinations.get)
    idx = 256 + i  # ids 0..255 are the raw byte values, so new tokens start at 256
    print(f"Merging pair {top_pair} into new token {idx}")
    ids = merge(ids,top_pair,idx)
    merges[top_pair] = idx

Merging pair f(32, 32) into new token 256
Merging pair f(256, 256) into new token 257


In [10]:
# Report how much the learned merges shortened the token sequence.
report_lines = (
    f"Initial tokens length = {len(tokens)}",
    f"New tokens length = {len(ids)}",
    f"Compression ratio = {len(tokens)/len(ids):.2f} X",
)
print(*report_lines, sep="\n")

Initial tokens length = 5436475
New tokens length = 5085353
Compression ratio = 1.07 X


In [11]:
# Base vocabulary: every single byte value maps to its one-byte string.
vocab = dict((byte_value, bytes([byte_value])) for byte_value in range(256))
# Each merged token is the concatenation of its two parts. This walks `merges` in
# insertion order, so both parts are already in `vocab` when a merge is reached.
for pair, idx in merges.items():
    left, right = pair
    vocab[idx] = vocab[left] + vocab[right]
    
def decode(input: list[int]) -> str:
    """Turn a list of token ids back into text.

    Each id is looked up in the module-level `vocab` table, the resulting byte
    strings are concatenated, and the bytes are decoded as UTF-8. Byte
    sequences that are not valid UTF-8 are replaced rather than raising.
    """
    chunks = [vocab[token_id] for token_id in input]
    return b"".join(chunks).decode("utf-8", errors="replace")
    
def encode(input: str) -> list[int]:
    """Turn text into token ids using the learned `merges`.

    The text is first split into its UTF-8 byte values, then every learned
    merge is applied in the order it is stored in the module-level `merges` dict.
    """
    ids = list(input.encode("utf-8"))
    for (left, right), merged_id in merges.items():
        ids = merge(ids, (left, right), merged_id)
    return ids
    

In [12]:
# Round-trip check: encoding and then decoding should give back the original sentence.
sample_sentence = "This is a test"
encoded_tokens = encode(sample_sentence)
decoded_tokens = decode(encoded_tokens)
print(decoded_tokens)

This is a test


In [13]:
# Load the GPT-2 merge rules. The encoding is explicit because the file is not guaranteed
# to be pure ASCII and open() otherwise uses the platform's locale default.
with open("./gpt2_merges.txt", "r", encoding="utf-8") as bpe:
    # The first line is skipped, as in the original slice (presumably a header — confirm).
    bpe_lines = bpe.read().split("\n")[1:]
# Drop blank lines instead of blindly cutting the last element, so the final merge rule
# is kept even when the file does not end with a newline.
bpe_data = [line for line in bpe_lines if line.strip()]
# One tuple of whitespace-separated symbols per rule. The loop variable is named `line`
# so it does not visually shadow the merge() function defined earlier.
bpe_merges = [tuple(line.split()) for line in bpe_data]

In [14]:
# Load the GPT-2 vocabulary (token string -> id).
# The file handle is deliberately NOT named `vocab`: that name is the module-level
# id -> bytes table used by decode(), and rebinding it here would leave decode()
# indexing into a closed file object.
with open("./gpt2_vocab.json", "r", encoding="utf-8") as vocab_file:
    encoder = json.load(vocab_file)
# Display the id of the end-of-text special token.
encoder["<|endoftext|>"]

50256

You can find more information about GPT-2 and the other tokenizer variants in [Tiktoken OpenAI Tokenizers](https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py)

In [15]:
# Pre-tokenization split pattern, used with the third-party `regex` module (imported as
# `re` at the top of the notebook); the stdlib `re` does not support \p{...} classes.
# Alternatives, tried in order:
#   1. an apostrophe followed by s, d, m, t, ll, ve or re (case-insensitive)
#   2. a run of letters, optionally preceded by one character that is not CR, LF,
#      a letter or a digit
#   3. one to three digits
#   4. an optional space, a run of characters that are neither whitespace, letters
#      nor digits, then any trailing CR/LF characters
#   5. a whitespace run anchored by `$`
#   6. optional whitespace followed by a CR or LF
#   7. a whitespace run that is not followed by a non-whitespace character
#   8. a single whitespace character
# NOTE(review): the surrounding text refers to GPT-2; confirm which published tokenizer
# pattern this is meant to reproduce.
tokenizer_regex = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s++$|\s*[\r\n]|\s+(?!\S)|\s"""

In [16]:
# Show how the pattern splits a sentence: words keep their leading space, punctuation is separate.
re.findall(tokenizer_regex, "Hello world, today we are going to learn about OpenAI gpt tokenizers")

['Hello',
 ' world',
 ',',
 ' today',
 ' we',
 ' are',
 ' going',
 ' to',
 ' learn',
 ' about',
 ' OpenAI',
 ' gpt',
 ' tokenizers']

## Base Tokenizer

Do the same as before, but more cleanly, using the tokenizer class we built (imported above as `SimpleTokenizer`).

In [17]:
# Create an untrained tokenizer from the project-local vbtokenizer package.
tokenizer = SimpleTokenizer()

In [18]:
# Train on the full Shakespeare text.
# NOTE(review): 260 is presumably the target vocabulary size (256 byte values + 4 merges,
# which would match the 4-step progress bar) — confirm against SimpleTokenizer.train.
tokenizer.train(shakespeare_text, 260)

Merging byte pairs: 100%|██████████| 4/4 [00:09<00:00,  2.27s/it]


In [19]:
# Round-trip a sentence through the trained tokenizer: encode to ids, then decode back.
# `initial_sentence` and `tokenized_input` are reused further down to check the reloaded tokenizer.
initial_sentence = "Hello there Vasilis"
print(f"Provided sentence: {initial_sentence}")
tokenized_input = tokenizer.encode(initial_sentence)
print("Tokenized sentence =",tokenized_input)
decoded_input = tokenizer.decode(tokenized_input)
print("Decoded sentence =",decoded_input)

Provided sentence: Hello there Vasilis
Tokenized sentence = [72, 101, 108, 108, 111, 32, 259, 101, 114, 258, 86, 97, 115, 105, 108, 105, 115]
Decoded sentence = Hello there Vasilis


In [None]:
# Register four special tokens on the trained tokenizer, one call per token.
for special_token in ["<vm_start>", "<vm_user>","<vm_assistant>","<vm_end>"]:
    tokenizer.add_special_token(special_token)

In [21]:
# Adding a token that is already registered: the tokenizer reports that it skips it.
tokenizer.add_special_token("<vm_start>")

[38;5;1m✘ Skipping: The `<vm_start>` special token already exists[0m


In [22]:
# Persist the trained tokenizer under the name "my_first_tokenizer" so it can be reloaded below.
# NOTE(review): the on-disk format and location are decided by SimpleTokenizer.save — not visible here.
tokenizer.save("my_first_tokenizer")

In [23]:
# Create a second, fresh tokenizer and restore the saved state into it.
tokenizer2 = SimpleTokenizer()
tokenizer2.load("my_first_tokenizer")

In [24]:
# Merge table of the originally trained tokenizer, to compare with the reloaded one.
tokenizer.merges

{(32, 32): 256, (256, 256): 257, (101, 32): 258, (116, 104): 259}

In [25]:
# Merge table of the reloaded tokenizer; it should be identical to the one shown for `tokenizer`.
tokenizer2.merges

{(32, 32): 256, (256, 256): 257, (101, 32): 258, (116, 104): 259}

In [26]:
# Check that the reloaded tokenizer behaves like the original: encode the same sentence,
# compare the ids with those produced earlier by `tokenizer`, then decode them back.
tokenized_input_2 = tokenizer2.encode(initial_sentence)
print("Tokenized sentence with tokenizer_2 =",tokenized_input_2)
print("Tokenized list match with saved tokenizer", tokenized_input_2 == tokenized_input)
decoded_input_2 = tokenizer2.decode(tokenized_input_2)
print("Decoded sentence with tokenizer_2 =",decoded_input_2)

Tokenized sentence with tokenizer_2 = [72, 101, 108, 108, 111, 32, 259, 101, 114, 258, 86, 97, 115, 105, 108, 105, 115]
Tokenized list match with saved tokenizer True
Decoded sentence with tokenizer_2 = Hello there Vasilis


In [28]:
# Round-trip a sentence containing the special tokens through the reloaded tokenizer.
# NOTE(review): this rebinds `tokens`, which earlier cells use for the Shakespeare byte list
# (read by `ids = list(tokens)` and by the compression report); re-running those cells after
# this one without reloading the corpus would operate on this short id list instead.
tokens = tokenizer2.encode("<vm_start> <vm_user>Does the special tokens work? <vm_assistant>Yes they do work! <vm_end>")
print(f"Encoded decoded sentence result = {tokenizer2.decode(tokens)}")

Encoded decoded sentence result = <vm_start> <vm_user>Does the special tokens work? <vm_assistant>Yes they do work! <vm_end>
