# Building Tokenizer

In [15]:
# len() on a str counts Unicode code points, not encoded bytes
test_str = "मराठीमध्ये टाइप करा 🫣 Tejas Kalsait"
print(len(test_str))

36


In [30]:
# Number of bytes the string occupies once encoded as UTF-8
len(test_str.encode("utf-8"))

73

In [29]:
# "म" is a single character but occupies three bytes in UTF-8
print("म".encode("utf-8"))
# Code point 0 is the NUL character
chr(0)

b'\xe0\xa4\xae'


'\x00'

## Byte Pair Encoding (BPE) Algorithm

In [48]:
# Input string: a multilingual sample mixing several scripts and an emoji
test_str = "The quick brown 🦊 (fox) jumps over 13 lazy 犬 (dogs). In a serene village by the Ἀγορά (Agora), Мария (Maria) and अर्जुन (Arjun) discuss the intricacies of Πυθαγόρειο Θεώρημα (Pythagorean Theorem). Meanwhile, under a 蓝色 (blue) sky, 孫悟空 (Sun Wukong) embarks on his journey towards the west, reciting poems in فارسی (Farsi) and reflecting on the Zen principle of 無 (Mu)."
# Length in Unicode code points (not bytes)
len(test_str)

367

In [58]:
# UTF-8 encode the text and turn the bytes into a list of integers.
# Every byte is a value in 0..255, so the base vocabulary has 256 entries.
# (The original assigned the list to a misspelled name, leaving `tokens` as bytes.)
tokens = list(test_str.encode("utf-8"))
len(tokens)

429

In [62]:
def count_token_pairs(unicode):
    """Build a histogram of adjacent token pairs.

    Args:
        unicode: a sliceable sequence of tokens (e.g. a list of ints or bytes).

    Returns:
        dict mapping ``(token, next_token)`` -> number of times that adjacent
        pair occurs. Keys appear in order of first occurrence.
    """
    histogram = {}
    # Pair each element with its successor; zip stops at the shorter sequence.
    for pair in zip(unicode, unicode[1:]):
        if pair in histogram:
            histogram[pair] += 1
        else:
            histogram[pair] = 1
    return histogram

# Histogram of adjacent pairs in the current `tokens` sequence
token_pairs_dict = count_token_pairs(tokens)
token_pairs_dict

{(84, 104): 2,
 (104, 101): 6,
 (101, 32): 8,
 (32, 113): 1,
 (113, 117): 1,
 (117, 105): 1,
 (105, 99): 2,
 (99, 107): 1,
 (107, 32): 1,
 (32, 98): 2,
 (98, 114): 1,
 (114, 111): 1,
 (111, 119): 2,
 (119, 110): 1,
 (110, 32): 8,
 (32, 240): 1,
 (240, 159): 1,
 (159, 166): 1,
 (166, 138): 1,
 (138, 32): 1,
 (32, 40): 10,
 (40, 102): 1,
 (102, 111): 1,
 (111, 120): 1,
 (120, 41): 1,
 (41, 32): 6,
 (32, 106): 2,
 (106, 117): 2,
 (117, 109): 1,
 (109, 112): 1,
 (112, 115): 1,
 (115, 32): 7,
 (32, 111): 5,
 (111, 118): 1,
 (118, 101): 1,
 (101, 114): 3,
 (114, 32): 2,
 (32, 49): 1,
 (49, 51): 1,
 (51, 32): 1,
 (32, 108): 1,
 (108, 97): 2,
 (97, 122): 1,
 (122, 121): 1,
 (121, 32): 3,
 (32, 231): 2,
 (231, 138): 1,
 (138, 172): 1,
 (172, 32): 2,
 (40, 100): 1,
 (100, 111): 1,
 (111, 103): 1,
 (103, 115): 1,
 (115, 41): 1,
 (41, 46): 3,
 (46, 32): 2,
 (32, 73): 1,
 (73, 110): 1,
 (32, 97): 4,
 (97, 32): 2,
 (32, 115): 2,
 (115, 101): 1,
 (114, 101): 5,
 (101, 110): 2,
 (110, 101): 2,
 (32, 1

In [65]:
# Most frequent adjacent pair; on ties max() keeps the first key it encounters
top_pair = max(token_pairs_dict, key = token_pairs_dict.get)
top_pair

(32, 40)

In [69]:
def merge(tokens: list, pair: tuple, id: int) -> list:
    """Replace every occurrence of ``pair`` in ``tokens`` with ``id``.

    The scan runs left to right and matches do not overlap: once a pair is
    merged, scanning resumes after its second element.

    Args:
        tokens: sequence of tokens to scan; it is not modified.
        pair: the two adjacent tokens ``(first, second)`` to look for.
        id: the token that replaces each matched pair. (The name shadows the
            builtin inside this function; it is kept for caller compatibility.)

    Returns:
        A new list with each matched pair collapsed into ``id``.
    """
    merged_list = []
    last_index = len(tokens) - 1
    i = 0
    while i < len(tokens):
        # Check the bound first so a trailing pair[0] never reads tokens[i + 1]
        if i < last_index and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
            merged_list.append(id)
            i += 2
        else:
            merged_list.append(tokens[i])
            i += 1

    return merged_list

# Toy check: both (3, 3) occurrences collapse into -1 -> [1, 2, -1, 5, -1, 4]
toy = merge([1, 2, 3, 3, 5, 3, 3, 4], (3, 3), -1)
#print(toy)

In [78]:
# Making tokenizer

# Input string, UTF-8 encoded and converted to a list of byte values (0..255)
test_str = "The quick brown 🦊 (fox) jumps over 13 lazy 犬 (dogs). In a serene village by the Ἀγορά (Agora), Мария (Maria) and अर्जुन (Arjun) discuss the intricacies of Πυθαγόρειο Θεώρημα (Pythagorean Theorem). Meanwhile, under a 蓝色 (blue) sky, 孫悟空 (Sun Wukong) embarks on his journey towards the west, reciting poems in فارسی (Farsi) and reflecting on the Zen principle of 無 (Mu)."
tokens = list(test_str.encode("utf-8"))

# Target vocabulary size: the 256 raw byte values plus the learned merges
vocab_size = 266
num_merges = vocab_size - 256

# Maps each merged pair (int, int) -> new token id, in the order the merges are learned
merges = {}

for i in range(num_merges):
    tokens_pairs = count_token_pairs(tokens)
    if not tokens_pairs:
        # Fewer than two tokens remain, so there is nothing left to merge
        break
    top_pair = max(tokens_pairs, key=tokens_pairs.get)
    # Named new_id so the builtin id() is not shadowed at module level
    new_id = 256 + i
    print(f"merging {top_pair} into a new token {new_id}")
    tokens = merge(tokens, top_pair, new_id)
    merges[top_pair] = new_id

merging (32, 40) into a new token 256
merging (101, 32) into a new token 257
merging (110, 32) into a new token 258
merging (115, 32) into a new token 259
merging (41, 32) into a new token 260
merging (104, 257) into a new token 261
merging (114, 101) into a new token 262
merging (116, 261) into a new token 263
merging (44, 32) into a new token 264
merging (97, 114) into a new token 265


In [79]:
# Compression ratio: raw UTF-8 byte count divided by the token count after merging.
# The byte count is computed rather than hard-coded so it stays correct if the text changes.
len(test_str.encode("utf-8")) / len(tokens)

1.1657608695652173

In [90]:
# Token id -> byte string. Ids 0..255 map to the corresponding single raw byte.
vocab = {byte_value: bytes((byte_value,)) for byte_value in range(256)}
# Each merged token is the concatenation of its two parts. This relies on
# `merges` being iterated in insertion order, so parts are defined before use.
for pair, idx in merges.items():
    left, right = pair
    vocab[idx] = vocab[left] + vocab[right]

def encode(text):
    """Encode a string into a list of integer tokens.

    The text is UTF-8 encoded into byte values, then pairs found in the
    module-level ``merges`` table are merged repeatedly, always picking the
    pair with the lowest merge id, until no pair in the sequence is mergeable.
    """
    ids = list(text.encode("utf-8"))
    while len(ids) >= 2:
        pair_counts = count_token_pairs(ids)
        # Pairs absent from `merges` rank as infinity, so they are chosen only
        # when no mergeable pair is left.
        candidate = min(pair_counts, key=lambda p: merges.get(p, float("inf")))
        if candidate not in merges:
            break  # nothing else can be merged
        ids = merge(ids, candidate, merges[candidate])
    return ids

def decode(token_list):
    """Decode an iterable of token ids back into text.

    Each id is looked up in the module-level ``vocab`` (an unknown id raises
    ``KeyError``). The byte strings are concatenated and decoded as UTF-8,
    with the ``'replace'`` error handler substituting for undecodable bytes.
    """
    raw_bytes = b"".join(vocab[idx] for idx in token_list)
    return raw_bytes.decode("utf-8", errors='replace')


In [95]:
# Round-trip check: decoding the encoded text should print the original string
print(decode(encode("13 lazy 犬 (dogs)")))

13 lazy 犬 (dogs)


# REGEX