# ByteTok: _A simple tokenizer_


In [42]:
# Multilingual sample corpus: ASCII mixed with Latin diacritics, Greek, Cyrillic,
# Arabic, Devanagari, CJK, Hangul and math symbols, so its UTF-8 encoding
# contains 1-, 2- and 3-byte sequences.
# NOTE: the literal must be stored as real Unicode text (not re-decoded bytes);
# otherwise the character count and byte dump shown below no longer match.
text = """
    Beneath the aurora, naïve philosophers sip café au lait while debating
    über ideas of entropy and fate. Ελληνικές λέξεις mingle with Русские фразы, and
    العربية أحرف flow beside हिन्दी अक्षर. 山川 echo in kanji, while
    한글 rhythms pulse softly. Accents—é, ñ, ü, å—ornament thought.
    Mathematical whispers ∞, ∑, and √ hover near poetry.
    A résumé rests on a table carved with runes, coöperation etched in stone.
    Symbols like λ, Ω, and π converse quietly. Even the wind hums in IPA tones.
    Meaning survives borders, scripts, and centuries, resilient, diverse,
    and profoundly human in form.
"""

# len() on a str counts Unicode code points, not encoded bytes.
print(f"{len(text)=}")

len(text)=616


## UTF encoding

A Unicode code point is a single numeric value in the Unicode range (e.g., U+0041 for A, U+1F600 for 😀).

The byte sequence is an encoding of that code point (e.g., UTF-8 uses 1–4 bytes to encode a single code point).

Watch [this video](https://youtu.be/vpSkBV5vydg) for an explanation of UTF-8.


### Text as a sequence of raw bytes


In [43]:
# Encode the sample string as UTF-8 to obtain its underlying byte sequence.
tokens = bytes(text, encoding="utf-8")

# Report the bytes, how many there are, and their type (blank line between each).
print(f"{tokens=}", f"\n{len(tokens)=}", f"\n{type(tokens)=}", sep="\n")

tokens=b'\n    Beneath the aurora, na\xc3\xafve philosophers sip caf\xc3\xa9 au lait while debating\n    \xc3\xbcber ideas of entropy and fate. \xce\x95\xce\xbb\xce\xbb\xce\xb7\xce\xbd\xce\xb9\xce\xba\xce\xad\xcf\x82 \xce\xbb\xce\xad\xce\xbe\xce\xb5\xce\xb9\xcf\x82 mingle with \xd0\xa0\xd1\x83\xd1\x81\xd1\x81\xd0\xba\xd0\xb8\xd0\xb5 \xd1\x84\xd1\x80\xd0\xb0\xd0\xb7\xd1\x8b, and\n    \xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a\xd8\xa9 \xd8\xa3\xd8\xad\xd8\xb1\xd9\x81 flow beside \xe0\xa4\xb9\xe0\xa4\xbf\xe0\xa4\xa8\xe0\xa5\x8d\xe0\xa4\xa6\xe0\xa5\x80 \xe0\xa4\x85\xe0\xa4\x95\xe0\xa5\x8d\xe0\xa4\xb7\xe0\xa4\xb0. \xe5\xb1\xb1\xe5\xb7\x9d echo in kanji, while\n    \xed\x95\x9c\xea\xb8\x80 rhythms pulse softly. Accents\xe2\x80\x94\xc3\xa9, \xc3\xb1, \xc3\xbc, \xc3\xa5\xe2\x80\x94ornament thought.\n    Mathematical whispers \xe2\x88\x9e, \xe2\x88\x91, and \xe2\x88\x9a hover near poetry.\n    A r\xc3\xa9sum\xc3\xa9 rests on a table carved with runes, co\xc3\xb6peration etched in stone.\n

### Convert each byte to integer


In [44]:
# Unpack the byte string into a list holding one integer per byte.
tokens = [*tokens]

# Report the integers, how many there are, and their type (blank line between each).
print(f"{tokens=}", f"\n{len(tokens)=}", f"\n{type(tokens)=}", sep="\n")

tokens=[10, 32, 32, 32, 32, 66, 101, 110, 101, 97, 116, 104, 32, 116, 104, 101, 32, 97, 117, 114, 111, 114, 97, 44, 32, 110, 97, 195, 175, 118, 101, 32, 112, 104, 105, 108, 111, 115, 111, 112, 104, 101, 114, 115, 32, 115, 105, 112, 32, 99, 97, 102, 195, 169, 32, 97, 117, 32, 108, 97, 105, 116, 32, 119, 104, 105, 108, 101, 32, 100, 101, 98, 97, 116, 105, 110, 103, 10, 32, 32, 32, 32, 195, 188, 98, 101, 114, 32, 105, 100, 101, 97, 115, 32, 111, 102, 32, 101, 110, 116, 114, 111, 112, 121, 32, 97, 110, 100, 32, 102, 97, 116, 101, 46, 32, 206, 149, 206, 187, 206, 187, 206, 183, 206, 189, 206, 185, 206, 186, 206, 173, 207, 130, 32, 206, 187, 206, 173, 206, 190, 206, 181, 206, 185, 207, 130, 32, 109, 105, 110, 103, 108, 101, 32, 119, 105, 116, 104, 32, 208, 160, 209, 131, 209, 129, 209, 129, 208, 186, 208, 184, 208, 181, 32, 209, 132, 209, 128, 208, 176, 208, 183, 209, 139, 44, 32, 97, 110, 100, 10, 32, 32, 32, 32, 216, 167, 217, 132, 216, 185, 216, 177, 216, 168, 217, 138, 216, 169, 32, 216,

## Byte Pair Encoding


### Get frequency of all byte pairs


In [None]:
type BpFreqStore = list[tuple[tuple[int, int], int]]


def get_bps(toks: list[int]) -> BpFreqStore:
    pairs = []
    ranks = {}

    for a, b in zip(toks, toks[1:]):
        pairs.append((a, b))

    for pair in pairs:
        ranks[pair] = ranks.get(pair, 0) + 1

    # frequency of pairs
    return sorted(ranks.items(), key=lambda x: x[1], reverse=True)


In [46]:
# Rank every adjacent pair in the token stream, most frequent first,
# and display the resulting table as the cell output.
pairs = get_bps(toks=tokens)
pairs

[((32, 32), 27),
 ((44, 32), 15),
 ((101, 32), 10),
 ((10, 32), 9),
 ((32, 97), 9),
 ((97, 110), 9),
 ((116, 104), 8),
 ((101, 114), 8),
 ((115, 32), 8),
 ((105, 110), 8),
 ((110, 100), 8),
 ((100, 32), 8),
 ((224, 164), 8),
 ((110, 32), 8),
 ((101, 110), 7),
 ((118, 101), 7),
 ((101, 115), 7),
 ((97, 116), 6),
 ((32, 119), 6),
 ((110, 101), 5),
 ((32, 116), 5),
 ((104, 101), 5),
 ((114, 115), 5),
 ((32, 115), 5),
 ((32, 99), 5),
 ((32, 105), 5),
 ((110, 116), 5),
 ((32, 114), 5),
 ((46, 10), 5),
 ((111, 110), 5),
 ((101, 97), 4),
 ((111, 114), 4),
 ((32, 112), 4),
 ((104, 105), 4),
 ((105, 108), 4),
 ((195, 169), 4),
 ((108, 101), 4),
 ((100, 101), 4),
 ((32, 195), 4),
 ((46, 32), 4),
 ((32, 206), 4),
 ((206, 187), 4),
 ((115, 44), 4),
 ((104, 32), 3),
 ((117, 114), 3),
 ((114, 111), 3),
 ((115, 105), 3),
 ((99, 97), 3),
 ((169, 32), 3),
 ((105, 116), 3),
 ((119, 104), 3),
 ((116, 105), 3),
 ((110, 103), 3),
 ((114, 32), 3),
 ((111, 102), 3),
 ((32, 101), 3),
 ((32, 102), 3),
 ((187, 