<a href="https://colab.research.google.com/github/Shubbair/GPT4-Tokenizer/blob/main/GPT4Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Use the *Tiny Shakespeare* dataset

In [1]:
# Download the Tiny Shakespeare corpus; wget saves it as input.txt in the working directory.
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-03-05 12:51:50--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-03-05 12:51:51 (20.2 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# Read the whole downloaded corpus into memory as a single UTF-8 string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [40]:
def merge(ids, pair, idx):
    """Return a copy of `ids` with every occurrence of `pair` replaced by `idx`.

    The sequence is scanned left to right and matches do not overlap: once two
    adjacent items have been replaced, scanning resumes after them.

    Args:
        ids: sequence of token ids to scan.
        pair: two-item sequence (left, right) to look for.
        idx: token id written in place of each matched pair.

    Returns:
        A new list; `ids` itself is not modified.
    """
    merged = []
    pos = 0
    last = len(ids) - 1
    while pos <= last:
        # A pair can only start at a position that has a right-hand neighbour.
        is_match = pos < last and ids[pos] == pair[0] and ids[pos + 1] == pair[1]
        if is_match:
            merged.append(idx)
            pos += 2  # consume both members of the pair
            continue
        merged.append(ids[pos])
        pos += 1
    return merged

def get_stats(ids):
    """Count how often each adjacent pair of items occurs in `ids`.

    Args:
        ids: sliceable sequence of token ids.

    Returns:
        Dict mapping (left, right) tuples to their occurrence count. Keys appear
        in order of first occurrence; sequences shorter than two items give {}.
    """
    counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1  # first time this pair is seen
    return counts

class BasicTokenizer:
    """Minimal byte-level byte-pair-encoding (BPE) tokenizer.

    Token ids 0-255 stand for the raw byte values of the UTF-8 encoded text.
    Each merge learned by `train` is assigned the next id, starting at 256.
    """

    def __init__(self):
        # Vocabulary size after training (256 byte tokens + learned merges);
        # stays 0 until train() has been called.
        self.vocabs: int = 0
        # Maps a pair of token ids (left, right) to the id of the merged token,
        # inserted in the order the merges were learned.
        self.merges: dict = {}

    def train(self, text, vocab_size, verbose=False):
        """Learn up to `vocab_size - 256` merges from `text`.

        On every round the most frequent adjacent pair of ids is replaced by a
        new token id. Training stops early when the sequence has no pairs left
        (empty or very short text), so fewer merges than requested may be
        learned. A short summary is always printed at the end.

        Args:
            text: training string; it is encoded to UTF-8 bytes first.
            vocab_size: requested vocabulary size, must be greater than 256.
            verbose: when true, print each merge as it is learned.

        Raises:
            AssertionError: if `vocab_size` is not greater than 256.
        """
        assert vocab_size > 256
        num_merges = vocab_size - 256
        tokens = list(text.encode("utf-8"))
        ids = list(tokens)  # working copy; `tokens` is kept for the summary
        merges = {}
        for i in range(num_merges):
            stats = get_stats(ids)
            if not stats:
                # Fewer than two ids remain: nothing left to merge, and
                # max() would raise ValueError on the empty dict.
                break
            pair = max(stats, key=stats.get)
            idx = 256 + i
            if verbose:
                print(f"merging {pair} into a new token {idx}")

            ids = merge(ids, pair, idx)
            merges[pair] = idx

        self.merges = merges
        # Equals vocab_size unless training stopped early.
        self.vocabs = 256 + len(merges)

        # Empty text leaves no ids; report a neutral ratio instead of dividing by zero.
        ratio = len(tokens) / len(ids) if ids else 1.0
        print("tokens length:", len(tokens))
        print("ids length:", len(ids))
        print(f"compression ratio: {ratio:.2f}X")

    def encode(self, text):
        """Encode `text` into a list of token ids using the learned merges.

        Merges are applied in the order they were learned: on every pass the
        pair present in the sequence with the lowest merge id is replaced.

        Args:
            text: string to encode; may be empty.

        Returns:
            List of token ids (raw byte values when no merge applies).
        """
        tokens = list(text.encode("utf-8"))
        # With fewer than two tokens there are no pairs, and min() over the
        # empty stats dict would raise ValueError.
        while len(tokens) >= 2:
            stats = get_stats(tokens)
            # Pairs without a learned merge rank as +inf so they are never preferred.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # nothing else can be merged
            tokens = merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Decode a sequence of token ids back into a string.

        Args:
            ids: iterable of token ids produced by `encode`.

        Returns:
            The decoded text. Bytes that do not form valid UTF-8 are decoded
            with the "replace" error handler instead of raising.

        Raises:
            KeyError: if an id is neither a byte value nor a learned merge.
        """
        vocab = {idx: bytes([idx]) for idx in range(256)}
        # Relies on self.merges being in learning order, so both halves of a
        # merge are already in `vocab` when the merged token is built.
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        tokens = b"".join(vocab[idx] for idx in ids)  # concatenate the byte chunks
        text = tokens.decode("utf-8", errors="replace")
        return text

In [41]:
# Small worked example: learn three merges on top of the 256 raw byte tokens.
# Note: this rebinds `text`, replacing the corpus read from input.txt earlier.
text = "aaabdaaabac"
tokenizer = BasicTokenizer()
tokenizer.train(text, vocab_size=256 + 3)

tokens length: 11
ids length: 5
compression ratio: 2.20X


In [42]:
# Encode the same string the tokenizer was trained on.
ids = tokenizer.encode(text)

In [43]:
# Each "aaab" run collapses to token 258; "d", "a" and "c" stay raw bytes 100, 97, 99.
assert ids == [258, 100, 258, 97, 99]

In [44]:
# Round-trip check: decoding the encoded ids must reproduce the original string.
assert tokenizer.decode(tokenizer.encode(text)) == text