In [10]:
class BasicTokenizer():
  """Minimal byte-level byte-pair-encoding (BPE) tokenizer.

  Text is turned into its UTF-8 bytes (token ids 0..255). Training repeatedly
  replaces the most frequent adjacent pair of ids with a new id, starting
  at 256.

  Attributes:
    merges: dict mapping a pair ``(left_id, right_id)`` to the id that
      replaces it, in the order the merges were learned.
    vocab: dict mapping each known id to the bytes it stands for; refreshed
      by ``train``.
  """

  def __init__(self):
    self.merges = {}
    self.vocab = {}

  def get_stat(self, tokens):
    """Count the occurrences of every adjacent pair in ``tokens``.

    Args:
      tokens: list of token ids.

    Returns:
      dict mapping ``(left_id, right_id)`` to its count; empty when
      ``tokens`` has fewer than two elements.
    """
    stat = {}  # (pair): number
    for pair in zip(tokens, tokens[1:]):
      stat[pair] = stat.get(pair, 0) + 1
    return stat

  def merge(self, tokens, pair, idx):
    """Replace every non-overlapping occurrence of ``pair`` with ``idx``.

    Args:
      tokens: list of token ids to scan, left to right.
      pair: the ``(left_id, right_id)`` pair to replace.
      idx: the id written in place of each occurrence.

    Returns:
      A new list; ``tokens`` itself is not modified.
    """
    new_ids = []
    last = len(tokens) - 1
    i = 0
    while i <= last:
      # The bounds check comes first so tokens[i + 1] is never read past the
      # end when the final token happens to equal pair[0].
      if i < last and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
        new_ids.append(idx)
        i += 2
      else:
        new_ids.append(tokens[i])
        i += 1
    return new_ids

  def train(self, text, vocab_size, verbose=False):
    """Learn up to ``vocab_size - 256`` merges from ``text``.

    Any previously learned merges are discarded.

    Args:
      text: training string.
      vocab_size: target vocabulary size; values of 256 or less learn
        no merges.
      verbose: when true, print each merge as it is learned.

    Returns:
      The learned ``merges`` dict (also stored on the instance).
    """
    tokens = list(text.encode("utf-8"))
    self.merges = {}

    num_merge = vocab_size - 256

    for i in range(num_merge):
      stat = self.get_stat(tokens)
      if not stat:
        # Fewer than two tokens remain, so there is nothing left to merge.
        break
      pair = max(stat, key=stat.get)
      idx = i + 256
      tokens = self.merge(tokens, pair, idx)
      self.merges[pair] = idx

      if verbose:
        print(f"merge {i+1}/{num_merge}: {pair} -> {idx}")

    self.vocab = self._build_vocab()
    return self.merges

  def encode(self, text):
    """Encode ``text`` into token ids using the learned merges.

    Args:
      text: string to encode.

    Returns:
      list of token ids; the raw UTF-8 byte values when no merge applies.
    """
    tokens = list(text.encode("utf-8"))

    while len(tokens) >= 2:
      stat = self.get_stat(tokens)
      # Apply merges in the order they were learned: the pair with the
      # smallest merged id goes first; unknown pairs rank last.
      pair = min(stat, key=lambda p: self.merges.get(p, float('inf')))
      if pair not in self.merges:
        break
      tokens = self.merge(tokens, pair, self.merges[pair])

    return tokens

  def decode(self, ids):
    """Decode token ids back into a string.

    Args:
      ids: iterable of token ids.

    Returns:
      The decoded text; byte sequences that are not valid UTF-8 are
      replaced with U+FFFD.

    Raises:
      KeyError: if an id is neither a byte value nor a learned merge.
    """
    vocab = self._build_vocab()
    tokens = b"".join(vocab[idx] for idx in ids)
    text = tokens.decode("utf-8", errors='replace')
    return text

  def _build_vocab(self):
    """Return the id -> bytes table implied by the current ``merges``."""
    vocab = {idx: bytes([idx]) for idx in range(256)}
    # merges keeps insertion order, so both halves of a pair are already in
    # the table by the time the pair itself is added.
    for (p0, p1), idx in self.merges.items():
      vocab[idx] = vocab[p0] + vocab[p1]
    return vocab

In [11]:
# Read the training corpus; the context manager closes the file handle
# as soon as the text has been read.
with open("../tests/taylorswift.txt", "r", encoding="utf-8") as corpus_file:
  text = corpus_file.read()

baT = BasicTokenizer()

In [12]:
# Learn 512 - 256 = 256 merges from the corpus, printing each one as it is found.
baT.train(text, vocab_size=512, verbose=True)


merge 1/256: (101, 32) -> 256
merge 2/256: (44, 32) -> 257
merge 3/256: (100, 32) -> 258
merge 4/256: (46, 32) -> 259
merge 5/256: (114, 32) -> 260
merge 6/256: (50, 48) -> 261
merge 7/256: (115, 32) -> 262
merge 8/256: (105, 110) -> 263
merge 9/256: (111, 110) -> 264
merge 10/256: (114, 105) -> 265
merge 11/256: (116, 32) -> 266
merge 12/256: (116, 104) -> 267
merge 13/256: (101, 258) -> 268
merge 14/256: (257, 261) -> 269
merge 15/256: (97, 110) -> 270
merge 16/256: (97, 114) -> 271
merge 17/256: (101, 260) -> 272
merge 18/256: (121, 32) -> 273
merge 19/256: (97, 108) -> 274
merge 20/256: (267, 256) -> 275
merge 21/256: (118, 268) -> 276
merge 22/256: (119, 105) -> 277
merge 23/256: (101, 114) -> 278
merge 24/256: (264, 32) -> 279
merge 25/256: (277, 102) -> 280
merge 26/256: (82, 101) -> 281
merge 27/256: (83, 280) -> 282
merge 28/256: (111, 260) -> 283
merge 29/256: (99, 104) -> 284
merge 30/256: (269, 49) -> 285
merge 31/256: (111, 109) -> 286
merge 32/256: (98, 272) -> 287
merge 

{(101, 32): 256,
 (44, 32): 257,
 (100, 32): 258,
 (46, 32): 259,
 (114, 32): 260,
 (50, 48): 261,
 (115, 32): 262,
 (105, 110): 263,
 (111, 110): 264,
 (114, 105): 265,
 (116, 32): 266,
 (116, 104): 267,
 (101, 258): 268,
 (257, 261): 269,
 (97, 110): 270,
 (97, 114): 271,
 (101, 260): 272,
 (121, 32): 273,
 (97, 108): 274,
 (267, 256): 275,
 (118, 268): 276,
 (119, 105): 277,
 (101, 114): 278,
 (264, 32): 279,
 (277, 102): 280,
 (82, 101): 281,
 (83, 280): 282,
 (111, 260): 283,
 (99, 104): 284,
 (269, 49): 285,
 (111, 109): 286,
 (98, 272): 287,
 (32, 275): 288,
 (97, 121): 289,
 (101, 110): 290,
 (111, 114): 291,
 (274, 32): 292,
 (101, 109): 293,
 (46, 10): 294,
 (265, 101): 295,
 (263, 103): 296,
 (269, 50): 297,
 (116, 105): 298,
 (289, 108): 299,
 (34, 259): 300,
 (108, 108): 301,
 (84, 299): 302,
 (116, 295): 303,
 (294, 32): 304,
 (116, 111): 305,
 (259, 281): 306,
 (306, 303): 307,
 (307, 276): 308,
 (302, 283): 309,
 (101, 115): 310,
 (309, 282): 311,
 (117, 115): 312,
 (11

In [None]:
# encode() requires the text to tokenize and decode() requires the ids to
# turn back into text, so round-trip a short sample string.
basic_sample = "hello world"
basic_ids = baT.encode(basic_sample)
baT.decode(basic_ids)

In [14]:
import regex as re

In [19]:
class RegexTokenizer():
  """Byte-level BPE tokenizer that first splits text with a regex.

  The text is cut into chunks by ``GPT4_SPLIT_PATTERN`` and merges are
  counted and applied inside each chunk only, never across a chunk boundary.

  Attributes:
    merges: dict mapping a pair ``(left_id, right_id)`` to the id that
      replaces it, in the order the merges were learned.
    GPT4_SPLIT_PATTERN: the split pattern source string.
    gpt4pat: the compiled split pattern.
  """

  def __init__(self):
    self.merges = {}
    self.GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
    self.gpt4pat = re.compile(self.GPT4_SPLIT_PATTERN)

  def get_stat(self, tokens_list):
    """Count adjacent pairs over all chunks.

    Args:
      tokens_list: list of chunks, each a list of token ids.

    Returns:
      dict mapping ``(left_id, right_id)`` to its total count; pairs that
      would straddle two chunks are not counted.
    """
    stat = {}  # (pair): number
    for tokens in tokens_list:
      for pair in zip(tokens, tokens[1:]):
        stat[pair] = stat.get(pair, 0) + 1
    return stat

  def merge(self, tokens_list, pair, idx):
    """Replace ``pair`` with ``idx`` inside every chunk.

    Args:
      tokens_list: list of chunks, each a list of token ids.
      pair: the ``(left_id, right_id)`` pair to replace.
      idx: the id written in place of each occurrence.

    Returns:
      A new list of new chunks; the input is not modified.
    """
    new_ids_list = []
    for tokens in tokens_list:
      new_ids = []
      last = len(tokens) - 1
      i = 0
      while i <= last:
        if i < last and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
          new_ids.append(idx)
          i += 2
        else:
          new_ids.append(tokens[i])
          i += 1
      new_ids_list.append(new_ids)
    return new_ids_list

  def train(self, text, vocab_size, verbose=False):
    """Learn up to ``vocab_size - 256`` merges from ``text``.

    Any previously learned merges are discarded.

    Args:
      text: training string.
      vocab_size: target vocabulary size; values of 256 or less learn
        no merges.
      verbose: when true, print each merge as it is learned.

    Returns:
      The learned ``merges`` dict (also stored on the instance).
    """
    tokens_list = [
      list(chunk.encode("utf-8")) for chunk in self.gpt4pat.findall(text)
    ]
    self.merges = {}

    num_merge = vocab_size - 256

    for i in range(num_merge):
      stat = self.get_stat(tokens_list)
      if not stat:
        # Every chunk is down to a single token; nothing left to merge.
        break
      pair = max(stat, key=stat.get)
      idx = i + 256
      tokens_list = self.merge(tokens_list, pair, idx)
      self.merges[pair] = idx

      if verbose:
        print(f"merge {i+1}/{num_merge}: {pair} -> {idx}")

    return self.merges

  def encode(self, text):
    """Encode ``text`` into a flat list of token ids.

    Each regex chunk is encoded on its own and the results are
    concatenated in order.

    Args:
      text: string to encode.

    Returns:
      list of token ids.
    """
    tokens_concat = []
    for chunk in self.gpt4pat.findall(text):
      tokens = list(chunk.encode("utf-8"))

      while len(tokens) >= 2:
        # get_stat and merge work on a list of chunks, so wrap the single
        # chunk going in and unwrap it coming out.
        stat = self.get_stat([tokens])
        # Apply merges in the order they were learned: smallest merged id
        # first; unknown pairs rank last.
        pair = min(stat, key=lambda p: self.merges.get(p, float('inf')))
        if pair not in self.merges:
          break
        tokens = self.merge([tokens], pair, self.merges[pair])[0]

      tokens_concat.extend(tokens)

    return tokens_concat

  def decode(self, ids):
    """Decode token ids back into a string.

    Args:
      ids: iterable of token ids.

    Returns:
      The decoded text; byte sequences that are not valid UTF-8 are
      replaced with U+FFFD.

    Raises:
      KeyError: if an id is neither a byte value nor a learned merge.
    """
    vocab = {idx: bytes([idx]) for idx in range(256)}
    # merges keeps insertion order, so both halves of a pair are already in
    # the table by the time the pair itself is added.
    for (p0, p1), idx in self.merges.items():
      vocab[idx] = vocab[p0] + vocab[p1]
    tokens = b"".join(vocab[idx] for idx in ids)
    text = tokens.decode("utf-8", errors='replace')
    return text

In [20]:
# Train the regex-splitting tokenizer on `text` with a 512-entry vocabulary,
# printing each merge as it is learned.
reT = RegexTokenizer()
reT.train(text, vocab_size=512, verbose=True)

merge 1/256: (101, 114) -> 256
merge 2/256: (50, 48) -> 257
merge 3/256: (111, 114) -> 258
merge 4/256: (105, 110) -> 259
merge 5/256: (101, 100) -> 260
merge 6/256: (32, 116) -> 261
merge 7/256: (111, 110) -> 262
merge 8/256: (104, 101) -> 263
merge 9/256: (32, 83) -> 264
merge 10/256: (97, 114) -> 265
merge 11/256: (97, 110) -> 266
merge 12/256: (32, 65) -> 267
merge 13/256: (261, 263) -> 268
merge 14/256: (97, 108) -> 269
merge 15/256: (114, 105) -> 270
merge 16/256: (118, 260) -> 271
merge 17/256: (115, 116) -> 272
merge 18/256: (119, 105) -> 273
merge 19/256: (32, 82) -> 274
merge 20/256: (257, 49) -> 275
merge 21/256: (32, 102) -> 276
merge 22/256: (257, 50) -> 277
merge 23/256: (32, 84) -> 278
merge 24/256: (102, 116) -> 279
merge 25/256: (97, 121) -> 280
merge 26/256: (32, 34) -> 281
merge 27/256: (273, 279) -> 282
merge 28/256: (101, 116) -> 283
merge 29/256: (264, 282) -> 284
merge 30/256: (99, 104) -> 285
merge 31/256: (98, 256) -> 286
merge 32/256: (97, 116) -> 287
merge 33

{(101, 114): 256,
 (50, 48): 257,
 (111, 114): 258,
 (105, 110): 259,
 (101, 100): 260,
 (32, 116): 261,
 (111, 110): 262,
 (104, 101): 263,
 (32, 83): 264,
 (97, 114): 265,
 (97, 110): 266,
 (32, 65): 267,
 (261, 263): 268,
 (97, 108): 269,
 (114, 105): 270,
 (118, 260): 271,
 (115, 116): 272,
 (119, 105): 273,
 (32, 82): 274,
 (257, 49): 275,
 (32, 102): 276,
 (257, 50): 277,
 (32, 84): 278,
 (102, 116): 279,
 (97, 121): 280,
 (32, 34): 281,
 (273, 279): 282,
 (101, 116): 283,
 (264, 282): 284,
 (99, 104): 285,
 (98, 256): 286,
 (97, 116): 287,
 (111, 109): 288,
 (101, 115): 289,
 (101, 110): 290,
 (101, 109): 291,
 (34, 46): 292,
 (32, 40): 293,
 (46, 10): 294,
 (259, 103): 295,
 (108, 258): 296,
 (32, 77): 297,
 (105, 103): 298,
 (32, 262): 299,
 (280, 296): 300,
 (108, 108): 301,
 (270, 101): 302,
 (274, 283): 303,
 (303, 302): 304,
 (304, 271): 305,
 (32, 115): 306,
 (105, 99): 307,
 (266, 100): 308,
 (111, 117): 309,
 (101, 99): 310,
 (32, 97): 311,
 (41, 46): 312,
 (114, 288): 

In [None]:
# encode() requires the text to tokenize and decode() requires the ids to
# turn back into text, so round-trip a short sample string.
# NOTE(review): this cell follows the RegexTokenizer training, so it
# presumably was meant to exercise `reT` rather than `baT` - confirm.
regex_sample = "hello world"
regex_ids = reT.encode(regex_sample)
reT.decode(regex_ids)

In [None]:
# Reference behaviour to match: encode a mixed-script sample with tiktoken's
# cl100k_base encoding and decode it back.
import tiktoken
enc = tiktoken.get_encoding("cl100k_base") # this is the GPT-4 tokenizer
ids = enc.encode("hello world!!!? (안녕하세요!) lol123 😉")
# NOTE(review): this rebinds the module-level `text` that the corpus was read
# into earlier; re-running a training cell afterwards would use this short string.
text = enc.decode(ids) # get the same text back

In [13]:
# Scratch cell exploring dict.get, the method used as the key function in
# the tokenizers' max()/min() calls.
my_dict = {'a': 1, 'b': 2, 'c': 3}
# The triple-quoted block below is a bare string expression, so the example
# code inside it is not executed.
"""
# Using dict.get() to safely access a value
value = my_dict.get('a')  # Returns 1
print(value)

# Attempting to access a non-existing key with a default value
non_existing = my_dict.get('d', 'Default Value')  # Returns 'Default Value'
print(non_existing)
"""
# Bind the bound method `my_dict.get` to a name without calling it; printing
# the name shows the method object itself rather than a looked-up value.
get_method_reference = my_dict.get
print(get_method_reference)


<built-in method get of dict object at 0x0000027B9C38C840>
