Can't wait to build my own GPT tokenizer!

In [18]:
class BasicTokenizer:
    """Minimal byte-pair-encoding (BPE) tokenizer trained on UTF-8 bytes.

    Training repeatedly replaces the most frequent adjacent token pair with
    a freshly allocated token id, starting at 256 (the first id above the
    raw byte range 0-255).
    """

    def __init__(self):
        # token id -> (left, right) pair it replaces, filled in by train()
        self.merges = {}

    def _get_pair_rankings(self, text_bytes):
        """Count the occurrences of each adjacent pair in the input.

        Args:
            text_bytes: sequence of integer token ids.

        Returns:
            A list of ``((left, right), count)`` tuples sorted by count,
            highest first. Pairs with equal counts keep the order in which
            they were first encountered (the sort is stable). The list is
            empty when the input has fewer than two tokens.
        """
        encountered_pairs = {}
        # zip against the sequence shifted by one to walk adjacent pairs
        for pair in zip(text_bytes, text_bytes[1:]):
            encountered_pairs[pair] = encountered_pairs.get(pair, 0) + 1

        return sorted(encountered_pairs.items(), key=lambda x: x[1], reverse=True)

    def _merge_bp(self, text_bytes, encountered_pairs, next_token, verbose):
        """Merge the most frequently occurring pair into ``next_token``.

        Args:
            text_bytes: sequence of integer token ids to compress.
            encountered_pairs: ranking as returned by ``_get_pair_rankings``;
                only the first entry (the top pair) is used.
            next_token: id that replaces each occurrence of the top pair.
            verbose: when true, print which pair was merged.

        Returns:
            ``(new_sequence, merges)`` where ``new_sequence`` is the
            compressed token list and ``merges`` maps ``next_token`` to the
            pair it replaced. If ``encountered_pairs`` is empty there is
            nothing to merge: the sequence is returned as an unchanged copy
            together with an empty dict.
        """
        if not encountered_pairs:
            return list(text_bytes), {}

        top_pair = encountered_pairs[0][0]
        new_sequence = []
        length = len(text_bytes)
        i = 0

        # Scan left to right; a match consumes two tokens, so overlapping
        # occurrences (e.g. "aaa" with pair "aa") merge greedily from the left.
        while i < length:
            if i + 1 < length and (text_bytes[i], text_bytes[i + 1]) == top_pair:
                new_sequence.append(next_token)
                i += 2
            else:
                # Also handles the final token when it was not part of a merge.
                new_sequence.append(text_bytes[i])
                i += 1

        if verbose:
            print(f"Merged {top_pair} into {next_token}.")

        return new_sequence, {next_token: top_pair}

    def train(self, text, vocab_size, verbose=False):
        """Train the tokenizer on your own text sequence.

        Args:
            text: training text; it is encoded to UTF-8 bytes first.
            vocab_size: merging continues while the next token id is below
                this value, so at most ``vocab_size - 256`` merges happen.
            verbose: when true, print each merge as it is performed.

        Returns:
            The training text as a list of token ids after all merges.
            The learned merge table is stored on ``self.merges``.
        """
        text_bytes = list(text.encode("utf-8"))
        merges = {}
        next_token = 256

        while next_token < vocab_size:
            encountered_pairs = self._get_pair_rankings(text_bytes)
            if not encountered_pairs:
                # fewer than two tokens left: nothing more can be merged
                break
            # after each round, carry the compressed sequence forward and
            # record the merge that produced it
            text_bytes, new_merge = self._merge_bp(
                text_bytes, encountered_pairs, next_token, verbose
            )
            merges.update(new_merge)
            next_token += 1

        self.merges = merges
        return text_bytes

In [20]:
# Train a tokenizer on the sample corpus, printing each merge as it happens.
tokenizer = BasicTokenizer()

# Use a context manager so the file handle is closed once the text is read.
with open("../taylorswift.txt", "r", encoding="utf-8") as training_file:
    training_set = training_file.read()

# 500 is the vocab_size argument and True enables verbose output.
r = tokenizer.train(training_set, 500, True)

Merged (101, 32) into 256.
Merged (44, 32) into 257.
Merged (100, 32) into 258.
Merged (46, 32) into 259.
Merged (114, 32) into 260.
Merged (50, 48) into 261.
Merged (115, 32) into 262.
Merged (105, 110) into 263.
Merged (111, 110) into 264.
Merged (114, 105) into 265.
Merged (116, 32) into 266.
Merged (116, 104) into 267.
Merged (101, 258) into 268.
Merged (257, 261) into 269.
Merged (97, 110) into 270.
Merged (97, 114) into 271.
Merged (101, 260) into 272.
Merged (121, 32) into 273.
Merged (97, 108) into 274.
Merged (267, 256) into 275.
Merged (118, 268) into 276.
Merged (119, 105) into 277.
Merged (101, 114) into 278.
Merged (264, 32) into 279.
Merged (277, 102) into 280.
Merged (82, 101) into 281.
Merged (83, 280) into 282.
Merged (111, 260) into 283.
Merged (99, 104) into 284.
Merged (269, 49) into 285.
Merged (111, 109) into 286.
Merged (98, 272) into 287.
Merged (32, 275) into 288.
Merged (97, 121) into 289.
Merged (101, 110) into 290.
Merged (111, 114) into 291.
Merged (274, 32