In [2]:
import re
import tokenizer as tok_tests
from collections import Counter

def to_tokens(s):
    r"""Split ``s`` into word tokens and single punctuation characters.

    Each run of word characters (``\w+``) becomes one token; every other
    non-whitespace character becomes a token of its own. Whitespace is
    discarded.
    """
    token_pattern = r"\w+|[^\w\s]"
    return [match.group(0) for match in re.finditer(token_pattern, s)]

def corpus_common_tokens(strings_list, max_tokens=30000):
    """Return the most frequent tokens found in a collection of strings.

    Args:
        strings_list: Iterable of strings; each one is split with ``to_tokens``.
        max_tokens: Upper bound on how many distinct tokens are returned.
            Defaults to 30000 (the value that used to be hard-coded);
            ``None`` returns every distinct token.

    Returns:
        List of token strings ordered from most to least frequent.
    """
    counts = Counter()
    for text in strings_list:
        counts.update(to_tokens(text))
    return [token for token, _ in counts.most_common(max_tokens)]

# Hand corpus_common_tokens to the external check in the `tokenizer` module
# (imported as tok_tests).
# NOTE(review): what the check asserts is defined in that module, not here.
tok_tests.test_tokenizer_from_corpus_fn(corpus_common_tokens)

class Tokenizer:
    """Word-level tokenizer backed by a fixed piece <-> id vocabulary."""

    def __init__(self, token_list, unk_id=3):
        """Build the lookup tables from ``token_list``.

        Args:
            token_list: Collection of dicts, each carrying a ``"piece"``
                (token string) and an ``"id"`` entry.
            unk_id: Id emitted by ``tokenize`` for pieces that are missing
                from the vocabulary. Defaults to 3, the value that used to
                be hard-coded in ``tokenize``.
        """
        self.token_list = token_list
        self.unk_id = unk_id
        self.vocab = {entry["piece"]: entry["id"] for entry in token_list}
        self.vocab_by_id = {entry["id"]: entry["piece"] for entry in token_list}

    def decode(self, ids):
        """Join the pieces for ``ids`` with single spaces.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        return ' '.join(self.vocab_by_id[i] for i in ids)

    def tokenize(self, s):
        """Split ``s`` with ``to_tokens`` and map every piece to its id.

        Pieces that are not in the vocabulary map to ``self.unk_id``.
        """
        return [self.vocab.get(piece, self.unk_id) for piece in to_tokens(s)]


#tok_tests.test_tokenizer(Tokenizer)
    

In [17]:
class BPETokenizer(Tokenizer):
    """Byte-pair-encoding style tokenizer built on the ``Tokenizer`` lookup tables.

    Construction is inherited unchanged from ``Tokenizer``.
    """

    def tokenize(self, text):
        """Greedily merge the characters of ``text`` into known pieces.

        Characters are scanned left to right. Each character is glued onto the
        most recent piece whenever the concatenation is in the vocabulary;
        otherwise it starts a new piece.

        Returns:
            List with one id per resulting piece. Pieces that are not in the
            vocabulary are reported as ``None``.
        """
        pieces = []
        for char in text:
            if pieces and pieces[-1] + char in self.vocab:
                pieces[-1] += char
            else:
                pieces.append(char)
        return [self.vocab.get(piece) for piece in pieces]

    def decode(self, ids):
        """Concatenate the pieces for ``ids``; unknown ids render as ``"<UNK>"``."""
        return ''.join(self.vocab_by_id.get(i, "<UNK>") for i in ids)

    @staticmethod
    def _merge_pair(left, right, merged, ids):
        """Return ``ids`` with each adjacent ``left, right`` replaced by ``merged``."""
        out = []
        for token_id in ids:
            if out and token_id == right and out[-1] == left:
                out[-1] = merged
            else:
                out.append(token_id)
        return out

    @classmethod
    def from_corpus(cls, corpus, merges=1000):
        """Learn a BPE vocabulary from ``corpus``.

        Args:
            corpus: Iterable of strings. Pairs are only counted inside a
                single string, so merges never span two strings.
            merges: Maximum number of pair merges to learn. Learning stops
                early once no adjacent pair is left.

        Returns:
            An instance of ``cls`` whose vocabulary holds every distinct
            character of the corpus plus one piece per learned merge.
        """
        # Materialise once so a one-shot iterable survives both passes below.
        corpus = list(corpus)

        alphabet = set()
        for word in corpus:
            alphabet.update(word)  # adds every character of `word`

        # Sort so ids are reproducible: iteration order of a set of strings
        # changes between interpreter runs because of hash randomisation.
        pieces = sorted(alphabet)  # pieces[i] is the piece with id i
        piece_to_id = {piece: idx for idx, piece in enumerate(pieces)}
        # Re-express every string as a list of token ids.
        encoded = [[piece_to_id[char] for char in word] for word in corpus]

        for _ in range(merges):
            pair_counts = Counter()
            for ids in encoded:
                # Count every pair of adjacent tokens.
                pair_counts.update(zip(ids, ids[1:]))
            if not pair_counts:
                # Every string is down to at most one token: nothing to merge.
                break
            (left, right), _count = pair_counts.most_common(1)[0]
            merged_piece = pieces[left] + pieces[right]
            new_id = len(pieces)
            piece_to_id[merged_piece] = new_id
            pieces.append(merged_piece)
            encoded = [cls._merge_pair(left, right, new_id, ids) for ids in encoded]

        tokens = [{"piece": piece, "id": idx} for piece, idx in piece_to_id.items()]
        return cls(tokens)

# tok_tests.test_bpe_tokenizer_from_corpus(BPETokenizer)

from pathlib import Path
# Train a BPE vocabulary on a slice of the corpus (lines 5000-5999) and
# round-trip one sentence through tokenize/decode.
corpus_path = Path.home() / "mlab/days/w2d4/shakespeare.txt"
with open(corpus_path) as corpus_file:  # closes the handle once the lines are read
    corpus = corpus_file.readlines()
minicorpus = corpus[5000:6000]
tokenizer = BPETokenizer.from_corpus(minicorpus)
# NOTE(review): the emoji is presumably absent from the training slice, so it
# exercises the unknown-piece path (None id, "<UNK>" on decode).
sentence = "Buck and Tao drove around in their firetruck, which was painted bright red 🔥."
tokens = tokenizer.tokenize(sentence)
print(tokens)
decoded = tokenizer.decode(tokens)
print(decoded)

[56, 48, 526, 85, 82, 18, 14, 88, 60, 162, 110, 99, 320, 82, 133, 236, 468, 476, 331, 254, 383, 80, 39, 93, 188, 347, 78, 389, 666, 462, 58, 185, 148, 77, 108, 82, None, 49]
Buck and Tao drove around in their firetruck, which was painted bright red <UNK>.
