In [4]:
import tokenizer as tok_tests
from typing import List
from collections import Counter
import re
import json

In [5]:
# Sample input for the tokenizer demo further down: mixed-case words,
# punctuation, and one emoji character.
sentence = "Buck and Tao drove around in their firetruck, which was painted bright red 🔥."

In [6]:
def split_into_tokens(s: str) -> List[str]:
    """Break text into word tokens and single-character symbol tokens.

    A token is either a maximal run of word characters, or exactly one
    character that is neither a word character nor whitespace (punctuation,
    emoji, ...). Whitespace only separates tokens and is never returned.

    Args:
        s: Text to split.

    Returns:
        The tokens in the order they occur in ``s``; empty for empty input.
    """
    token_pattern = r"\w+|[^\w\s]"
    return [match.group() for match in re.finditer(token_pattern, s)]

In [7]:
def corpus_common_tokens(strings: List[str], n: int = 30000) -> List[str]:
    """Return the ``n`` most frequent tokens found across ``strings``.

    Every string is tokenized with ``split_into_tokens`` and the occurrences
    are tallied over the whole corpus.

    Args:
        strings: The corpus, one text per element.
        n: Maximum number of tokens to return.

    Returns:
        Tokens ordered from most to least frequent, at most ``n`` of them.
    """
    frequencies = Counter()
    for text in strings:
        # Counter.update adds one count per element of the iterable.
        frequencies.update(split_into_tokens(text))

    return [token for token, _count in frequencies.most_common(n)]

# Check corpus_common_tokens with the test helper from the imported
# `tokenizer` module (aliased as tok_tests); it prints its verdict.
tok_tests.test_tokenizer_from_corpus_fn(corpus_common_tokens)

You passed the test!!!


In [8]:
class Tokenizer:
    """Word-level tokenizer backed by a fixed vocabulary.

    The vocabulary is a list of entries, each a mapping with an ``"id"`` and a
    ``"piece"`` key. Text is split with ``split_into_tokens`` and every token
    is looked up in that vocabulary.

    Attributes:
        token_list: The vocabulary entries exactly as passed in.
        unk_id: Id emitted for tokens that are not in the vocabulary.
        decoder: Mapping from id to piece.
        encoder: Mapping from piece to id.
    """

    def __init__(self, token_list: List[dict], unk_id: int = 3):
        """Build the id<->piece lookup tables.

        Args:
            token_list: Vocabulary entries with ``"id"`` and ``"piece"`` keys.
            unk_id: Id to emit for out-of-vocabulary tokens. Defaults to 3,
                the value that used to be hard-coded in ``tokenize``.
        """
        self.token_list = token_list
        self.unk_id = unk_id
        self.decoder = {entry["id"]: entry["piece"] for entry in token_list}
        self.encoder = {entry["piece"]: entry["id"] for entry in token_list}

    def decode(self, ids: List[int]) -> str:
        """Turn ids back into text, joining the pieces with single spaces.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        # `token_id` rather than `id` so the builtin is not shadowed.
        return " ".join(self.decoder[token_id] for token_id in ids)

    def tokenize(self, s: str) -> List[int]:
        """Convert text to ids; unknown tokens map to ``self.unk_id``."""
        return [
            self.encoder.get(token, self.unk_id)
            for token in split_into_tokens(s)
        ]

In [14]:
# Load the vocabulary entries that are handed to BPETokenizer.
# The encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding and could mis-decode
# non-ASCII pieces.
with open("bpe_tokens.json", encoding="utf-8") as f:
    bpe_tokens = json.load(f)

In [27]:
# NOTE(review): BPETokenizer is defined in a cell that appears *after* this
# one (execution count 26 vs. 27 here). On a fresh kernel run top to bottom
# this cell raises NameError; the class cell needs to move above it.
tokenizer = BPETokenizer(bpe_tokens)
# Round-trip the sample sentence to inspect how it was split into pieces.
tokenizer.decode(tokenizer.tokenize(sentence))

'Bu ck  and  Tao  drove  around  in  their  fire tr uck ,  which  was  painted  bright  red   $ .'

In [26]:
class BPETokenizer(Tokenizer):
    """Tokenizer that builds pieces by greedily merging adjacent fragments.

    Construction is inherited unchanged from ``Tokenizer``.

    NOTE(review): ``decode`` is inherited too and joins pieces with spaces,
    so ``decode(tokenize(s))`` does not reproduce ``s`` for sub-word pieces.
    Confirm whether a BPE-specific ``decode`` is wanted.
    """

    def tokenize(self, s: str) -> List[int]:
        """Convert text to ids using leftmost-first pair merging.

        The text starts as a list of single characters. The leftmost adjacent
        pair whose concatenation is a vocabulary piece is merged, repeatedly,
        until no adjacent pair can be merged.

        Args:
            s: Text to tokenize.

        Returns:
            One id per final piece; pieces missing from the vocabulary get the
            fallback id.
        """
        pieces = list(s)
        i = 0
        # Invariant: no pair starting left of index i is mergeable.
        while i < len(pieces) - 1:
            merged = pieces[i] + pieces[i + 1]
            if merged in self.encoder:
                # Merge in place instead of rebuilding the list from slices.
                pieces[i:i + 2] = [merged]
                # Only the pair ending at the new piece changed, so step back
                # one position rather than rescanning from the start.
                i = max(i - 1, 0)
            else:
                i += 1

        # Use an `unk_id` attribute when the instance has one; otherwise 3,
        # the fallback id this method has always used.
        unk_id = getattr(self, "unk_id", 3)
        return [self.encoder.get(piece, unk_id) for piece in pieces]


In [12]:
# Scratch check: slicing from an index equal to the list length gives an
# empty list instead of raising, so `pieces[i + 2:]` in
# BPETokenizer.tokenize is safe when the merged pair is the last one.
pieces = [0, 1, 2, 3]
pieces[4:]

[]