In [39]:
from collections import Counter

class BytePairTokenizer:
  """Character-level byte-pair-encoding (BPE) tokenizer.

  `fit` learns an ordered list of pair merges from a corpus of strings and
  `transform` replays those merges, in the order learnt, on documents.

  Attributes:
    target_vocab_size: Vocabulary size at which `fit` stops learning merges.
      The size is counted as the number of distinct characters appearing in
      the whitespace-separated words of the training corpus plus the number
      of merges learnt so far.
    merges: Learnt merges as `(left, right)` tuples, in the order learnt.
  """

  def __init__(self, target_vocab_size):
    self.target_vocab_size = target_vocab_size
    self.merges = []

  def _compute_word_frequencies(self, corpus):
    """Count how often each whitespace-separated word occurs in the corpus."""
    return Counter(" ".join(corpus).split())

  def _initialize_tokens(self, word_frequency):
    """Map every word to its initial tokenization: a list of single characters."""
    return {word: list(word) for word in word_frequency}

  def _compute_pair_frequencies(self, tokens, word_frequency):
    """Count adjacent token pairs, weighting each word by its corpus frequency."""
    pair_freq = Counter()
    for word, freq in word_frequency.items():
      symbols = tokens[word]
      for left, right in zip(symbols, symbols[1:]):
        pair_freq[(left, right)] += freq
    return pair_freq

  def _find_best_pair(self, pair_freq):
    """Return the most frequent pair, or None when there are no pairs left."""
    return pair_freq.most_common(1)[0][0] if pair_freq else None

  @staticmethod
  def _merge_sequence(sequence, pair, merged):
    """Return a new list with every occurrence of `pair` replaced by `merged`.

    The scan is greedy and left to right; matches do not overlap, so with the
    pair ('a', 'a') the sequence ['a', 'a', 'a'] becomes ['aa', 'a'].
    """
    left, right = pair
    last = len(sequence) - 1
    result = []
    i = 0
    while i <= last:
      if i < last and sequence[i] == left and sequence[i + 1] == right:
        result.append(merged)
        i += 2
      else:
        result.append(sequence[i])
        i += 1
    return result

  def _merge_tokens(self, tokens, pair):
    """Replace occurrences of a pair of tokens with the merged form in every word."""
    merged = ''.join(pair)
    return {
      word: self._merge_sequence(symbols, pair, merged)
      for word, symbols in tokens.items()
    }

  def fit(self, corpus):
    """Learn merges from `corpus` (an iterable of strings).

    Merges learnt by a previous call are discarded, so refitting never mixes
    merges from different corpora. Learning stops once the target vocabulary
    size is reached or no adjacent pair remains to merge.
    """
    word_frequency = self._compute_word_frequencies(corpus)
    tokens = self._initialize_tokens(word_frequency)

    # Start from a clean slate: stale merges would otherwise count toward the
    # vocabulary budget and leak into `transform`.
    self.merges = []
    # The base alphabet does not change while merging, so compute it once.
    base_vocab_size = len(set("".join(word_frequency)))

    while base_vocab_size + len(self.merges) < self.target_vocab_size:
      pair_frequency = self._compute_pair_frequencies(tokens, word_frequency)
      best_pair = self._find_best_pair(pair_frequency)
      if best_pair is None:
        break
      self.merges.append(best_pair)
      tokens = self._merge_tokens(tokens, best_pair)

  def transform(self, corpus):
    """Tokenize each document by replaying the learnt merges in order.

    Each document is first split into single characters (whitespace included),
    then every learnt pair is merged left to right.

    Returns:
      A list with one list of string tokens per input document.
    """
    tokenized_corpus = []
    for document in corpus:
      symbols = list(document)
      for pair in self.merges:
        symbols = self._merge_sequence(symbols, pair, pair[0] + pair[1])
      tokenized_corpus.append(symbols)
    return tokenized_corpus

In [40]:
class Vectorizer:
  """Maps tokens to integer ids and back.

  Ids start at 1 and are assigned in order of decreasing token frequency.

  Attributes:
    vocabulary: dict mapping integer id -> token.
    inverse_vocabulary: dict mapping token -> integer id.
  """

  def __init__(self):
    self.vocabulary = {}
    self.inverse_vocabulary = {}

  def fit(self, tokenized_corpus):
    """Build the id mappings from a corpus given as a list of token lists.

    Both mappings are rebuilt from scratch on every call, so refitting never
    leaves tokens from an earlier corpus behind.
    """
    token_frequency = Counter(
      tok for doc in tokenized_corpus for tok in doc
    ).most_common()

    vocabulary = {}
    inverse_vocabulary = {}
    for integer, (token, _) in enumerate(token_frequency, start=1):
      vocabulary[integer] = token
      inverse_vocabulary[token] = integer

    # Assign fresh dicts rather than updating in place: stale entries would
    # otherwise share ids with the newly fitted tokens.
    self.vocabulary = vocabulary
    self.inverse_vocabulary = inverse_vocabulary

  def transform(self, tokens):
    """Convert a list of token lists into a list of integer-id lists.

    Raises:
      KeyError: if a token was not seen during `fit`.
    """
    return [[self.inverse_vocabulary[tok] for tok in doc] for doc in tokens]

  def fit_transform(self, tokens):
    """Fit on `tokens` and return their integer-id encoding."""
    self.fit(tokens)
    return self.transform(tokens)

  def inverse_transform(self, tokens):
    """Convert a list of integer-id lists back into a list of token lists.

    Raises:
      KeyError: if an id is not present in the fitted vocabulary.
    """
    return [[self.vocabulary[tok] for tok in doc] for doc in tokens]

In [45]:
# Small demo corpus used to exercise the tokenizer -> vectorizer round trip.
corpus = [
  "Hello world!",
  "It is a great day to hug puppies.",
  "It would be a shame to not do so.",
  "This text is full of nonsense: I don't care!",
  "I hope I have enough pair variety here to get an interesting result.",
  "This project is going to be a challenge",
  "This is all about tokenization.",
  "I'm trying to make this tokenization algorithm work",
  "I really I hope this works.",
]

# Learn merges up to a target vocabulary of 100, then tokenize the same corpus.
vectorizer = Vectorizer()
tokenizer = BytePairTokenizer(100)
tokenizer.fit(corpus)
tokens = tokenizer.transform(corpus)

# Encode the tokens as integer ids.
sequence = vectorizer.fit_transform(tokens)

# Print the ids, then the tokens recovered from them.
print(sequence, '\n', vectorizer.inverse_transform(sequence))

[[37, 1, 38], [18, 1, 8, 1, 2, 1, 39, 1, 40, 1, 5, 1, 41, 1, 42], [18, 1, 43, 1, 19, 1, 2, 1, 44, 1, 5, 1, 45, 1, 46, 1, 47], [9, 1, 48, 1, 8, 1, 49, 1, 50, 1, 51, 20, 10, 20, 6, 52, 1, 4, 1, 53, 54, 21, 3, 1, 11, 2, 12, 55], [4, 1, 22, 1, 4, 1, 23, 24, 6, 1, 10, 13, 56, 14, 1, 25, 2, 26, 7, 1, 24, 2, 7, 57, 3, 15, 1, 14, 6, 12, 1, 5, 1, 27, 3, 1, 2, 58, 1, 59, 60, 28, 3, 16, 1, 28, 29, 30, 3, 31], [9, 1, 25, 7, 13, 61, 6, 11, 3, 1, 8, 1, 32, 16, 1, 5, 1, 19, 1, 2, 1, 11, 23, 62, 10, 27], [9, 1, 8, 1, 33, 1, 2, 63, 13, 29, 3, 1, 34, 31], [4, 21, 17, 1, 3, 7, 15, 16, 1, 5, 1, 17, 2, 64, 6, 1, 35, 1, 34, 1, 2, 30, 32, 7, 26, 3, 14, 17, 1, 36], [4, 1, 12, 33, 15, 1, 4, 1, 22, 1, 35, 1, 36, 65]] 
 [['Hello', ' ', 'world!'], ['It', ' ', 'is', ' ', 'a', ' ', 'great', ' ', 'day', ' ', 'to', ' ', 'hug', ' ', 'puppies.'], ['It', ' ', 'would', ' ', 'be', ' ', 'a', ' ', 'shame', ' ', 'to', ' ', 'not', ' ', 'do', ' ', 'so.'], ['This', ' ', 'text', ' ', 'is', ' ', 'full', ' ', 'of', ' ', 'non', 's'