In [1]:
import re
from collections import defaultdict

In [2]:
# https://everdark.github.io/k9/notebooks/ml/natural_language_understanding/subword_units/subword_units.nb.html
class BytePairEncoding:
    """Learn subword units from text by repeatedly merging the most frequent symbol pair.

    Attributes:
        verbose: When true, each newly created subword is printed.
        word_vocab: ``{word: count}`` for words occurring at least ``min_cnt`` times.
        working_vocab: ``{space-separated symbols: count}``; starts with every word
            split into single characters and is rewritten by each merge.
        subword_vocab: ``{merged subword: pair count at the time it was merged}``.
        char_vocab: ``{character: count}`` over every character of every sentence
            (whitespace and punctuation included).
    """

    def __init__(self, sents, min_cnt=3, verbose=False):
        """Build the fullword, working and character vocabularies.

        Args:
            sents: Iterable of sentence strings. It is traversed exactly once, so a
                generator works as well as a list.
            min_cnt: Minimum number of occurrences for a word to be kept.
            verbose: Whether to print each subword as it is created.
        """
        self.verbose = verbose
        init_vocab = defaultdict(int)
        # Also build a character-level vocabulary as the base subwords
        self.char_vocab = defaultdict(int)
        # Count words and characters in a single pass so that one-shot iterables
        # (generators, file objects) are not exhausted before the character count.
        for sent in sents:
            for char in sent:
                self.char_vocab[char] += 1
            for w in re.split(r"\W+", sent):
                # re.split yields empty strings at the edges of a sentence
                if w != "":
                    init_vocab[w] += 1
        # Create fullword vocabulary
        self.word_vocab = {k: v for k, v in init_vocab.items() if v >= min_cnt}
        # Insert space between each char in a word for ease of merge operation down the line
        # We directly borrow the idea from https://www.aclweb.org/anthology/P16-1162
        self.working_vocab = {" ".join(k): v for k, v in self.word_vocab.items()}
        self.subword_vocab = defaultdict(int)

    def _find_top_subword(self):
        """Find the most frequent adjacent symbol pair and record it as a subword.

        Returns:
            The ``(left, right)`` symbol pair with the highest weighted count, or
            ``None`` when no word in the working vocabulary has two or more symbols.
        """
        subword_pairs = defaultdict(int)
        for w, cnt in self.working_vocab.items():
            subw = w.split()
            # Count bigrams, weighted by how often the word occurs
            for left, right in zip(subw, subw[1:]):
                subword_pairs[left, right] += cnt
        if not subword_pairs:
            # Nothing left to merge; max() would raise ValueError on an empty mapping.
            return None
        top_subw_pair = max(subword_pairs, key=subword_pairs.get)
        top_subw = "".join(top_subw_pair)
        self.subword_vocab[top_subw] = subword_pairs[top_subw_pair]
        if self.verbose:
            print(f"New subword added: {top_subw}")

        return top_subw_pair

    def _merge(self, subw_pair):
        """Replace every occurrence of ``subw_pair`` in the working vocabulary by its concatenation.

        Args:
            subw_pair: The ``(left, right)`` symbols to merge.
        """
        bigram = re.escape(" ".join(subw_pair))
        # The lookarounds make sure only whole symbols match, not the tail of one
        # symbol followed by the head of another.
        p = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")
        merged = "".join(subw_pair)
        # A callable replacement inserts the text literally; a plain string would be
        # treated as a template in which backslashes have special meaning.
        self.working_vocab = {
            p.sub(lambda _match: merged, w): cnt
            for w, cnt in self.working_vocab.items()
        }

    def update_subword(self, n_merge=1):
        """Perform up to ``n_merge`` merge operations.

        Stops early once there is no symbol pair left to merge.

        Args:
            n_merge: Maximum number of merges to perform.
        """
        for _ in range(n_merge):
            top_subw_pair = self._find_top_subword()
            if top_subw_pair is None:
                break
            self._merge(top_subw_pair)

In [3]:
import warnings

# Install the filter before the imports below so it is already active while they run
# (presumably to hide FutureWarning noise raised when TensorFlow is imported — TODO confirm).
warnings.simplefilter(action="ignore", category=FutureWarning)

import os
import shutil
import tensorflow as tf

In [4]:
shakes_file = "../data/shakespeare.txt"
if not os.path.exists(shakes_file):
    # shutil.move does not create missing parent directories, so make sure the
    # target directory exists before moving the download into place.
    os.makedirs(os.path.dirname(shakes_file), exist_ok=True)
    shakes_dl_path = tf.keras.utils.get_file(
        "shakespeare.txt",
        "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
    )
    shutil.move(shakes_dl_path, shakes_file)

# Read the raw bytes and decode explicitly; the context manager closes the file
# handle instead of leaving it to the garbage collector.
with open(shakes_file, "rb") as shakes_fh:
    shakes_text = shakes_fh.read().decode(encoding="utf-8")

# One lowercased string per line of the corpus (blank lines are kept as "")
shakespeare = shakes_text.lower().split("\n")

# Print the first few lines
for sent in shakespeare[:20]:
    print(sent)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
first citizen:
before we proceed any further, hear me speak.

all:
speak, speak.

first citizen:
you are all resolved rather to die than to famish?

all:
resolved. resolved.

first citizen:
first, you know caius marcius is chief enemy to the people.

all:
we know't, we know't.

first citizen:
let us kill him, and we'll have corn at our own price.


In [5]:
# Build the encoder on the corpus with min_cnt=10, then report the fullword vocabulary size
bpe = BytePairEncoding(sents=shakespeare, min_cnt=10)
fullword_count = len(bpe.word_vocab)
print(fullword_count)

1872


In [6]:
# Show the first five (word, count) entries of the fullword vocabulary
fullword_items = list(bpe.word_vocab.items())
print(fullword_items[:5])

[('first', 363), ('citizen', 100), ('before', 195), ('we', 938), ('proceed', 21)]


In [7]:
# (For debugging) Show the first five entries of the working vocabulary on which
# the merge operations will be performed
working_items_before = list(bpe.working_vocab.items())
print(working_items_before[:5])

[('f i r s t', 363), ('c i t i z e n', 100), ('b e f o r e', 195), ('w e', 938), ('p r o c e e d', 21)]


In [8]:
# Run 100 merge operations, then report how many subwords have been recorded
bpe.update_subword(100)
subword_count = len(bpe.subword_vocab)
print(subword_count)

100


In [9]:
# Inspect the first five working-vocabulary entries now that merges have been applied
working_items_after = list(bpe.working_vocab.items())
print(working_items_after[:5])

[('f ir st', 363), ('c it i z en', 100), ('be for e', 195), ('we', 938), ('p ro ce ed', 21)]


In [10]:
# Show the first five subwords produced by the merge operations, with their counts
subword_items = list(bpe.subword_vocab.items())
print(subword_items[:5])

[('th', 25186), ('ou', 11960), ('the', 11654), ('an', 11587), ('in', 9012)]
