## Implementing the WordPiece Tokenization Technique

Improvment: Enhanced Suffix Array

### Defining the Corpus

In [2]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be ableabab to understand how they are trained and generate tokens."
]

### Removing Punctuations

In [3]:
from string import punctuation

def cleaning(text):
    for p in punctuation + ' ' + '\n':
        text = ' '.join([sub.strip() for sub in text.split(p) if len(sub) > 1])

    return text

# https://towardsdatascience.com/difference-between-nfd-nfc-nfkd-and-nfkc-explained-with-python-code-e2631f96ae6c

In [4]:
corpus = [cleaning(text) for text in corpus]

corpus

['This is the Hugging Face Course',
 'This chapter is about tokenization',
 'This section shows several tokenizer algorithms',
 'Hopefully you will be ableabab to understand how they are trained and generate tokens']

### Calculating the Frequency of each word in the Corpus

In [5]:
from collections import defaultdict


word_freqs = defaultdict(lambda : 0)
for text in corpus:
    for word in text.split():
        word_freqs[word] += 1

word_freqs

defaultdict(<function __main__.<lambda>()>,
            {'This': 3,
             'is': 2,
             'the': 1,
             'Hugging': 1,
             'Face': 1,
             'Course': 1,
             'chapter': 1,
             'about': 1,
             'tokenization': 1,
             'section': 1,
             'shows': 1,
             'several': 1,
             'tokenizer': 1,
             'algorithms': 1,
             'Hopefully': 1,
             'you': 1,
             'will': 1,
             'be': 1,
             'ableabab': 1,
             'to': 1,
             'understand': 1,
             'how': 1,
             'they': 1,
             'are': 1,
             'trained': 1,
             'and': 1,
             'generate': 1,
             'tokens': 1})

### Splitting those Words into Character Level

In [6]:
splits = {word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)] for word in word_freqs.keys()}

splits

{'This': ['T', '##h', '##i', '##s'],
 'is': ['i', '##s'],
 'the': ['t', '##h', '##e'],
 'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'],
 'Face': ['F', '##a', '##c', '##e'],
 'Course': ['C', '##o', '##u', '##r', '##s', '##e'],
 'chapter': ['c', '##h', '##a', '##p', '##t', '##e', '##r'],
 'about': ['a', '##b', '##o', '##u', '##t'],
 'tokenization': ['t',
  '##o',
  '##k',
  '##e',
  '##n',
  '##i',
  '##z',
  '##a',
  '##t',
  '##i',
  '##o',
  '##n'],
 'section': ['s', '##e', '##c', '##t', '##i', '##o', '##n'],
 'shows': ['s', '##h', '##o', '##w', '##s'],
 'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'],
 'tokenizer': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##e', '##r'],
 'algorithms': ['a',
  '##l',
  '##g',
  '##o',
  '##r',
  '##i',
  '##t',
  '##h',
  '##m',
  '##s'],
 'Hopefully': ['H', '##o', '##p', '##e', '##f', '##u', '##l', '##l', '##y'],
 'you': ['y', '##o', '##u'],
 'will': ['w', '##i', '##l', '##l'],
 'be': ['b', '##e'],
 'ableabab': ['a', '##

### Computing Pair Scores: $\cfrac{freq \; of \; pair}{freq \; of \; first \; element \times freq \; of \; second \; element}$

In [7]:
def pair_score(splits):
    token_freqs = defaultdict(lambda: 0) # Capturing the global corpus statistics for `freq of firsy element` and `freq of second element`
    pair_freqs = defaultdict(lambda: 0)   # Capturing the statistics for `freq of pair`: How many times this pair appears in the corpus

    # Iterate over all words of the corpus
    for word, freq in word_freqs.items():
        split = splits[word]

        # If a word contains only 1 letter
        if len(split) == 1:
            token_freqs[split[0]] += freq
            continue

        for i in range(len(split) - 1):
            token_freqs[split[i]] += freq
            pair_freqs[(split[i], split[i+1])] += freq

        # Adding the final token that the for-loop is not processing
        token_freqs[split[-1]] += freq

    # Returning the scores, calculated from the formula above
    return {pair: pair_freq / (token_freqs[pair[0]] * token_freqs[pair[1]]) for pair, pair_freq in pair_freqs.items()}



In [8]:
ps = pair_score(splits)

ps

{('T', '##h'): 0.125,
 ('##h', '##i'): 0.03409090909090909,
 ('##i', '##s'): 0.02727272727272727,
 ('i', '##s'): 0.1,
 ('t', '##h'): 0.03571428571428571,
 ('##h', '##e'): 0.011904761904761904,
 ('H', '##u'): 0.1,
 ('##u', '##g'): 0.05,
 ('##g', '##g'): 0.0625,
 ('##g', '##i'): 0.022727272727272728,
 ('##i', '##n'): 0.01652892561983471,
 ('##n', '##g'): 0.022727272727272728,
 ('F', '##a'): 0.1111111111111111,
 ('##a', '##c'): 0.05555555555555555,
 ('##c', '##e'): 0.023809523809523808,
 ('C', '##o'): 0.07692307692307693,
 ('##o', '##u'): 0.046153846153846156,
 ('##u', '##r'): 0.022222222222222223,
 ('##r', '##s'): 0.022222222222222223,
 ('##s', '##e'): 0.004761904761904762,
 ('c', '##h'): 0.125,
 ('##h', '##a'): 0.013888888888888888,
 ('##a', '##p'): 0.05555555555555555,
 ('##p', '##t'): 0.07142857142857142,
 ('##t', '##e'): 0.013605442176870748,
 ('##e', '##r'): 0.026455026455026454,
 ('a', '##b'): 0.1,
 ('##b', '##o'): 0.019230769230769232,
 ('##u', '##t'): 0.02857142857142857,
 ('t', 

### Creating a Function that returns the Pair with the higher Score

In [9]:
def best_score(pair_scores):
    max_pair = ('', '')
    max_freq = 0
    for pair, freq in pair_scores.items():
        if freq > max_freq:
            max_freq = freq
            max_pair = pair
    
    return max_pair, max_freq

In [10]:
best_score(ps)

(('##f', '##u'), 0.2)

### Creating a Function that Merge Tokens 

In [11]:
def merge_pair(pair_t, splits):
    # Iterating over the word-tokens defaultdict
    for word in splits.keys():
        split = splits[word] # contains the tokens of the word

        if len(split) == 1:
            continue
        
        # Iterating until we find the pair in the tokenize representation of the word
        i = 0
        while i < len(split) - 1:
            if (split[i] == pair_t[0]) and (split[i+1] == pair_t[1]):
                split = split[:i] + [pair_t[0] + pair_t[1][2:]] + split[i+2:]
            else:
                i += 1

        splits[word] = split
    return splits

In [12]:
splits["ableabab"]

['a', '##b', '##l', '##e', '##a', '##b', '##a', '##b']

In [13]:
s = merge_pair(("##a", "##b"), splits)

print(s["ableabab"])

['a', '##b', '##l', '##e', '##ab', '##ab']


### Creating the Vocabulary

In [14]:
def create_vocab(desire_len):
    # Original splits (character tokenization of each word in the corpus)
    splits = {word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)] for word in word_freqs.keys()}

    # Creating the basic vocabulary
    vocab_ = set([token for tokens in splits.values() for token in tokens])
    
    i = len(vocab_)
    new_tokens = []
    while i < desire_len:
        ps = pair_score(splits)          # Calculate the Score of each pair
        bs, _ = best_score(ps)           # Get the pair with the highest score
        splits = merge_pair(bs, splits)  # Merge those two tokens

        new_tokens.append(bs[0] + bs[1][2:])
        i += 1

    # Adding the the basic vocabulary the new tokens
    vocab_ = ["[CLS]", "[PAD]", "[UNK]"] + sorted(list((vocab_ | set(new_tokens))))
    return {term: i for i, term in enumerate(vocab_)}, vocab_

In [15]:
vocab, vocab_ = create_vocab(70_000)

In [16]:
len(vocab_)
print(vocab_)

['[CLS]', '[PAD]', '[UNK]', '', '##a', '##ab', '##abab', '##at', '##b', '##c', '##ct', '##cti', '##ctio', '##ction', '##d', '##e', '##f', '##fu', '##ful', '##full', '##fully', '##g', '##h', '##hm', '##i', '##ithms', '##iz', '##izat', '##izati', '##k', '##l', '##lg', '##ll', '##m', '##n', '##niz', '##nizati', '##nizatio', '##nization', '##ns', '##o', '##p', '##r', '##ra', '##ral', '##rat', '##rithms', '##rsta', '##rstan', '##rstand', '##s', '##sta', '##t', '##ta', '##thm', '##thms', '##u', '##ur', '##urs', '##ut', '##v', '##w', '##ws', '##y', '##z', '##za', '##zat', 'C', 'Co', 'Cours', 'Course', 'F', 'Fa', 'Fac', 'Face', 'H', 'Ho', 'Hop', 'Hope', 'Hopefully', 'Hu', 'Hug', 'Hugg', 'Huggi', 'Huggin', 'Hugging', 'T', 'Th', 'Thi', 'This', 'a', 'ab', 'abl', 'able', 'ableabab', 'abo', 'about', 'alg', 'algo', 'algorithms', 'an', 'and', 'ar', 'are', 'b', 'be', 'c', 'ch', 'cha', 'chap', 'chapt', 'chapte', 'chapter', 'g', 'ge', 'gen', 'gene', 'generat', 'generate', 'h', 'ho', 'how', 'i', 'is', 's

In [17]:
print(vocab)

{'[CLS]': 0, '[PAD]': 1, '[UNK]': 2, '': 3, '##a': 4, '##ab': 5, '##abab': 6, '##at': 7, '##b': 8, '##c': 9, '##ct': 10, '##cti': 11, '##ctio': 12, '##ction': 13, '##d': 14, '##e': 15, '##f': 16, '##fu': 17, '##ful': 18, '##full': 19, '##fully': 20, '##g': 21, '##h': 22, '##hm': 23, '##i': 24, '##ithms': 25, '##iz': 26, '##izat': 27, '##izati': 28, '##k': 29, '##l': 30, '##lg': 31, '##ll': 32, '##m': 33, '##n': 34, '##niz': 35, '##nizati': 36, '##nizatio': 37, '##nization': 38, '##ns': 39, '##o': 40, '##p': 41, '##r': 42, '##ra': 43, '##ral': 44, '##rat': 45, '##rithms': 46, '##rsta': 47, '##rstan': 48, '##rstand': 49, '##s': 50, '##sta': 51, '##t': 52, '##ta': 53, '##thm': 54, '##thms': 55, '##u': 56, '##ur': 57, '##urs': 58, '##ut': 59, '##v': 60, '##w': 61, '##ws': 62, '##y': 63, '##z': 64, '##za': 65, '##zat': 66, 'C': 67, 'Co': 68, 'Cours': 69, 'Course': 70, 'F': 71, 'Fa': 72, 'Fac': 73, 'Face': 74, 'H': 75, 'Ho': 76, 'Hop': 77, 'Hope': 78, 'Hopefully': 79, 'Hu': 80, 'Hug': 81, 'H

### Creating a Function to Tokenize Words 


In [18]:
def tokenize_word(word):
    tokens = []

    # Iterating over the entire word starting from the end
    while len(word) > 0:
        i = len(word)
        # Trying to find the bigest sub-word that exists on our vocabulary
        while (i > 0) and word[:i] not in vocab_:
            i -= 1

        # If a sub-word does not exist on the vocabulary
        if i == 0:
            # Keeping some information about the word
            tokens.append("[UNK]")
            return tokens
        
        # The first sub-word is not going to contain `##`
        tokens.append(word[:i])
        word = word[i:]

        # All the other sub-words are going to contain `##`
        if len(word) > 0:
            word = f"##{word}"
    
    return tokens

In [19]:
tokenize_word("Hugging")

['Hugging']

In [20]:
tokenize_word("Huggi0ng")

['Huggi', '[UNK]']

### Creating a Function to Tokenize Text

In [21]:
def tokenize_text(text):
    return [token for word in cleaning(text).split() for token in tokenize_word(word)]

In [22]:
print(tokenize_text("This is the Hugging Face course!"))

['This', 'is', 'the', 'Hugging', 'Face', 'c', '##o', '##urs', '##e']


## Creating a Class that Contains everything we Discussed

In [23]:
from tqdm import tqdm

In [30]:
from collections import defaultdict
from tqdm import tqdm
import json


class WordPiece:
    def __init__(self, corpus=None, ntokens=30_000, cleaning=lambda text: text):
        if corpus is not None:
            # Cleaning Corpus
            corpus = [cleaning(text) for text in corpus]

            # Calculating the frequencies of each word (global statistics)
            self._word_freqs = defaultdict(lambda : 0)
            for text in corpus:
                for word in text.split():
                    self._word_freqs[word] += 1

        self._cleaning = cleaning
        self._ntokens = ntokens
        self.special_t = ["[CLS]", "[UNK]", "[PAD]", "[SEP]"]
        self.vocab_l = []
        self.vocab_d = {}
        self.ivocab_d = {}
        

    def __calc_pair_scores(self, splits):
        token_freqs = defaultdict(lambda: 0)  # Capturing the global corpus statistics for `freq of firsy element` and `freq of second element`
        pair_freqs = defaultdict(lambda: 0)   # Capturing the statistics for `freq of pair`: How many times this pair appears in the corpus

        # Iterate over all words of the corpus
        for word, freq in self._word_freqs.items():
            split = splits[word]

            # If a word contains only 1 letter
            if len(split) == 1:
                token_freqs[split[0]] += freq
                continue

            for i in range(len(split) - 1):
                token_freqs[split[i]] += freq
                pair_freqs[(split[i], split[i+1])] += freq

            # Adding the final token that the for-loop is not processing
            token_freqs[split[-1]] += freq

        # Returning the scores, calculated from the formula above
        return {pair: pair_freq / (token_freqs[pair[0]] * token_freqs[pair[1]]) for pair, pair_freq in pair_freqs.items()}


    @staticmethod
    def __highest_score(pair_scores):
        max_pair = ('', '')
        max_freq = 0
        for pair, freq in pair_scores.items():
            if freq > max_freq:
                max_freq = freq
                max_pair = pair
        
        return max_pair, max_freq
    
    @staticmethod
    def __merge_pair(pair_tuple, splits):
        # Iterating over the word-tokens defaultdict
        for word in splits.keys():
            split = splits[word] # contains the tokens of the word

            if len(split) == 1:
                continue
            
            # Iterating until we find the pair in the tokenize representation of the word
            i = 0
            while i < len(split) - 1:
                if (split[i] == pair_tuple[0]) and (split[i+1] == pair_tuple[1]):
                    merge = pair_tuple[0] + pair_tuple[1][2:] if pair_tuple[1].startswith("##") else pair_tuple[0] + pair_tuple[1]
                    split = split[:i] + [merge] + split[i+2:]
                else:
                    i += 1

            splits[word] = split
        return splits
    

    def fit(self):
        # Original splits (character tokenization of each word in the corpus)
        splits = {word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)] for word in self._word_freqs.keys()}

        # Creating the basic vocabulary
        vocab_ = set([token for tokens in splits.values() for token in tokens])
        
        new_tokens = []
        for _ in tqdm(range(len(vocab_), self._ntokens), desc="Creating Vocabulary: "):
            ps = self.__calc_pair_scores(splits)   # Calculate the Score of each pair
            bs, _ = self.__highest_score(ps)       # Get the pair with the highest score
            splits = self.__merge_pair(bs, splits) # Merge those two tokens

            if bs[1].startswith("##"):
                new_tokens.append(bs[0] + bs[1][2:])
                continue
            new_tokens.append(bs[0] + bs[1])

        # Adding to the the basic vocabulary the new tokens
        self.vocab_l = self.special_t + sorted(list((vocab_ | set(new_tokens))))
        self.vocab_d = {term: i for i, term in enumerate(self.vocab_l)}
        self.ivocab_d = {i: term for i, term in enumerate(self.vocab_l)}

    
    def save_vocab(self, path):
        # Saving the Vocabulary Dict into a JSON file
        with open(path, "w") as f:
            json.dump(self.vocab_d, f)

    def load_vocab(self, path):
        # Updating the Vocabulary elements from the JSON file
        with open(path, "r") as f:
            self.vocab_d = json.loads(f.read())
        self.vocab_l = list(self.vocab_d.keys())
        self.ivocab_d = {i: token for i, token in enumerate(self.vocab_l)}


    def __tokenize_word(self, word):
        tokens = []

        # Iterating over the entire word starting from the end
        while len(word) > 0:
            i = len(word)
            # Trying to find the bigest sub-word that exists on our vocabulary
            while (i > 0) and word[:i] not in self.vocab_l:
                i -= 1

            # If a sub-word does not exist on the vocabulary
            if i == 0:
                tokens.append("[UNK]")
                return tokens          # keeping some information about the word
            
            # The first sub-word is not going to contain `##`
            tokens.append(word[:i])
            word = word[i:]

            # All the other sub-words are going to contain `##`
            if len(word) > 0:
                word = f"##{word}"

        return tokens
    
    def __decode_word(self, idx):
        to_tokens = [self.ivocab_d[i] for i in idx]
        return to_tokens[0] + ''.join([token[2:] if (token not in self.special_t) else token for token in to_tokens[1:]])
    

    def tokenize(self, text, _npad=None):
        t_text = []
        for word in self._cleaning(text).split():
            for token in self.__tokenize_word(word):
                t_text.append(token)
            t_text.append("[SEP]")

        if _npad is not None:
            for _ in range(_npad - len(t_text)):
                t_text.append("[PAD]")

        return t_text

    def encode(self, text, _npad=None):
        return [self.vocab_d[token] for token in self.tokenize(text, _npad=_npad)]

    def decode(self, idx):
        text = ""
        i, j = 0, 0
        while i < len(idx) and idx[i] != self.vocab_d["[PAD]"]:
            if idx[i] == self.vocab_d["[SEP]"]:
                text += self.__decode_word(idx[j: i]) + " "
                j = i + 1
            i += 1

        return text

In [31]:
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be ableabab to understand how they are trained and generate tokens.",
]

In [32]:
tokenizer = WordPiece(corpus=corpus, _ntokens=70)

In [33]:
tokenizer.fit()

100%|██████████| 30/30 [00:00<00:00, 6314.19it/s]


In [35]:
print(tokenizer.tokenize("Hugging"))
print(tokenizer.encode("Hugging"))
print(tokenizer.decode([52, 22, 26, 19, 3]))

['Hugg', '##i', '##n', '##g', '[SEP]']
[52, 22, 26, 19, 3]
Hugging 


In [37]:
print(tokenizer.tokenize("Huggi0ng"))
print(tokenizer.encode("Huggi0ng"))
print(tokenizer.decode([52, 22, 1, 3]))

['Hugg', '##i', '[UNK]', '[SEP]']
[52, 22, 1, 3]
Huggi[UNK] 


In [38]:
print(tokenizer.tokenize("This is the Hugging Face course!"))
print(tokenizer.encode("This is the Hugging Face course!"))
print(tokenizer.decode([54, 22, 30, 3, 66, 3, 70, 13, 3, 52, 22, 26, 19, 3, 48, 13, 3, 58, 27, 36, 29, 30, 13, 1, 3]))

['Th', '##i', '##s', '[SEP]', 'is', '[SEP]', 'th', '##e', '[SEP]', 'Hugg', '##i', '##n', '##g', '[SEP]', 'Fac', '##e', '[SEP]', 'c', '##o', '##u', '##r', '##s', '##e', '[UNK]', '[SEP]']
[54, 22, 30, 3, 66, 3, 70, 13, 3, 52, 22, 26, 19, 3, 48, 13, 3, 58, 27, 36, 29, 30, 13, 1, 3]
This is the Hugging Face course[UNK] 


In [39]:
print(tokenizer.tokenize("This is the Hugging Face course!", _npad=100))
print(tokenizer.encode("This is the Hugging Face course!", _npad=100))
print(tokenizer.decode([54, 22, 30, 3, 66, 3, 70, 13, 3, 52, 22, 26, 19, 3, 48, 13, 3, 58, 27, 36, 29, 30, 13, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))

['Th', '##i', '##s', '[SEP]', 'is', '[SEP]', 'th', '##e', '[SEP]', 'Hugg', '##i', '##n', '##g', '[SEP]', 'Fac', '##e', '[SEP]', 'c', '##o', '##u', '##r', '##s', '##e', '[UNK]', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
[54, 22, 30, 3, 66, 3, 70, 13, 3, 52, 22, 26, 19, 3, 48, 13, 3, 58, 27, 36, 29, 30, 13, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 