In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2", add_prefix_space=True)
tokenizer("Hello world")["input_ids"]

In [None]:
tokenizer("Hello world")

In [None]:
tokenizer("Hello worldly beings")

In [None]:
tokenizer("Hello otherworldly beings")

In [None]:
type(tokenizer)

In [None]:
tokenizer.decode([15496, 995])

In [None]:
for text in ('Hello world', 'Hello worldly beings', 'Hello otherworldly beings'):
    print(tokenizer.decode(tokenizer(text)['input_ids']))

In [None]:
for text in ('Hello world', 'Hello worldly beings', 'Hello otherworldly beings'):
    print(tokenizer.encode(text))

In [None]:
for text in ('Hello world', 'Hello worldly beings', 'Hello otherworldly beings'):
    print([tokenizer.decode(id) for id in tokenizer(text, is_split_into_words=True)['input_ids']])

In [None]:
for text in ('Hello world', 'Hello worldly beings', 'Hello otherworldly beings'):
    print(tokenizer.convert_ids_to_tokens(tokenizer(text)['input_ids']))

In [None]:
tokenizer.get_added_vocab()

In [None]:
for text in ('Hello world', 'Hello worldly beings', 'Hello otherworldly beings'):
    print(tokenizer.tokenize(text))

In [None]:
def clean_tokenized_text(tokenized_text):
    """Render a list of sub-word tokens as a single readable string.

    A token that starts with 'Ġ' has every 'Ġ' replaced by a space; any other
    token is prefixed with '#' so that the split point stays visible in the
    joined output.
    """
    rendered = []
    for piece in tokenized_text:
        if piece.startswith('Ġ'):
            rendered.append(piece.replace('Ġ', ' '))
        else:
            rendered.append('#' + piece)
    return ''.join(rendered)

# Pre-trained tokenizer on sample text

In [None]:
import re

# Read the sample text; the encoding is stated explicitly so that the result
# does not depend on the platform's default encoding.
with open('28_sample_en_text.txt', encoding='utf-8') as f:
    text = f.read()

# Mock the format of the bibles
text = text.replace(',', ' ,').replace(';', ' ;').replace('(', '( ').replace(')', ' )').replace("'s", " ' s")

# Raw strings: in a plain literal '\S' is an invalid escape sequence, which
# newer Python versions warn about. The pattern itself is unchanged: put a
# space after any comma that is directly followed by a non-space character.
text = re.sub(
    pattern=r'(,)(\S)',
    repl=r', \2',
    string=text
)

print(clean_tokenized_text(tokenizer.tokenize(text)))

It's clear that GPT was trained with a high number of merges, because there are barely any words that get split. Still, there are some; here's an example:

In [None]:
tokenizer.tokenize('debutant')

I bet that if I train a BPE encoder with the bible with a low number of merges, there will be many more splits. The question is how much time it would take to train with the maximum number of merges.

# Train a tokenizer on a (fragment of a) bible

In [None]:
from word_splitting import train_tokenizer

In [None]:
mock_verses = [(el + ' .').split() for el in text.split('.')]

In [None]:
n_merges = 972

In [None]:
my_tokenizer = train_tokenizer(mock_verses, len(set(text)) + n_merges)

In [None]:
' '.join(my_tokenizer.encode(text).tokens)

This is a pretty good result, although there are some unexpected splits. But maybe they would have been merged at a later stage.

Note that, after 450 merges, "debutant" is split into "de butant", which is different from the pre-trained tokenizer above. But, to be fair, the training data is vastly different (in quality and in quantity).

# Retrieve the training history, i.e., the merge steps

In [None]:
my_tokenizer.model.save('WordSplitting/output', f'bpe_model_{n_merges}')

This saves the final vocabulary (after all merges) and the list of merges in the order in which they were learned. This is almost exactly what we want. There are two items left to figure out:

1. How many steps do we need to run in order to complete all the merges? Or, put another way, how can we check if we have reached all merges?

2. What is the exact format that we need for the calculations that come afterwards? I need to check my old code for word-pasting and word-splitting.

# Completing all the merges

In [None]:
from word_splitting import encode_verses

In [None]:
encoded_verse_tokens = encode_verses(mock_verses, my_tokenizer)

In [None]:
assert len(encoded_verse_tokens) == len(mock_verses)

In [None]:
for verse_ix, verse_tokens in enumerate(encoded_verse_tokens):
    for token_ix, token in enumerate(verse_tokens):
        if mock_verses[verse_ix][token_ix] != token:
            print('Different', mock_verses[verse_ix][token_ix], token)
            break

In [None]:
from word_splitting import has_completed_merges

In [None]:
has_completed_merges(mock_verses, my_tokenizer)

## Checking this for a book of the bible

In [None]:
from compression_entropy import read_selected_verses

# Vocabulary size handed to train_tokenizer below.
vocab_size = 10000

# NOTE(review): absolute, machine-specific path — this cell only runs where the
# corpus is stored at exactly this location.
filename = "/Users/Moste007/Documents/paralleltext/bibles/corpus/eng-x-bible-standard.txt"
lowercase = True
# Only book 40 is requested (presumably a book number from the corpus' verse ids — TODO confirm).
chosen_books = [40]
truncate_books = False
# The second return value is not used in this notebook.
id_verses, _ = read_selected_verses(filename,
                                                              lowercase,
                                                              chosen_books,
                                                              truncate_books)
# Keep the verses stored under key 40 (presumably the requested book) and train on them.
verses = id_verses[40]
book_tokenizer = train_tokenizer(verses, vocab_size)

In [None]:
assert has_completed_merges(verses, book_tokenizer)

vocab_size = 10000

bible = eng-x-bible-standard

Merges are completed, and the algorithm was very fast

# Recovering a model from a saved file

In [None]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace

In [None]:
type(book_tokenizer.model)

In [None]:
book_tokenizer.model.save('WordSplitting/output', f'bpe_model_book')

In [None]:
from tokenizers.models import BPE

In [None]:
recovered_tokenizer = Tokenizer(BPE.from_file('WordSplitting/output/bpe_model_book-vocab.json',
 'WordSplitting/output/bpe_model_book-merges.txt'))
recovered_tokenizer.pre_tokenizer = Whitespace()

In [None]:
encode_verses([['i', 'unfinishedly', 'did', 'this']], book_tokenizer)

In [None]:
encode_verses([['i', 'unfinishedly', 'did', 'this']], recovered_tokenizer)

In [None]:
# Read the sample text in lower case. The encoding is explicit so the decoded
# text does not depend on the platform's default encoding.
with open('28_sample_en_text.txt', encoding='utf-8') as f:
    sample_en_text = f.read().lower()

In [None]:
sample_en_verse_tokens = [line.split() for line in sample_en_text.split('\n')]

In [None]:
orig_encoded = encode_verses(sample_en_verse_tokens, book_tokenizer)
recovered_encoded = encode_verses(sample_en_verse_tokens, recovered_tokenizer)

In [None]:
assert orig_encoded == recovered_encoded

# Reconstructing the .json

I want to have the tokenizer available at various intermediate steps, to calculate the entropies for different numbers of splits. Probably the easiest way to do this is to drop the tokenizer altogether after we've completed all the merges, using the list of merges provided by the BPE algorithm. Let's try to write this function.

In [None]:
def get_merge_steps(merge_list_file: str) -> list:
    """Read the ordered list of merges from a saved merges file.

    The first line of the file must be a header starting with '#'. Every
    following line must consist of exactly two symbols separated by a single
    space.

    Args:
        merge_list_file: path of the merges file to read.

    Returns:
        A list of ``[left, right]`` pairs in file order. The list is empty when
        the file holds only the header line.

    Raises:
        AssertionError: if the header line is missing or is followed by a
            second line starting with '#'.
        ValueError: if a merge line does not hold exactly two clean symbols.
    """
    # Merge symbols can be arbitrary Unicode (e.g. for the Chinese bible), so
    # decode explicitly instead of relying on the platform default encoding.
    with open(merge_list_file, encoding='utf-8') as f:
        lines = f.readlines()
    assert lines and lines[0].startswith('#'), 'missing "#" header line'
    # A file with only the header is valid (no merges), so the second line is
    # checked only when it exists.
    assert len(lines) < 2 or not lines[1].startswith('#'), 'unexpected second header line'
    merge_steps = [line.strip().split(' ') for line in lines[1:]]
    for i, step in enumerate(merge_steps):
        is_clean_pair = (
            len(step) == 2
            and step[0] == step[0].strip()
            and step[1] == step[1].strip()
        )
        if not is_clean_pair:
            # Put the diagnostics in the exception instead of printing them.
            raise ValueError(f'malformed merge at index {i}: {step!r}')
    return merge_steps

def split_chars(verse_tokens: list) -> list:
    """Turn every token of every verse into the list of its characters.

    The nesting is verse -> token -> characters, i.e. one level deeper than
    the input.
    """
    char_level = []
    for tokens in verse_tokens:
        char_level.append([list(token) for token in tokens])
    return char_level

def apply_merge(verse_token_parts: list, merge_step: list):
    """Apply a single merge to every token, rewriting the verses in place.

    Each token is a list of parts. Scanning left to right, every adjacent pair
    equal to ``(merge_step[0], merge_step[1])`` is replaced by its
    concatenation, and the scan resumes after the merged pair, so overlapping
    matches are resolved leftmost-first.

    Args:
        verse_token_parts: verses -> tokens -> parts; the token lists inside
            each verse are replaced in place.
        merge_step: indexable holding the left and right part to merge.

    Returns:
        The same ``verse_token_parts`` object that was passed in.
    """
    for verse in verse_token_parts:
        for token_ix, pieces in enumerate(verse):
            last_ix = len(pieces) - 1
            merged = []
            pos = 0
            while pos <= last_ix:
                # merge_step is only indexed when a right-hand neighbour exists.
                is_pair = (
                    pos < last_ix
                    and pieces[pos] == merge_step[0]
                    and pieces[pos + 1] == merge_step[1]
                )
                if is_pair:
                    merged.append(pieces[pos] + pieces[pos + 1])
                    pos += 2
                else:
                    merged.append(pieces[pos])
                    pos += 1
            verse[token_ix] = merged
    return verse_token_parts

def encode_from_list(merge_list_file: str, n_merges: int, orig_verse_tokens: list) -> list:
    """Re-encode verses by replaying the first ``n_merges`` merges of a merges file.

    The verses are first split into characters; then the merges read from
    ``merge_list_file`` are applied one after the other, in file order.

    Args:
        merge_list_file: path of the merges file to replay.
        n_merges: how many merges to apply; must not exceed the number of
            merges in the file.
        orig_verse_tokens: verses, each a list of token strings.

    Returns:
        verses -> tokens -> list of parts after the requested merges.
    """
    all_steps = get_merge_steps(merge_list_file)
    available = len(all_steps)
    assert n_merges <= available, (n_merges, available)
    encoded = split_chars(orig_verse_tokens)
    for step_ix in range(n_merges):
        encoded = apply_merge(encoded, all_steps[step_ix])
    return encoded

In [None]:
' '.join([' '.join(parts) for parts in encode_from_list('WordSplitting/output/bpe_model_book-merges.txt', 1000, sample_en_verse_tokens)[0]])

In [None]:
# Vocabulary size = 1000 (the number of merges replayed by hand above) plus the
# number of distinct characters in sample_en_text.
# NOTE(review): the characters are counted on the sample text, while the
# tokenizer below is trained on the bible book, so the number of merges
# actually learned is presumably not exactly 1000 — confirm against train_tokenizer.
vocab_size = 1000 + len(set(list(sample_en_text)))

# NOTE(review): absolute, machine-specific path.
filename = "/Users/Moste007/Documents/paralleltext/bibles/corpus/eng-x-bible-standard.txt"
lowercase = True
chosen_books = [40]
truncate_books = False
# The second return value is not used in this notebook.
id_verses, _ = read_selected_verses(filename,
                                                              lowercase,
                                                              chosen_books,
                                                              truncate_books)
verses = id_verses[40]
# This rebinds book_tokenizer, replacing the tokenizer trained earlier in the notebook.
book_tokenizer = train_tokenizer(verses, vocab_size)

In [None]:
' '.join(encode_verses(sample_en_verse_tokens, book_tokenizer)[0])

This matches the merges I did "by hand". However, the encoder version also drops non-letter characters. This should be avoided, as we want to include all characters. It affects the training too, so we need to fix that.

## Keeping the BPE tokenizer from removing non-letter characters and capital letters

In [None]:
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.trainers import BpeTrainer

In [None]:
def train_tokenizer_whitespace(verses: list, vocab_size: int) -> Tokenizer:
    """Train a BPE tokenizer that uses the ``WhitespaceSplit`` pre-tokenizer.

    Args:
        verses: verses, each a list of token strings; every verse is joined
            with single spaces to form one training line.
        vocab_size: vocabulary size passed to the ``BpeTrainer``.

    Returns:
        The trained ``Tokenizer``.
    """
    training_lines = [' '.join(verse) for verse in verses]
    bpe_tokenizer = Tokenizer(BPE())
    bpe_tokenizer.pre_tokenizer = WhitespaceSplit()
    bpe_tokenizer.train_from_iterator(training_lines, BpeTrainer(vocab_size=vocab_size))
    return bpe_tokenizer

In [None]:
book_tokenizer_whitespace = train_tokenizer_whitespace(verses, 100)

In [None]:
' '.join(encode_verses([sample_en_text.split('\n')[0].split(' ')], book_tokenizer_whitespace)[0])

In [None]:
book_tokenizer = train_tokenizer_whitespace(verses, 100)

In [None]:
' '.join(encode_verses([sample_en_text.split('\n')[0].split(' ')], book_tokenizer)[0])

So, the issue does not seem to be that the encoder ignores certain characters, but that out-of-vocabulary characters are removed. This is not a problem for me, so I can ignore it.

# Check the match between my reconstructed BPE and the original

If the above steps are done correctly, then training an encoder for 100 merges should give the same result as using my merger.

In [None]:
# Vocabulary size = 100 merges plus the number of distinct characters in
# sample_en_text.
# NOTE(review): the characters are counted on the sample text, while the
# tokenizer below is trained on the bible book, so the trained tokenizer
# presumably does not perform exactly 100 merges — confirm against train_tokenizer.
vocab_size = 100 + len(set(list(sample_en_text)))

# NOTE(review): absolute, machine-specific path.
filename = "/Users/Moste007/Documents/paralleltext/bibles/corpus/eng-x-bible-standard.txt"
lowercase = True
chosen_books = [40]
truncate_books = False
# The second return value is not used in this notebook.
id_verses, _ = read_selected_verses(filename,
                                                              lowercase,
                                                              chosen_books,
                                                              truncate_books)
verses = id_verses[40]
# This rebinds book_tokenizer, replacing the tokenizer trained earlier in the notebook.
book_tokenizer = train_tokenizer(verses, vocab_size)

In [None]:
tokenizer_reconstructed_verses = [' '.join(encoded_verse_tokens) for encoded_verse_tokens in encode_verses(verses, book_tokenizer)]

In [None]:
book_tokenizer.model.save('WordSplitting/output', f'bpe_model_book_100')

In [None]:
hand_reconstructed_verses = [' '.join([' '.join(token) for token in verse]) for verse in encode_from_list('WordSplitting/output/bpe_model_book_100-merges.txt', 100, verses)]

In [None]:
len(tokenizer_reconstructed_verses) == len(hand_reconstructed_verses)

In [None]:
for i in range(len(tokenizer_reconstructed_verses)):
    if tokenizer_reconstructed_verses[i] != hand_reconstructed_verses[i]:
        print(i)
        break

In [None]:
tokenizer_reconstructed_verses[6]

In [None]:
hand_reconstructed_verses[6]

The merge (h,o) -> ho has occurred in the original tokenizer, but not in the reconstructed one. Still, by looking at the file, I can see that it was about to take place, so this is a minor error and it can be ignored. And, if anything, I trust my hand reconstruction more.

Two more checks: a Chinese bible, and a longer reconstruction history.

## Chinese bible

In [None]:
# Number of merges we intend the tokenizer to learn.
N_MERGES = 100

# NOTE(review): absolute, machine-specific path.
filename = "/Users/Moste007/Documents/paralleltext/bibles/corpus/zho-x-bible-contemp.txt"

# Decode explicitly: this corpus is non-ASCII text (presumably UTF-8 — confirm),
# and the platform default encoding is not UTF-8 everywhere.
with open(filename, encoding='utf-8') as f:
    file_text = f.read()

lowercase = True

if lowercase:
    file_text = file_text.lower()

# Vocabulary size = requested merges + number of distinct characters in the file.
# NOTE(review): the characters are counted over the whole file, while training
# below uses only book 40, so the number of merges actually learned may differ
# from N_MERGES — confirm against train_tokenizer.
vocab_size = N_MERGES + len(set(file_text))
chosen_books = [40]
truncate_books = False
# The second return value is not used in this notebook.
id_verses, _ = read_selected_verses(filename,
                                    lowercase,
                                    chosen_books,
                                    truncate_books)
verses = id_verses[40]
book_tokenizer = train_tokenizer(verses, vocab_size)

In [None]:
tokenizer_reconstructed_verses = [' '.join(encoded_verse_tokens) for encoded_verse_tokens in encode_verses(verses, book_tokenizer)]

In [None]:
book_tokenizer.model.save('WordSplitting/output', f'bpe_model_book_{N_MERGES}_zho')

In [None]:
hand_reconstructed_verses = [' '.join([' '.join(token) for token in verse]) for verse in encode_from_list(f'WordSplitting/output/bpe_model_book_{N_MERGES}_zho-merges.txt', 
                                                                                                          N_MERGES, verses)]

In [None]:
len(tokenizer_reconstructed_verses) == len(hand_reconstructed_verses)

In [None]:
hand_reconstructed_verses

In [None]:
' '.join(verses[0])

## Longer reconstruction history

My code is a bit slow. What would happen if I wanted to reconstruct an entire history?

In [None]:
# Number of merges we intend the tokenizer to learn.
N_MERGES = 10000

# NOTE(review): absolute, machine-specific path.
filename = "/Users/Moste007/Documents/paralleltext/bibles/corpus/eng-x-bible-standard.txt"

# Decode explicitly (presumably UTF-8 — confirm) so that the character count
# below does not depend on the platform's default encoding.
with open(filename, encoding='utf-8') as f:
    file_text = f.read()

lowercase = True

if lowercase:
    file_text = file_text.lower()

# Vocabulary size = requested merges + number of distinct characters in the file.
# NOTE(review): the characters are counted over the whole file, while training
# below uses only book 40, so the number of merges actually learned may differ
# from N_MERGES — confirm against train_tokenizer.
vocab_size = N_MERGES + len(set(file_text))
chosen_books = [40]
truncate_books = False
# The second return value is not used in this notebook.
id_verses, _ = read_selected_verses(filename,
                                    lowercase,
                                    chosen_books,
                                    truncate_books)
verses = id_verses[40]
book_tokenizer = train_tokenizer(verses, vocab_size)

In [None]:
tokenizer_reconstructed_verses = [' '.join(encoded_verse_tokens) for encoded_verse_tokens in encode_verses(verses, book_tokenizer)]

In [None]:
book_tokenizer.model.save('WordSplitting/output', f'bpe_model_book_{N_MERGES}')

In [None]:
with open(f'WordSplitting/output/bpe_model_book_{N_MERGES}-merges.txt') as f:
    total_merges = len(f.readlines()) - 1

In [None]:
hand_reconstructed_verses = [' '.join([' '.join(token) for token in verse]) for verse in encode_from_list(f'WordSplitting/output/bpe_model_book_{N_MERGES}-merges.txt', 
                                                                                                          total_merges, verses)]

In [None]:
len(tokenizer_reconstructed_verses) == len(hand_reconstructed_verses)

Still, this only took a few seconds, which is reasonable.

# Retrieving the merges directly from the model

This way I would not have to save to a file and read it again

In [None]:
book_tokenizer.model.get_trainer()

# Do a whole round manually for testing purposes

In [None]:
from collections import defaultdict

In [None]:
verses = [['ТІаккха', 'Везачу', 'Дала', 'зудчуьнга', 'элира', ':', '«', 'И', 'хІун', 'ду', 'ахь', 'динарг', '?', '»', 'Зудчо', 'жоп', 'делира', 'Цуьнга', ':', '«', 'Лаьхьано', ',', 'хІилла', 'а', 'дина', ',', 'Іехийра', 'со', ',', 'ткъа', 'аса', 'и', 'стоьмаш', 'диира', '»', ',', '—', 'аьлла', '.]'],
['ТІаккха', 'Везачу', 'Дала', 'лаьхьане', 'элира', ':', '«', 'И', 'вон', 'ахь', 'дарна', ',', 'хьуна', 'а', 'хир', 'ду', 'вон', '.', 'НеІалт', 'кхайкхадо', 'хьуна', 'массо', 'а', 'даьхнина', 'а', ',', 'ерриге', 'а', 'аренан', 'акхарошна', 'а', 'хьалха', '.', 'ХІинца', 'дуьйна', 'хьо', 'баллалц', 'текхар', 'бу', 'хьо', 'гай', 'тІехь', ',', 'чан', 'а', 'юуш', '.']]

In [None]:
verse_parts = [[list(token) for token in verse] for verse in verses]

In [None]:
current_values = {}

In [None]:
def get_frequencies(seq_token_sub_tokens: list) -> dict:
    """Count how often each pair of adjacent sub-tokens occurs.

    Pairs are only formed inside a token, never across token boundaries.

    Args:
        seq_token_sub_tokens: sequences -> tokens -> list of sub-tokens.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` to its count, with
        keys in order of first occurrence.
    """
    pair_counts = defaultdict(int)
    for seq in seq_token_sub_tokens:
        for token in seq:
            # Pair every sub-token with its right-hand neighbour.
            for left, right in zip(token, token[1:]):
                pair_counts[(left, right)] += 1
    return pair_counts

In [None]:
def get_most_frequent_pair(pair_counts: dict) -> tuple:
    """Return the pair with the highest positive count.

    On ties, the pair that comes first in the dict's iteration order wins.
    When the dict is empty or no count is greater than zero, ``(None, None)``
    is returned.
    """
    # max() returns the first maximal item, which matches the tie rule above.
    best_pair, best_count = max(
        pair_counts.items(),
        key=lambda item: item[1],
        default=((None, None), 0),
    )
    if best_count > 0:
        return best_pair
    return (None, None)

In [None]:
def update_parts(seq_token_sub_tokens: list, next_merge: tuple) -> list:
    """Merge every occurrence of ``next_merge`` inside each token, in place.

    Tokens are scanned left to right; an adjacent pair equal to
    ``(next_merge[0], next_merge[1])`` becomes one concatenated sub-token and
    its right half is not considered again.

    Args:
        seq_token_sub_tokens: sequences -> tokens -> list of sub-tokens; the
            token lists inside each sequence are replaced in place.
        next_merge: the (left, right) pair to merge.

    Returns:
        The same ``seq_token_sub_tokens`` object that was passed in.
    """
    for seq in seq_token_sub_tokens:
        for token_ix, token in enumerate(seq):
            merged = []
            skip_next = False
            for pos, piece in enumerate(token):
                if skip_next:
                    # This piece was already consumed as the right half of a merge.
                    skip_next = False
                    continue
                has_neighbour = pos + 1 < len(token)
                if has_neighbour and piece == next_merge[0] and token[pos + 1] == next_merge[1]:
                    merged.append(piece + token[pos + 1])
                    skip_next = True
                else:
                    merged.append(piece)
            seq[token_ix] = merged
    return seq_token_sub_tokens

In [None]:
def has_completed_all_merges(seq_token_sub_tokens: list) -> bool:
    """Tell whether every token consists of exactly one sub-token.

    Returns True when there are no tokens at all. A token with zero sub-tokens
    counts as not completed.
    """
    # Collect the distinct token lengths; merging is complete when the only
    # length that occurs (if any) is 1.
    lengths = {len(token) for seq in seq_token_sub_tokens for token in seq}
    return lengths.issubset({1})

In [None]:
# Upper bound on the number of merge rounds; the loop normally stops earlier.
n_steps = 10000
merge_steps = []
for _step in range(n_steps):
    current_values = get_frequencies(verse_parts)
    next_merge = get_most_frequent_pair(current_values)
    if next_merge == (None, None):
        # No adjacent pair is left to merge. Without this guard a token with
        # zero parts would keep has_completed_all_merges() False forever, and
        # (None, None) would be recorded as a merge on every remaining round.
        break
    merge_steps.append(next_merge)
    verse_parts = update_parts(verse_parts, next_merge)
    if has_completed_all_merges(verse_parts):
        break

In [None]:
more_verses = [seq.split(' ') for seq in """Apama cellere dwon , oyido malo , doge tye apar wie aryo . Omalaika apar wie aryo tye i dogcel , i dogcel daŋ ocoo iye nyiŋ atekere apar wie aryo li jo Icrael .
Dogcel adek obedo tuŋ kide , dogcel adek obedo tuŋ anyarodi , dogcel adek obedo tuŋ anyarolum , dogcel adek obedo tuŋ to .""".split('\n')]

In [None]:
more_verse_parts = [[list(token) for token in verse] for verse in more_verses]

In [None]:
print(more_verses)

In [None]:
# Upper bound on the number of merge rounds; the loop normally stops earlier.
n_steps = 10000
more_merge_steps = []
for _step in range(n_steps):
    current_values = get_frequencies(more_verse_parts)
    next_merge = get_most_frequent_pair(current_values)
    if next_merge == (None, None):
        # No adjacent pair is left to merge. Without this guard an empty token
        # (e.g. produced by split(' ') on consecutive spaces) would keep
        # has_completed_all_merges() False forever, and (None, None) would be
        # recorded as a merge on every remaining round.
        break
    more_merge_steps.append(next_merge)
    more_verse_parts = update_parts(more_verse_parts, next_merge)
    if has_completed_all_merges(more_verse_parts):
        break

In [None]:
verses = [['ТІаккха', 'Везачу', 'Дала', 'зудчуьнга', 'элира', ':', '«', 'И', 'хІун', 'ду', 'ахь', 'динарг', '?', '»', 'Зудчо', 'жоп', 'делира', 'Цуьнга', ':', '«', 'Лаьхьано', ',', 'хІилла', 'а', 'дина', ',', 'Іехийра', 'со', ',', 'ткъа', 'аса', 'и', 'стоьмаш', 'диира', '»', ',', '—', 'аьлла', '.]'],
['ТІаккха', 'Везачу', 'Дала', 'лаьхьане', 'элира', ':', '«', 'И', 'вон', 'ахь', 'дарна', ',', 'хьуна', 'а', 'хир', 'ду', 'вон', '.', 'НеІалт', 'кхайкхадо', 'хьуна', 'массо', 'а', 'даьхнина', 'а', ',', 'ерриге', 'а', 'аренан', 'акхарошна', 'а', 'хьалха', '.', 'ХІинца', 'дуьйна', 'хьо', 'баллалц', 'текхар', 'бу', 'хьо', 'гай', 'тІехь', ',', 'чан', 'а', 'юуш', '.']]
# Start again from a fresh character-level split of the verses.
verse_parts = [[list(token) for token in verse] for verse in verses]
# Replay the first 68 recorded merges with apply_merge.
# NOTE(review): 68 is a magic number; this raises IndexError if merge_steps
# holds fewer than 68 entries — confirm the intended count.
for i in range(68):
    verse_parts = apply_merge(verse_parts, merge_steps[i])

In [None]:
print(verse_parts)

In [None]:
merge_steps[:10]