In [None]:
%pip install numpy numba tqdm pandas
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_02_02_bpe-0.1-py3-none-any.whl
import nats25_02_02_bpe

# Byte-Pair Encoding

In this assignment, your task is to implement the training of a byte-pair-encoding tokenizer yourself.

In [None]:
import gzip
import json
import re
import urllib.request  # a bare `import urllib` does not load the `request` submodule

import numpy as np
import pandas as pd
from numba import jit

try:
    from tqdm.notebook import tqdm  # optional progress bar
except ImportError:  # only a missing package should disable the progress bar
    tqdm = None

# Load the input data: download the gzipped JSON dump to a local file and parse it.
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/minecraft-articles.json.gz")
with gzip.open(file_path, "rt", encoding="utf-8") as articles_file:  # close the handle once parsed
    raw = json.load(articles_file)

# Parallel lists with one entry per record of the dump.
titles = [x["title"] for x in raw]
texts = [x["text"] for x in raw]
classes = [x["heuristic"] for x in raw]

## Join texts into a single sequence of bytes.

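Split all the provided texts (`title` and `text`) using the given whitespace pretokenizer. Encode the tokens as bytes with UTF-8. Store the result in `data` as a list of `bytes` objects (one per token); a later cell joins them with `\0` separators.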

In [None]:
# Pre-tokenizer: a match is either a single newline or optional leading whitespace
# followed by a run of non-whitespace characters.
pretokenizer = re.compile(r"\n|\s*\S+")

# One UTF-8 encoded `bytes` object per pre-token.
# NOTE(review): records are visited in file order, title before text; the assignment
# text does not fix an order, so confirm this against the hidden test.
data = [
    token.encode("utf-8")
    for title, text in zip(titles, texts)
    for part in (title, text)
    for token in pretokenizer.findall(part)
]

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives `data` as left by the
# previous cell. What it asserts is not visible in this notebook.
nats25_02_02_bpe.hidden_tests_4_0(data)

In [None]:
# From here on, `data` is one flat numpy array of int16 token ids: the byte values of
# all tokens, with a 0 separating consecutive tokens. int16 (not uint8) leaves room
# for token ids beyond the byte range.
if not isinstance(data, np.ndarray):  # keep the cell idempotent: never re-join an array
    data = np.frombuffer(b"\0".join(data), dtype=np.uint8).astype(np.int16)
print(data.shape)

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives the int16 array `data`
# created in the previous cell. What it asserts is not visible in this notebook.
nats25_02_02_bpe.hidden_tests_6_0(data)

## Write a function to find the most common pair of adjacent symbols in a sequence

While this will be the performance bottleneck of the implementation, you may use a `Counter` of pairs here.

In our experiments, a vectorized numpy solution was 60x faster.

- Return a pair of ints (we *will* exceed the byte range).
- Skip pairs that contain a 0 token; 0 is used as the separator.
- The second token must not be a space or newline ("pre-tokenization").
- When no pair occurs more than once, return `None`.

In [None]:
from collections import Counter
def find_most_frequent(seq):
    """Return the most frequent admissible pair of adjacent tokens in ``seq``.

    A pair ``(first, second)`` of neighbouring tokens is admissible when neither
    token is 0 (the separator) and ``second`` is not the space (32) or newline (10)
    byte. Overlapping occurrences are all counted.

    Args:
        seq: one-dimensional sequence of non-negative integer token ids
            (ids must be below 2**32 so that two of them fit into one int64 key).

    Returns:
        ``(first, second)`` as plain Python ints, or ``None`` when no admissible
        pair occurs at least twice. If several pairs share the highest count,
        the numerically smallest ``(first, second)`` is returned.
    """
    space, newline = ord(" "), ord("\n")
    tokens = np.asarray(seq, dtype=np.int64)  # widen so two ids can be packed into one key
    if tokens.size < 2:
        return None

    first, second = tokens[:-1], tokens[1:]
    admissible = (first != 0) & (second != 0) & (second != space) & (second != newline)
    if not admissible.any():
        return None

    # Pack each pair into a single integer so np.unique can count pairs in one pass.
    keys = (first[admissible] << 32) | second[admissible]
    unique_keys, counts = np.unique(keys, return_counts=True)
    best = int(counts.argmax())  # first maximum == smallest key among ties
    if counts[best] < 2:
        return None

    key = int(unique_keys[best])
    return key >> 32, key & 0xFFFFFFFF

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives the function
# `find_most_frequent` and the token array `data`. Its assertions are not visible here.
nats25_02_02_bpe.hidden_tests_9_0(find_most_frequent, data)

## Initialize the vocabulary

Our initial vocabulary contains all 256 bytes, so we can later still encode any character (or byte sequence) not in our training data.
The vocabulary is used for decoding, so it is a map from integer token ids to bytes.

In [None]:
def init_vocab():
    """Build the initial decoding vocabulary.

    Returns:
        A new dict mapping every token id 0..255 to the single-byte ``bytes``
        object with that value, so any byte sequence can be decoded even before
        merges are learned.
    """
    vocab = {token_id: bytes([token_id]) for token_id in range(256)}  # int to bytes
    return vocab

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives the function `init_vocab`.
# Its assertions are not visible in this notebook.
nats25_02_02_bpe.hidden_tests_12_0(init_vocab)

## Token replacement function

In the given sequence, replace each adjacent pair of tokens `(a, b)` with a new token `c`. Avoid copying; modify the sequence in place. You can use `numba.jit` to make this (much) faster.

Return the resulting array (or a view of it).

In [None]:
def replace(seq, a, b, c):
    """Replace every adjacent pair ``(a, b)`` in ``seq`` by the single token ``c``.

    Matching is greedy from left to right and non-overlapping, so for ``a == b``
    the run ``a a a`` becomes ``c a``. A numpy array is compacted in place and a
    view of its shortened prefix is returned; the caller's array is modified.
    Any other sequence type is first converted with ``np.asarray`` (which copies).

    Args:
        seq: one-dimensional numpy array of token ids.
        a: first token of the pair to replace.
        b: second token of the pair to replace.
        c: replacement token; must fit into the dtype of ``seq``.

    Returns:
        A view ``seq[:m]`` where ``m`` is the length after replacement.
    """
    seq = np.asarray(seq)
    length = seq.shape[0]
    if length < 2:
        return seq

    # Start index of every occurrence of the pair.
    hits = np.flatnonzero((seq[:-1] == a) & (seq[1:] == b))
    if hits.size == 0:
        return seq

    if a == b and hits.size > 1:
        # Only then can occurrences overlap (consecutive start indices). Within each
        # run of consecutive starts, a left-to-right scan takes every second one.
        order = np.arange(hits.size)
        run_begins = np.ones(hits.size, dtype=bool)
        run_begins[1:] = np.diff(hits) != 1
        run_origin = np.maximum.accumulate(np.where(run_begins, order, 0))
        hits = hits[(order - run_origin) % 2 == 0]

    seq[hits] = c
    keep = np.ones(length, dtype=bool)
    keep[hits + 1] = False  # drop the second token of each replaced pair
    new_length = length - hits.size
    seq[:new_length] = seq[keep]  # compact into the front of the same buffer
    return seq[:new_length]

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives the function `replace`.
# Its assertions are not visible in this notebook.
nats25_02_02_bpe.hidden_tests_15_0(replace)

## Train BPE

Implement a function to train a byte-pair encoding.

In [None]:
def train_bpe(indata, size=1000):
    """Learn byte-pair-encoding merges from a token sequence.

    Starting from the vocabulary returned by ``init_vocab``, repeatedly asks
    ``find_most_frequent`` for a pair, assigns it the next free id (256, 257, ...),
    and applies ``replace`` to a private copy of the data. Training stops when the
    requested size is reached or ``find_most_frequent`` returns ``None``.

    Args:
        indata: sequence of int token ids; it is copied and never modified.
        size: target vocabulary size including the 256 byte tokens.

    Returns:
        ``(vocab, merges)``: ``vocab`` maps token id to ``bytes``; ``merges`` is
        the list of merged ``(id1, id2)`` pairs, where entry ``i`` created token
        id ``256 + i``.

    Raises:
        ValueError: if ``size`` needs token ids that do not fit into int16.
    """
    id_limit = int(np.iinfo(np.int16).max) + 1
    if size > id_limit:
        raise ValueError(f"size must be at most {id_limit} for int16 token ids, got {size}")

    merges = list() # of tuples(id1, id2)
    vocab = init_vocab()
    data = np.array(indata, dtype=np.int16) # copy to allow modifications
    n_merges = max(size - 256, 0)
    pbar = tqdm(total=n_merges) if tqdm else None # optional

    try:
        for new_id in range(256, 256 + n_merges):
            pair = find_most_frequent(data)
            if pair is None:  # nothing repeats any more; stop early
                break
            first, second = pair
            vocab[new_id] = vocab[first] + vocab[second]
            merges.append((first, second))
            data = replace(data, first, second, new_id)  # returns the shortened view
            if pbar: pbar.update(1)
    finally:
        if pbar: pbar.close() # finish progressbar, also when a step raised

    # Ratio of token count after training to token count before; undefined for empty input.
    ratio = len(data) / len(indata) if len(indata) else float("nan")
    print("Compression factor:", ratio)
    return vocab, merges

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives the function `train_bpe`.
# Its assertions are not visible in this notebook.
nats25_02_02_bpe.hidden_tests_18_0(train_bpe)

## Train a tokenizer on our training data

Inspect the longest tokens generated.

In [None]:
%%time
# Train with a target vocabulary size of 1024. Begin with 512 while developing:
# at 1024 we get many more words as standalone tokens, but the runtime increases.
vocab, merges = train_bpe(data, 1024)

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives the trained `vocab` and
# `merges`. Its assertions are not visible in this notebook.
nats25_02_02_bpe.hidden_tests_21_0(vocab, merges)

## Tokenization function

Implement a function to tokenize a string given the learned merges (the vocabulary itself is only needed for decoding).

While not the most efficient, it is fine to implement this using `replace` above. To improve performance, call `replace` only when necessary.

In [None]:
def tokenize(merges, s):
    """Encode ``s`` into BPE token ids by applying ``merges`` in training order.

    The input is turned into its UTF-8 byte values and every merge ``i`` replaces
    its pair with token id ``256 + i``, so ``merges`` must be ordered the way
    training produced them. The string is not pre-tokenized here: merges are
    applied across the whole byte sequence.
    NOTE(review): confirm that the hidden tests do not expect pre-token boundaries.

    Args:
        merges: list of ``(id1, id2)`` pairs.
        s: text to encode (``str``); a bytes-like object is used as is.

    Returns:
        numpy array of int16 token ids.
    """
    encoded = s.encode("utf-8") if isinstance(s, str) else bytes(s)
    tokens = np.frombuffer(encoded, dtype=np.uint8).astype(np.int16) # np.array of int16 as above

    # Ids seen so far. It may go stale (contain ids that were merged away), which
    # only costs an unnecessary replace call, never a missed one.
    present = set(tokens.tolist())
    for rank, (first, second) in enumerate(merges):
        if first in present and second in present:
            new_id = 256 + rank
            length_before = len(tokens)
            tokens = replace(tokens, first, second, new_id)
            if len(tokens) != length_before:
                present.add(new_id)
    return tokens

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives `vocab`, `merges` and the
# function `tokenize`. Its assertions are not visible in this notebook.
nats25_02_02_bpe.hidden_tests_24_0(vocab, merges, tokenize)

## Decoding function

Implement a function to decode a token sequence into a regular string.

In [None]:
def decode(vocab, tokens):
    """Turn a sequence of token ids back into text.

    Args:
        vocab: mapping from int token id to the ``bytes`` it stands for.
        tokens: iterable of token ids (Python ints or numpy integers).

    Returns:
        The concatenated bytes decoded as UTF-8. Byte sequences that are not
        valid UTF-8 (for example a token list cut in the middle of a multi-byte
        character) are rendered as U+FFFD instead of raising.

    Raises:
        KeyError: if a token id is not in ``vocab``.
    """
    pieces = [vocab[int(token)] for token in tokens]
    s = b"".join(pieces).decode("utf-8", errors="replace")
    return s

In [None]:
# Grader call from the nats25_02_02_bpe package; it receives `vocab`, `merges` and the
# functions `tokenize` and `decode`. Its assertions are not visible in this notebook.
nats25_02_02_bpe.hidden_tests_27_0(vocab, merges, tokenize, decode)