In [3]:
%pip install numpy numba tqdm pandas
%pip install --no-cache-dir --force-reinstall https://dm.cs.tu-dortmund.de/nats/nats25_02_02_bpe-0.1-py3-none-any.whl
import nats25_02_02_bpe

Note: you may need to restart the kernel to use updated packages.
Collecting nats25-02-02-bpe==0.1
  Downloading https://dm.cs.tu-dortmund.de/nats/nats25_02_02_bpe-0.1-py3-none-any.whl (2.4 kB)
Installing collected packages: nats25-02-02-bpe
  Attempting uninstall: nats25-02-02-bpe
    Found existing installation: nats25_02_02_bpe 0.1
    Uninstalling nats25_02_02_bpe-0.1:
      Successfully uninstalled nats25_02_02_bpe-0.1
Successfully installed nats25-02-02-bpe-0.1
Note: you may need to restart the kernel to use updated packages.


# Byte-Pair Encoding

In this assignment, your task is to implement the training of a byte-pair-encoding tokenizer yourself.

In [4]:
import numpy as np, pandas as pd, re
from numba import jit
try: from tqdm.notebook import tqdm # optional
except: tqdm = None

# Load the input data
import gzip, json, urllib
file_path, _ = urllib.request.urlretrieve("https://dm.cs.tu-dortmund.de/nats/data/minecraft-articles.json.gz")
raw = json.load(gzip.open(file_path, "rt", encoding="utf-8"))
titles, texts, classes = [x["title"] for x in raw], [x["text"] for x in raw], [x["heuristic"] for x in raw]

## Join texts into a single sequence of bytes.

Split all the provided texts (`title` and `text`) using the given whitespace pretokenizer. Encode the tokens as bytes with UTF-8.

In [5]:
#in this we tokenized the titles and texts into one singular data array where each element is a byte represenation of each token 
pretokenizer=re.compile(r"\n|\s*\S+")
data = []
for title in titles:
    split_text = pretokenizer.findall(string=title)
    for elm in split_text:
        data.append(elm)

for text in texts: 
    split_text = pretokenizer.findall(string=text)
    for elm in split_text:
        data.append(elm)
for index in range(0,len(data)): 
    data[index] = data[index].encode(encoding="utf-8")
print(len(data))

714539


In [6]:
nats25_02_02_bpe.hidden_tests_4_0(data)

In [7]:
# In the following, we will use lists containing numpy arrays with int16
data = np.array([int(x) for x in b"\0".join(data)], dtype=np.int16)
print(data.shape)

(4649344,)


In [8]:
nats25_02_02_bpe.hidden_tests_6_0(data)

## Write a function to find the most common two symbols in a sequence

While this will be the performance bottleneck of the implementation, you may use a `Counter` of pairs here.

In our experiments, a vectorized numpy solution was 60x faster.

- Return a pair of ints (we *will* exceed the byte range).
- Skip 0 tokens used as separators
- The second token must not be a space or newline ("pre-tokenization")
- When no token occurs more than once, return None

In [9]:
from collections import Counter
def find_most_frequent(seq):
    if len(seq) < 2:
        return None

    # Create a 2D array of all adjacent pairs
    pairs = np.stack([seq[:-1], seq[1:]], axis=1)

    # Define a mask to filter out invalid pairs
    # 1. The first token cannot be the separator (0).
    # 2. The second token cannot be a space (32) or a newline (10).
    mask = (seq[:-1] != 0) & (seq[1:] != 32) & (seq[1:] != 10)

    # Apply the mask to get only valid pairs
    valid_pairs = pairs[mask]

    if valid_pairs.shape[0] == 0:
        return None

    # Find unique pairs and their counts
    unique_pairs, counts = np.unique(valid_pairs, axis=0, return_counts=True)

    # If no pair occurs more than once, return None
    max_count = counts.max()
    if max_count <= 1:
        return None

    # Find and return the most frequent pair
    most_frequent_pair = unique_pairs[np.argmax(counts)]
    return tuple(most_frequent_pair.astype(int))

In [10]:
nats25_02_02_bpe.hidden_tests_9_0(find_most_frequent, data)

AssertionError: Function should return a tuple of np.int16

## Initialize the vocabulary

Our initial vocabulary contains all 256 bytes, so we can later still encode any character (or byte sequence) not in our training data.
The vocabulary is used for decoding, so it is a map from integer token ids to bytes.

In [11]:
def init_vocab():
    vocab = dict() # int to bytes
    pass # Your solution here
    return vocab

In [12]:
nats25_02_02_bpe.hidden_tests_12_0(init_vocab)

AssertionError: Vocabulary does not have 256 entries

## Token replacement function

In the given sequence, replace tokens (a,b) with a new token c. Avoid copying, but modify the sequence in-place. You can use `numba.jit` to make this (much) faster.

Return the resulting array (-view).

In [13]:
def replace(seq, a, b, c):
    pass # Your solution here

In [14]:
nats25_02_02_bpe.hidden_tests_15_0(replace)

TypeError: object of type 'NoneType' has no len()

## Train BPE

Implement a function to train a byte-pair encoding.

In [15]:
def train_bpe(indata, size=1000):
    merges = list() # of tuples(id1, id2)
    vocab = init_vocab()
    data = np.array(indata, dtype=np.int16) # copy to allow modifications
    pbar = tqdm(total=size-256) if tqdm else None # optional

    pass # Your solution here

    if pbar: pbar.close() # finish progressbar
    print("Compression factor:", len(data) / len(indata))
    return vocab, merges

In [16]:
nats25_02_02_bpe.hidden_tests_18_0(train_bpe)

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

## Train a tokenizer on our training data

Inspect the longest tokens generated.

In [17]:
%%time
vocab, merges = train_bpe(data, 1024) # begin with 512 – at 1024, we get many more words as standalone tokens, but the runtime increases

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [18]:
nats25_02_02_bpe.hidden_tests_21_0(vocab, merges)

NameError: name 'vocab' is not defined

## Tokenization function

Implement a function to tokenize a string given the vocabulary and merges.

While not the most efficient, it is fine to implement this using `replace` above. To improve performance, call `replace` only when necessary.

In [None]:
def tokenize(merges, s):
    tokens = None # np.array of int16 as above
    pass # Your solution here
    return tokens

In [None]:
nats25_02_02_bpe.hidden_tests_24_0(vocab, merges, tokenize)

## Decoding function

Implement a function to decode a token sequence into a regular string.

In [None]:
def decode(vocab, tokens):
    s = None
    pass # Your solution here
    return s

In [None]:
nats25_02_02_bpe.hidden_tests_27_0(vocab, merges, tokenize, decode)