In [34]:
import datasets
import os

# Baseline: the "test" split of the "ag_news" dataset, loaded by name.
ag_news_dataset = datasets.load_dataset("ag_news", split="test")

# Variations of the test split that were previously saved to disk under
# "ag_news_variations". They are loaded in the order listed and bound to the
# matching names on the left-hand side.
(
    dataset_corrupted_letters,
    dataset_corrupted,
    dataset_danish,
    dataset_icelandic,
) = (
    datasets.load_from_disk(os.path.join("ag_news_variations", folder_name))
    for folder_name in (
        "ag_news_corrupted_letters_test",
        "ag_news_corrupted_test",
        "ag_news_translated_da_test",
        "ag_news_translated_is_test",
    )
)


In [35]:
from transformers import PreTrainedTokenizerFast

# Directory that holds the serialized tokenizer JSON files.
tokenizer_path = "tokenizers"

# The three tokenizers are built the same way and share the same special-token
# strings; only the JSON file they are loaded from differs.
char_tokenizer, byte_tokenizer, raw_byte_tokenizer = (
    PreTrainedTokenizerFast(
        tokenizer_file=os.path.join(tokenizer_path, json_file),
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
    )
    for json_file in (
        "char_tokenizer.json",
        "byte_tokenizer.json",
        "raw_byte_tokenizer.json",
    )
)



In [37]:
# tokenize dataset
def tokenize_dataset(dataset, tokenizer):
    """Tokenize the ``"text"`` field of ``dataset`` with ``tokenizer``.

    Args:
        dataset: Object exposing ``map(function, batched=...)`` whose examples
            carry a ``"text"`` field (e.g. a ``datasets.Dataset``).
        tokenizer: Callable applied to ``examples["text"]``. In batched mode it
            receives a list of strings per call.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.
    """

    def tokenize_function(examples):
        # In batched mode examples["text"] is a list of strings, so the
        # tokenizer encodes a whole batch per call instead of a single row.
        return tokenizer(examples["text"])

    # batched=True avoids one tokenizer call per example. No padding or
    # truncation is requested, so each row's encoding does not depend on the
    # other rows in its batch.
    return dataset.map(tokenize_function, batched=True)

# Report, for every (dataset, tokenizer) combination, the average number of
# token ids per example and the share of token ids equal to the tokenizer's
# unknown-token id.
# The loop variables stay bound to the (name, object) pairs so that the names
# left in the notebook namespace after the loop keep their original shape.
for dataset in [
    ("dataset_original", ag_news_dataset),
    ("dataset_corrupted_letters", dataset_corrupted_letters),
    ("dataset_corrupted", dataset_corrupted),
    ("dataset_danish", dataset_danish),
    ("dataset_icelandic", dataset_icelandic),
]:
    dataset_name, dataset_split = dataset
    for tokenizer in [
        ("char_tokenizer", char_tokenizer),
        ("byte_tokenizer", byte_tokenizer),
        ("raw_byte_tokenizer", raw_byte_tokenizer),
    ]:
        tokenizer_name, tokenizer_obj = tokenizer
        tokenized_dataset = tokenize_dataset(dataset_split, tokenizer_obj)

        # Read the column once; each ['input_ids'] access re-reads the whole
        # column, and the original expression did so three times.
        input_ids = tokenized_dataset["input_ids"]
        unk_id = tokenizer_obj.unk_token_id
        num_examples = len(tokenized_dataset)
        total_tokens = sum(len(ids) for ids in input_ids)
        unk_tokens = sum(ids.count(unk_id) for ids in input_ids)

        # Guard the divisions so an empty dataset (or one that yields no
        # tokens) reports 0.0 instead of raising ZeroDivisionError.
        avg_length = total_tokens / num_examples if num_examples else 0.0
        oov_ratio = unk_tokens / total_tokens if total_tokens else 0.0

        # print some stats
        print(
            f"Dataset: {dataset_name}, Tokenizer: {tokenizer_name}, Avg length: {avg_length}, OOV token ratio: {oov_ratio}"
        )

Dataset: dataset_original, Tokenizer: char_tokenizer, Avg length: 50.331315789473685, OOV token ratio: 0.0
Dataset: dataset_original, Tokenizer: byte_tokenizer, Avg length: 57.45828947368421, OOV token ratio: 0.0
Dataset: dataset_original, Tokenizer: raw_byte_tokenizer, Avg length: 235.2992105263158, OOV token ratio: 0.0
Dataset: dataset_corrupted_letters, Tokenizer: char_tokenizer, Avg length: 69.97105263157894, OOV token ratio: 0.0
Dataset: dataset_corrupted_letters, Tokenizer: byte_tokenizer, Avg length: 75.14868421052631, OOV token ratio: 0.0
Dataset: dataset_corrupted_letters, Tokenizer: raw_byte_tokenizer, Avg length: 235.2992105263158, OOV token ratio: 0.0
Dataset: dataset_corrupted, Tokenizer: char_tokenizer, Avg length: 72.98210526315789, OOV token ratio: 0.10995124976562387
Dataset: dataset_corrupted, Tokenizer: byte_tokenizer, Avg length: 84.35315789473684, OOV token ratio: 0.0
Dataset: dataset_corrupted, Tokenizer: raw_byte_tokenizer, Avg length: 241.20986842105262, OOV tok