In [None]:
import os
import torch
from tqdm import tqdm
from datasets import load_from_disk
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerFast, DataCollatorWithPadding
from transformer import TransformerEncoderModel, TransformerClassifierModel

# Evaluate models

In [9]:
# Shared evaluation settings used by the helper functions defined below.
n_classes, batch_size = 4, 64  # classifier output size / examples per batch
ag_news_folder = "./data/ag_news"  # directory holding the saved test splits

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def evaluate_model(tokenizer, classifier_model, test_sets=None):
    """Print the accuracy of ``classifier_model`` on each saved test split.

    Each split is loaded with ``load_from_disk`` from a sub-directory of the
    module-level ``ag_news_folder``. Its ``"text"`` column is tokenized
    (truncated to at most 512 tokens) and batched with the module-level
    ``batch_size``. The arg-max of the model output along dimension 1 is
    compared with the ``labels`` tensor of every collated batch. One
    ``Test Accuracy`` line is printed per split.

    Args:
        tokenizer: Tokenizer called on lists of texts; also handed to
            ``DataCollatorWithPadding``, which pads every batch.
        classifier_model: Callable taking a batch of ``input_ids`` on the
            module-level ``device`` and returning per-class scores with the
            classes on dimension 1. NOTE(review): assumed to already live on
            ``device`` and to be in eval mode - this function does not
            change either.
        test_sets: Optional iterable of split directory names. When omitted,
            the five AG News test splits used originally are evaluated.

    Returns:
        None. Results are reported through ``print``.
    """
    if test_sets is None:
        test_sets = (
            "ag_news_original_test",
            "ag_news_translated_da_test",
            "ag_news_translated_is_test",
            "ag_news_corrupted_test",
            "ag_news_corrupted_letters_test",
        )

    def tokenize(batch):
        # No padding / tensor conversion at map time: the collator below pads
        # each DataLoader batch to its own longest sequence, so padding here
        # would only store redundant pad tokens in the mapped dataset.
        return tokenizer(batch["text"], truncation=True, max_length=512)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

    for test_set in test_sets:
        dataset = load_from_disk(os.path.join(ag_news_folder, test_set))

        test_tokenized = dataset.map(tokenize, batched=True, batch_size=batch_size, remove_columns=["text"])
        test_loader = DataLoader(test_tokenized, batch_size=batch_size, collate_fn=data_collator)

        correct = 0
        total = 0
        with torch.no_grad():
            for batch in tqdm(test_loader, desc="Testing"):
                inputs = batch['input_ids'].to(device)
                labels = batch['labels'].to(device)

                # Index of the highest score per example is the predicted class.
                predicted = classifier_model(inputs).argmax(dim=1)

                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        if total == 0:
            # An empty split would otherwise raise ZeroDivisionError below.
            print(f"Test Accuracy: n/a - no examples in [{test_set}]")
            continue

        accuracy = correct / total
        print(f"Test Accuracy: {accuracy*100:.2f}% - evaluated on [{test_set}]")


def load_tokenizer(path):
    """Wrap a serialized tokenizer file in a ``PreTrainedTokenizerFast``.

    Args:
        path: Location of the tokenizer JSON file, forwarded as
            ``tokenizer_file``.

    Returns:
        A ``PreTrainedTokenizerFast`` whose unknown, padding and mask tokens
        are registered as ``<unk>``, ``<pad>`` and ``<mask>``.
    """
    special_tokens = {
        "unk_token": "<unk>",
        "pad_token": "<pad>",
        "mask_token": "<mask>",
    }
    return PreTrainedTokenizerFast(tokenizer_file=path, **special_tokens)

def load_model(path, vocab_size, d_model, tokenizer, nhead=8, num_layers=4):
    """Rebuild the classifier architecture and load saved weights into it.

    The encoder is recreated with the given hyper-parameters, wrapped in a
    ``TransformerClassifierModel`` with the module-level ``n_classes``
    outputs, and populated from the state dict stored at ``path``.

    Args:
        path: Checkpoint file read with ``torch.load``; its content is passed
            to ``load_state_dict``, so it must match the rebuilt architecture.
        vocab_size: Forwarded to the encoder as ``num_embeddings``.
        d_model: Forwarded to the encoder as ``d_model``; the feed-forward
            width is set to ``4 * d_model``.
        tokenizer: Supplies ``pad_token_id`` for the encoder's ``padding_idx``.
        nhead: Forwarded to the encoder as ``nhead``. Defaults to 8, the
            value that was previously hard-coded.
        num_layers: Forwarded to the encoder as ``num_layers``. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        The classifier, moved to the module-level ``device`` and switched to
        eval mode.
    """
    # Recreate model architecture; it must match the one used when the
    # checkpoint was saved or load_state_dict will reject the weights.
    encoder = TransformerEncoderModel(
        num_embeddings=vocab_size,
        d_model=d_model,
        padding_idx=tokenizer.pad_token_id,
        nhead=nhead,
        dim_feedforward=4 * d_model,
        num_layers=num_layers,
    )
    classifier_model = TransformerClassifierModel(encoder=encoder, n_classes=n_classes)

    # Load the saved weights.
    # NOTE(review): torch.load unpickles the file - only load checkpoints from
    # a trusted source (or pass weights_only=True where the installed torch
    # version supports it).
    state_dict = torch.load(path, map_location=device)
    classifier_model.load_state_dict(state_dict)
    classifier_model.to(device)
    classifier_model.eval()
    return classifier_model

## CharBPE

In [10]:
# CharBPE run: pair the saved tokenizer with its checkpoint, then report the
# accuracy on every test split.
model_path = "./models/char_model.pth"
tokenizer_path = "./tokenizers/char_tokenizer.json"

tokenizer = load_tokenizer(path=tokenizer_path)
classifier_model = load_model(
    path=model_path,
    vocab_size=10_000,
    d_model=128,
    tokenizer=tokenizer,
)

evaluate_model(tokenizer=tokenizer, classifier_model=classifier_model)

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

  output = torch._nested_tensor_from_mask(
Testing: 100%|██████████| 119/119 [00:17<00:00,  6.86it/s]

Test Accuracy: 88.22% - evaluated on [ag_news_original_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [00:36<00:00,  3.29it/s]

Test Accuracy: 46.95% - evaluated on [ag_news_translated_da_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [00:48<00:00,  2.46it/s]

Test Accuracy: 37.88% - evaluated on [ag_news_translated_is_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [00:25<00:00,  4.58it/s]

Test Accuracy: 79.84% - evaluated on [ag_news_corrupted_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [00:24<00:00,  4.88it/s]

Test Accuracy: 81.43% - evaluated on [ag_news_corrupted_letters_test]





## ByteBPE

In [11]:
# ByteBPE run: pair the saved tokenizer with its checkpoint, then report the
# accuracy on every test split.
model_path = "./models/byte_model.pth"
tokenizer_path = "./tokenizers/byte_tokenizer.json"

tokenizer = load_tokenizer(path=tokenizer_path)
classifier_model = load_model(
    path=model_path,
    vocab_size=10_000,
    d_model=128,
    tokenizer=tokenizer,
)

evaluate_model(tokenizer=tokenizer, classifier_model=classifier_model)

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [00:21<00:00,  5.60it/s]

Test Accuracy: 86.76% - evaluated on [ag_news_original_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [00:40<00:00,  2.92it/s]

Test Accuracy: 43.22% - evaluated on [ag_news_translated_da_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [01:06<00:00,  1.78it/s]

Test Accuracy: 43.20% - evaluated on [ag_news_translated_is_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [00:34<00:00,  3.43it/s]

Test Accuracy: 76.41% - evaluated on [ag_news_corrupted_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [00:28<00:00,  4.18it/s]

Test Accuracy: 77.14% - evaluated on [ag_news_corrupted_letters_test]





## RawSmall

In [12]:
# RawSmall run: raw-byte tokenizer (259-entry vocabulary) with the
# d_model=128 checkpoint, scored on every test split.
model_path = "./models/raw_model.pth"
tokenizer_path = "./tokenizers/raw_byte_tokenizer.json"

tokenizer = load_tokenizer(path=tokenizer_path)
classifier_model = load_model(
    path=model_path,
    vocab_size=259,
    d_model=128,
    tokenizer=tokenizer,
)

evaluate_model(tokenizer=tokenizer, classifier_model=classifier_model)

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [02:25<00:00,  1.23s/it]

Test Accuracy: 90.72% - evaluated on [ag_news_original_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [02:28<00:00,  1.25s/it]

Test Accuracy: 81.84% - evaluated on [ag_news_translated_da_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [02:18<00:00,  1.16s/it]

Test Accuracy: 61.71% - evaluated on [ag_news_translated_is_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [02:20<00:00,  1.18s/it]

Test Accuracy: 89.14% - evaluated on [ag_news_corrupted_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [02:33<00:00,  1.29s/it]

Test Accuracy: 89.33% - evaluated on [ag_news_corrupted_letters_test]





## RawLarge

In [13]:
# RawLarge run: same raw-byte tokenizer as RawSmall, but with the
# d_model=256 checkpoint, scored on every test split.
model_path = "./models/raw_256_model.pth"
tokenizer_path = "./tokenizers/raw_byte_tokenizer.json"

tokenizer = load_tokenizer(path=tokenizer_path)
classifier_model = load_model(
    path=model_path,
    vocab_size=259,
    d_model=256,
    tokenizer=tokenizer,
)

evaluate_model(tokenizer=tokenizer, classifier_model=classifier_model)

Testing: 100%|██████████| 119/119 [03:39<00:00,  1.84s/it]

Test Accuracy: 91.32% - evaluated on [ag_news_original_test]





Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Testing: 100%|██████████| 119/119 [03:39<00:00,  1.85s/it]


Test Accuracy: 80.24% - evaluated on [ag_news_translated_da_test]


Testing: 100%|██████████| 119/119 [03:49<00:00,  1.93s/it]


Test Accuracy: 61.57% - evaluated on [ag_news_translated_is_test]


Testing: 100%|██████████| 119/119 [03:51<00:00,  1.95s/it]


Test Accuracy: 89.14% - evaluated on [ag_news_corrupted_test]


Testing: 100%|██████████| 119/119 [03:38<00:00,  1.84s/it]

Test Accuracy: 89.13% - evaluated on [ag_news_corrupted_letters_test]



