In [1]:
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch
from tqdm.auto import tqdm
from evaluate import load  # Hugging Face’s metrics hub

import gpn.model
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, DataCollatorForLanguageModeling
from pathlib import Path
import os

In [23]:
# --- Evaluation target --------------------------------------------------------
# Exactly one `model_name` / `dataset_name` pair is active at a time. The
# commented-out alternatives are kept as a record of earlier runs together with
# the top-1 accuracies that were noted for them.
#
# dataset_name = "sbuedenb/small_beetle_dataset"
# model_name   = "sbuedenb/beetle-gpn"       # v1: Top-1 accuracy: 51.8759% (validation), v2: 53.0181%
# model_name   = "sbuedenb/beetle-gpn-wide"  # Top-1 accuracy: 53.3793% (validation)
#
# model_name   = "sbuedenb/beetle-gpn-wide-reduced"  # Top-1 accuracy: 51.8314%
#
# model_name   = "songlab/gpn-brassicales"
#   (on brassicales)  Top-1 accuracy: 53.8563% (validation), Top-1 accuracy: 53.2370% (test)
#   (on cucujiformia) Top-1 accuracy: 42.8384%
#
# dataset_name = "songlab/genomes-brassicales-balanced-v1"

# NOTE(review): machine-specific absolute path; anyone else re-running this
# notebook has to point it at their own copy of the checkpoint.
model_name   = "/home/sbuedenb/models/long-wide-cosine/"
dataset_name = "sbuedenb/big_beetle_dataset"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# local_files_only=True: load the checkpoint from local files only.
# .eval() switches the module to evaluation mode before it is used below.
model = AutoModelForMaskedLM.from_pretrained(
    model_name,
    local_files_only=True,
).eval()
# Split under evaluation; the results table in this notebook also reports "test".
dataset = load_dataset(dataset_name, split="validation")

# Dilations 1, 3, ..., 243 (powers of three), repeated three times -> 18 entries.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
dilation_schedule = [1, 3, 9, 27, 81, 243] * 3


In [24]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model.to(device);  # trailing ';' keeps the module repr out of the cell output
device             # last expression: display which device was selected

'cuda'

# Top-1 accuracy on sbuedenb/big_beetle_dataset
Model | Accuracy (eval) | Accuracy (test)
-|-|-
songlab/gpn-brassicales| 42.7848% | 42.9517%
sbuedenb/beetle-gpn | 51.4824% | 56.0279%
sbuedenb/beetle-gpn-wide-reduced | **51.8868%** | **56.2513%**
sbuedenb/long-wide-cosine | 52.01% ± 0.63% | 55.70% ± 1.26%

In [25]:
def tokenize_function(batch):
    """Tokenize the ``"seq"`` entry of ``batch`` with the notebook-level ``tokenizer``.

    The sequences are tokenized without padding and without truncation. The
    special-tokens mask is requested; token type ids are not.

    Args:
        batch: Mapping with a ``"seq"`` entry, as handed over by
            ``dataset.map(..., batched=True)``.

    Returns:
        The object returned by ``tokenizer(...)`` for these settings.
    """
    tokenizer_options = dict(
        padding=False,
        truncation=False,
        return_special_tokens_mask=True,
        return_token_type_ids=False,
    )
    return tokenizer(batch["seq"], **tokenizer_options)

# Tokenize the split in batches and drop the listed source columns from the result.
tokenized = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=[
        "seq",
        "assembly",
        "chrom",
        "strand",
    ],
)

In [26]:
# Collator that builds masked-language-model batches: it selects positions to
# predict and produces the matching `labels` (consumed by the evaluation loop).
data_collator = DataCollatorForLanguageModeling(
    seed=42,               # fixed seed passed to the collator
    mlm=True,              # masked-LM objective
    mlm_probability=0.15,  # standard BERT mask ratio
    tokenizer=tokenizer,
)

In [27]:
# Iterate the tokenized split in its stored order; every batch of up to 256
# examples is assembled by the MLM collator.
loader = DataLoader(
    dataset=tokenized,
    collate_fn=data_collator,
    batch_size=256,
    shuffle=False,
)

In [28]:
# Hugging Face `evaluate` accuracy metric, filled batch by batch.
accuracy = load("accuracy")

# Summed (not averaged) cross-entropy; positions labelled -100 are ignored, so
# dividing the running sum by the number of scored positions gives a mean.
loss_fct = torch.nn.CrossEntropyLoss(reduction="sum", ignore_index=-100)

total_loss   = 0.0  # running sum of the loss over all scored positions
total_tokens = 0    # running count of scored (masked) positions

# Pure inference: no gradients are needed anywhere in the loop.
with torch.no_grad():
    for batch in tqdm(loader, desc="evaluating"):
        input_ids = batch["input_ids"].to(device)
        labels    = batch["labels"].to(device)  # -100 wherever the position is not scored

        logits = model(input_ids=input_ids).logits
        preds  = logits.argmax(dim=-1)

        # Only the positions carrying a real label contribute to the metrics.
        mask = labels.ne(-100)
        accuracy.add_batch(predictions=preds[mask], references=labels[mask])

        # Flatten so CrossEntropyLoss sees (N_positions, vocab) against (N_positions,).
        loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))

        total_loss   += loss.item()
        total_tokens += mask.sum().item()  # masked positions in this batch, not all tokens

avg_loss = total_loss / total_tokens  # mean loss per masked position
top1 = accuracy.compute()

print(f"Top-1 accuracy: {top1['accuracy']:.2%} | Avg loss: {avg_loss:.2f}")

evaluating:   0%|          | 0/166 [00:00<?, ?it/s]

Top-1 accuracy: 52.01% | Avg loss: 1.08


In [18]:
# Show the raw metric dict returned by `accuracy.compute()`.
# NOTE(review): the saved output of this cell comes from an earlier execution
# (In [18]) than the evaluation cell (In [28]) and does not match the accuracy
# printed there — re-run top to bottom before quoting either number.
top1

{'accuracy': 0.5569858133325299}

In [19]:
# Number of examples in the loaded split (used as `n` for the confidence interval below).
len(dataset)

10381

In [20]:
import math

# z values for the usual confidence levels.
Z_SCORES = {
    0.90: 1.64,
    0.95: 1.96,
    0.98: 2.33,
    0.99: 2.58,
}
z = Z_SCORES[0.99]

# Observed top-1 accuracy from the evaluation run.
acc = top1['accuracy']

# NOTE(review): `n` counts sequences in the split, whereas `acc` is a proportion
# over masked positions. Confirm the sequence count is the intended sample size.
n = len(dataset)

In [22]:
# Margin around the accuracy: z * sqrt(acc * (1 - acc) / n).
standard_error = math.sqrt((acc * (1 - acc)) / n)
interval = z * standard_error

print(f"99% confidence interval: {interval:.2%}")

99% confidence interval: 1.26%
