In [None]:
import evaluate 
import tqdm
import torch
import numpy as np
from utils import MAX_TARGET_LENGTH

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def test_loop(dataloader, model, tokenizer):
    """Generate predictions for every batch and report BLEU over the whole set.

    Args:
        dataloader: Sized iterable of batches. Each batch must support
            ``.to(device)`` and expose ``"input_ids"``, ``"attention_mask"``
            and ``"labels"`` entries.
        model: Model providing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer providing ``batch_decode()`` and
            ``pad_token_id``.

    Returns:
        dict: ``"bleu-1"`` .. ``"bleu-N"`` n-gram precisions as returned by
        the BLEU metric, plus ``"avg"``, the mean of those precisions.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    bleu = evaluate.load("bleu")

    # Collect predictions/references across ALL batches so the score covers
    # the full dataset, not just the final batch.
    all_preds = []
    all_refs = []

    with torch.no_grad():
        # ``tqdm`` is imported as a module, so the progress-bar class is
        # ``tqdm.tqdm``; wrapping the iterable advances the bar per batch.
        for batch_data in tqdm.tqdm(dataloader):
            batch_data = batch_data.to(device)
            outputs = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_new_tokens=MAX_TARGET_LENGTH,
                num_beams=4,
            )
            decoded_outputs = tokenizer.batch_decode(
                outputs,
                skip_special_tokens=True,
            )

            # -100 is not a decodable token id (presumably the loss-masking
            # placeholder), so swap it for the pad id before decoding.
            label_ids = batch_data['labels']
            label_ids = torch.where(
                label_ids != -100, label_ids, tokenizer.pad_token_id
            )
            decoded_labels = tokenizer.batch_decode(
                label_ids,
                skip_special_tokens=True,
            )

            # ' '.join(str) puts a space between every character, so the
            # metric sees one token per character.
            # NOTE(review): presumably intended for unsegmented text - confirm.
            all_preds.extend(' '.join(pred.strip()) for pred in decoded_outputs)
            all_refs.extend(' '.join(label.strip()) for label in decoded_labels)

    if not all_preds:
        raise ValueError("test_loop received an empty dataloader; nothing to evaluate")

    bleu_result = bleu.compute(predictions=all_preds, references=all_refs)
    result = {
        f"bleu-{i}": value
        for i, value in enumerate(bleu_result["precisions"], start=1)
    }
    # np.mean cannot consume a dict view directly; materialise it first.
    result['avg'] = np.mean(list(result.values()))
    # Single quotes inside the f-string keep this valid before Python 3.12.
    print(
        f"Test result: BLEU1={result['bleu-1']}, BLEU2={result['bleu-2']}, "
        f"BLEU3={result['bleu-3']}, BLEU4={result['bleu-4']}"
    )
    return result