In [None]:
import re
import json
import torch
import numpy as np
from tqdm.auto import tqdm
from donut import JSONParseEvaluator
from datasets import load_dataset

# Pick the device every tensor and model in this notebook is moved to:
# the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

from config import model_name, model_name_base
from transformers import DonutProcessor, VisionEncoderDecoderModel


In [None]:

# Load the processor and the vision encoder-decoder model from the
# `model_name` checkpoint (imported from `config`). The metrics computed for
# this checkpoint further down are stored in `scores_our`.
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name)

In [None]:

# Evaluate the checkpoint currently bound to `model` / `processor` on the test
# split. Per-sample metrics are accumulated in the lists below and averaged
# into `scores_our`, expressed as percentages.
model.eval()
model.to(device)


f1 = []
accs = []
recalls = []
precisions = []

dataset = load_dataset("Rajan/AIMT-invoices-donut-data", split="test")

# Loop-invariant setup: the task prompt, its token ids and the evaluator do not
# depend on the sample, so they are built once instead of on every iteration.
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(
    task_prompt, add_special_tokens=False, return_tensors="pt"
).input_ids.to(device)
evaluator = JSONParseEvaluator()


for sample in tqdm(dataset, total=len(dataset)):
    # prepare encoder inputs
    pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # autoregressively generate sequence (num_beams=1, UNK token banned)
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # turn into JSON
    seq = processor.batch_decode(outputs.sequences)[0]
    seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
    prediction = processor.token2json(seq)

    ground_truth = json.loads(sample["ground_truth"])["gt_parse"]

    # cal_acc compares one predicted document against one reference document.
    score = evaluator.cal_acc(prediction, ground_truth)
    # cal_f1 expects *lists* of documents and zips over them. Passing the two
    # dicts directly would pair their top-level key names and score those
    # instead of the field values, so each document is wrapped in a list.
    f1_value = evaluator.cal_f1([prediction], [ground_truth])

    # Field-level precision / recall for this document. Both sides are
    # flattened to their leaf fields; sets are used, so a field repeated inside
    # one document is counted once.
    pred_fields = set(evaluator.flatten(evaluator.normalize_dict(prediction)))
    answer_fields = set(evaluator.flatten(evaluator.normalize_dict(ground_truth)))

    tp = len(pred_fields & answer_fields)
    fp = len(pred_fields - answer_fields)
    fn = len(answer_fields - pred_fields)

    # Guard against empty predictions / references to avoid dividing by zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    accs.append(score)
    f1.append(f1_value)
    recalls.append(recall)
    precisions.append(precision)


# Average the per-sample metrics and report them as percentages.
scores_our = {
    "mean_f1": np.mean(f1),
    "mean_accuracy": np.mean(accs),
    "recall": np.mean(recalls),
    "precisions": np.mean(precisions),
}
scores_our = {name: 100 * value for name, value in scores_our.items()}



In [None]:

# Rebind `processor` and `model` to the `model_name_base` checkpoint (imported
# from `config`); the objects loaded earlier are no longer referenced by these
# names. The metrics computed for this checkpoint are stored in `scores_others`.
processor = DonutProcessor.from_pretrained(model_name_base)
model = VisionEncoderDecoderModel.from_pretrained(model_name_base)

In [None]:

import gc

# Free whatever the previously evaluated model left behind. Garbage is
# collected first so that the memory of now-unreachable tensors becomes
# unoccupied, and only then is the CUDA caching allocator asked to release it;
# in the opposite order that memory would still be held at the time of the call.
gc.collect()
torch.cuda.empty_cache()

# Evaluate the checkpoint currently bound to `model` / `processor` on the same
# test split (`dataset`, loaded in an earlier cell). Per-sample metrics are
# averaged into `scores_others`, expressed as percentages.
model.eval()
model.to(device)


f1 = []
accs = []
recalls = []
precisions = []

# Loop-invariant setup: the task prompt, its token ids and the evaluator do not
# depend on the sample, so they are built once instead of on every iteration.
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(
    task_prompt, add_special_tokens=False, return_tensors="pt"
).input_ids.to(device)
evaluator = JSONParseEvaluator()


for sample in tqdm(dataset, total=len(dataset)):
    # prepare encoder inputs
    pixel_values = processor(sample["image"].convert("RGB"), return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    # autoregressively generate sequence (num_beams=1, UNK token banned)
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # turn into JSON
    seq = processor.batch_decode(outputs.sequences)[0]
    seq = seq.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    seq = re.sub(r"<.*?>", "", seq, count=1).strip()  # remove first task start token
    prediction = processor.token2json(seq)

    ground_truth = json.loads(sample["ground_truth"])["gt_parse"]

    # cal_acc compares one predicted document against one reference document.
    score = evaluator.cal_acc(prediction, ground_truth)
    # cal_f1 expects *lists* of documents and zips over them. Passing the two
    # dicts directly would pair their top-level key names and score those
    # instead of the field values, so each document is wrapped in a list.
    f1_value = evaluator.cal_f1([prediction], [ground_truth])

    # Field-level precision / recall for this document. Both sides are
    # flattened to their leaf fields; sets are used, so a field repeated inside
    # one document is counted once.
    pred_fields = set(evaluator.flatten(evaluator.normalize_dict(prediction)))
    answer_fields = set(evaluator.flatten(evaluator.normalize_dict(ground_truth)))

    tp = len(pred_fields & answer_fields)
    fp = len(pred_fields - answer_fields)
    fn = len(answer_fields - pred_fields)

    # Guard against empty predictions / references to avoid dividing by zero.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    accs.append(score)
    f1.append(f1_value)
    recalls.append(recall)
    precisions.append(precision)


# Average the per-sample metrics and report them as percentages.
scores_others = {
    "mean_f1": np.mean(f1),
    "mean_accuracy": np.mean(accs),
    "recall": np.mean(recalls),
    "precisions": np.mean(precisions),
}
scores_others = {name: 100 * value for name, value in scores_others.items()}


In [None]:
import pandas as pd

# Side-by-side table of the two evaluation runs: one row per metric, one
# column per score dictionary (`scores_others` and `scores_our`).
metric_names = ["mean_f1", "mean_accuracy", "recall", "precisions"]
comparison_data = {
    "Metric": metric_names,
    "Scores_Others": [scores_others[metric] for metric in metric_names],
    "Scores_Our": [scores_our[metric] for metric in metric_names],
}

df_comparison = pd.DataFrame(comparison_data)


In [None]:
# Bare last expression of the cell, so the notebook renders the comparison table.
df_comparison