######## TESTING ###########

In [1]:
import gc
import re
import json
import torch
import numpy as np
from tqdm.auto import tqdm
from donut import JSONParseEvaluator
from datasets import load_dataset
from transformers import DonutProcessor, VisionEncoderDecoderModel


device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from config import model_name_10,model_name_30,model_name_base
# model_name_10 : model trained for 10 epochs
# model_name_30 : model trained for 30 epochs
# model_name_base : Base model

In [3]:
processor = DonutProcessor.from_pretrained(model_name_10)
model = VisionEncoderDecoderModel.from_pretrained(model_name_10)

model.eval()
model.to(device)

f1 = []
accs = []
recalls = []
precisions = []

evaluator = JSONParseEvaluator()
dataset = load_dataset("Rajan/AIMT-invoices-donut-data", split="test")
for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
    # Prepare encoder inputs
    pixel_values = processor(
        sample["image"].convert("RGB"), return_tensors="pt"
    ).pixel_values
    pixel_values = pixel_values.to(device)
    # Prepare decoder inputs
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids
    decoder_input_ids = decoder_input_ids.to(device)

    # Autoregressively generate sequence
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Turn into JSON
    seq = processor.batch_decode(outputs.sequences)[0]
    seq = seq.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    seq = re.sub(
        r"<.*?>", "", seq, count=1
    ).strip()  # remove first task start token
    seq = processor.token2json(seq)

    ground_truth = json.loads(sample["ground_truth"])["gt_parse"]

    score = evaluator.cal_acc(seq, ground_truth)
    f1_value = evaluator.cal_f1(seq, ground_truth)

    total_tp, total_fp, total_fn = 0, 0, 0
    for pred, answer in zip(seq, ground_truth):
        pred, answer = evaluator.flatten(
            evaluator.normalize_dict(pred)
        ), evaluator.flatten(evaluator.normalize_dict(answer))
        answer_set = set(answer)
        pred_set = set(pred)

        tp = len(pred_set & answer_set)
        fp = len(pred_set - answer_set)
        fn = len(answer_set - pred_set)

        total_tp += tp
        total_fp += fp
        total_fn += fn

    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    accs.append(score)
    f1.append(f1_value)
    recalls.append(recall)
    precisions.append(precision)

scores_10_epoch = {
    "mean_f1": np.mean(f1),
    "mean_accuracy": np.mean(accs),
    "recall": np.mean(recalls),
    "precisions": np.mean(precisions),
}
# scores_10_epoch.update({n: 100 * scores_10_epoch[n] for n in scores_10_epoch.keys()})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 26/26 [01:53<00:00,  4.36s/it]


In [4]:
# clear cache for other model
torch.cuda.empty_cache()
gc.collect()


109

In [5]:
processor = DonutProcessor.from_pretrained(model_name_base)
model = VisionEncoderDecoderModel.from_pretrained(model_name_base)

model.eval()
model.to(device)

f1 = []
accs = []
recalls = []
precisions = []

evaluator = JSONParseEvaluator()

for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
    # Prepare encoder inputs
    pixel_values = processor(
        sample["image"].convert("RGB"), return_tensors="pt"
    ).pixel_values
    pixel_values = pixel_values.to(device)
    # Prepare decoder inputs
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids
    decoder_input_ids = decoder_input_ids.to(device)

    # Autoregressively generate sequence
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Turn into JSON
    seq = processor.batch_decode(outputs.sequences)[0]
    seq = seq.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    seq = re.sub(
        r"<.*?>", "", seq, count=1
    ).strip()  # remove first task start token
    seq = processor.token2json(seq)

    ground_truth = json.loads(sample["ground_truth"])["gt_parse"]

    score = evaluator.cal_acc(seq, ground_truth)
    f1_value = evaluator.cal_f1(seq, ground_truth)

    total_tp, total_fp, total_fn = 0, 0, 0
    for pred, answer in zip(seq, ground_truth):
        pred, answer = evaluator.flatten(
            evaluator.normalize_dict(pred)
        ), evaluator.flatten(evaluator.normalize_dict(answer))
        answer_set = set(answer)
        pred_set = set(pred)

        tp = len(pred_set & answer_set)
        fp = len(pred_set - answer_set)
        fn = len(answer_set - pred_set)

        total_tp += tp
        total_fp += fp
        total_fn += fn

    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    accs.append(score)
    f1.append(f1_value)
    recalls.append(recall)
    precisions.append(precision)

scores_base = {
    "mean_f1": np.mean(f1),
    "mean_accuracy": np.mean(accs),
    "recall": np.mean(recalls),
    "precisions": np.mean(precisions),
}
# scores_base.update({n: 100 * scores_base[n] for n in scores_base.keys()})

100%|██████████| 26/26 [01:53<00:00,  4.38s/it]


In [6]:
# clear cache for other model
torch.cuda.empty_cache()
gc.collect()


81

In [7]:
processor = DonutProcessor.from_pretrained(model_name_30)
model = VisionEncoderDecoderModel.from_pretrained(model_name_30)

model.eval()
model.to(device)

f1 = []
accs = []
recalls = []
precisions = []

evaluator = JSONParseEvaluator()
dataset = load_dataset("Rajan/AIMT-invoices-donut-data", split="test")
dataset = dataset.select(range(17))
for idx, sample in tqdm(enumerate(dataset), total=len(dataset)):
    # Prepare encoder inputs
    pixel_values = processor(
        sample["image"].convert("RGB"), return_tensors="pt"
    ).pixel_values
    pixel_values = pixel_values.to(device)
    # Prepare decoder inputs
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids
    decoder_input_ids = decoder_input_ids.to(device)

    # Autoregressively generate sequence
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Turn into JSON
    seq = processor.batch_decode(outputs.sequences)[0]
    seq = seq.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    seq = re.sub(
        r"<.*?>", "", seq, count=1
    ).strip()  # remove first task start token
    seq = processor.token2json(seq)

    ground_truth = json.loads(sample["ground_truth"])["gt_parse"]

    score = evaluator.cal_acc(seq, ground_truth)
    f1_value = evaluator.cal_f1(seq, ground_truth)

    total_tp, total_fp, total_fn = 0, 0, 0
    for pred, answer in zip(seq, ground_truth):
        pred, answer = evaluator.flatten(
            evaluator.normalize_dict(pred)
        ), evaluator.flatten(evaluator.normalize_dict(answer))
        answer_set = set(answer)
        pred_set = set(pred)

        tp = len(pred_set & answer_set)
        fp = len(pred_set - answer_set)
        fn = len(answer_set - pred_set)

        total_tp += tp
        total_fp += fp
        total_fn += fn

    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0

    accs.append(score)
    f1.append(f1_value)
    recalls.append(recall)
    precisions.append(precision)

scores_30epochs = {
    "mean_f1": np.mean(f1),
    "mean_accuracy": np.mean(accs),
    "recall": np.mean(recalls),
    "precisions": np.mean(precisions),
}
# scores_30epochs.update({n: 100 * scores_base[n] for n in scores_base.keys()})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 17/17 [01:16<00:00,  4.53s/it]


In [8]:
import pandas as pd
comparison_data = {
    "Metric": ["mean_f1", "mean_accuracy", "mean_recall", "mean_precision"],
    "score_10_epochs": [scores_10_epoch["mean_f1"], scores_10_epoch["mean_accuracy"], scores_10_epoch["recall"], scores_10_epoch["precisions"]],
    "score_30_epochs": [scores_30epochs["mean_f1"], scores_30epochs["mean_accuracy"], scores_30epochs["recall"], scores_30epochs["precisions"]],
    "Score_Base": [scores_base["mean_f1"], scores_base["mean_accuracy"], scores_base["recall"], scores_base["precisions"]],

}

df_comparison = pd.DataFrame(comparison_data)

In [9]:
df_comparison

Unnamed: 0,Metric,score_10_epochs,score_30_epochs,Score_Base
0,mean_f1,0.230769,0.941176,0.961538
1,mean_accuracy,0.327561,0.927053,0.949793
2,mean_recall,0.230769,0.941176,0.961538
3,mean_precision,0.230769,0.941176,0.961538
