In [1]:
import sys
sys.path.append('..')
from configs import ROOT_DIR
import os

import polars as pl
# Directory holding the benchmark inputs; the 'invoices' and 'groundtruths'
# sub-directories used by later cells are resolved relative to it.
BENCHMARKS_DIR = ROOT_DIR / 'benchmarks'
# Create the directory if it is missing; exist_ok=True makes re-running the cell harmless.
BENCHMARKS_DIR.mkdir(exist_ok=True)
# Last expression of the cell: lists the directory entries as a quick sanity check.
os.listdir(BENCHMARKS_DIR)



In [2]:
from components.data_ingestion import DataIngestion
from components.model import OCR_Model

# Input locations: invoice files to run through the model, and the JSON
# ground truths they are compared against.
pdf_path = BENCHMARKS_DIR / 'invoices'
json_path = BENCHMARKS_DIR / 'groundtruths'
data_ingestion = DataIngestion()
# Ingest the second entry that os.listdir reports for the invoices directory.
# NOTE(review): os.listdir returns entries in arbitrary order, so index 1 is not
# guaranteed to be the same file on every machine, nor to correspond to the
# ground-truth file selected the same way from json_path -- confirm the pairing.
invoice_pdf = data_ingestion.transform(pdf_path/ os.listdir(pdf_path)[1])

In [3]:
from configs import EMPTY_RETURN_PROMPT
# Build the OCR model with the EMPTY_RETURN_PROMPT prompt from configs.
model = OCR_Model(prompt=EMPTY_RETURN_PROMPT)
# Run extraction on the ingested invoice; later cells read the 'headers' and
# 'line_items' entries of this response.
invoice_response = model.extract(invoice_pdf)

In [39]:
import json
# Load the ground-truth JSON: the second entry os.listdir reports for json_path.
# NOTE(review): os.listdir order is arbitrary, so index 1 here is not guaranteed
# to be the ground truth of the invoice picked by index 1 elsewhere -- confirm.
groundtruth_name = os.listdir(json_path)[1]
with open(json_path / groundtruth_name, 'r') as gt_file:
    groundtruth = json.loads(gt_file.read())

In [40]:
# Ground-truth headers without the "filename" entry, so that it is neither
# displayed nor counted as a field when header accuracy is computed.
# A filtered copy is built instead of pop()-ing the key: the loaded
# `groundtruth` dict stays intact and this cell can be re-run without a
# KeyError on the second execution.
gt_headers = {
    field: value
    for field, value in groundtruth['headers'].items()
    if field != "filename"
}
pred_headers = invoice_response['headers']

# Line items from the ground truth and from the model response.
gt_items = groundtruth['line_items']
pred_items = invoice_response['line_items']

In [41]:
from IPython.display import display, HTML
import json2html
# Render every payload as an HTML table beneath its caption, ground truth
# first and prediction second, headers before line items.
captioned_payloads = [
    ("Groundtruth Headers", gt_headers),
    ("Predicted Headers", pred_headers),
    ("Groundtruth Line Items", gt_items),
    ("Predicted Line Items", pred_items),
]
for caption, payload in captioned_payloads:
    print(caption)
    table_html = json2html.json2html.convert(json=payload)
    display(HTML(table_html))

Groundtruth Headers


0,1
invNo,90633842
invDate,2024-03-02
orderNo,996991
amountNet,375
amountVat,0.0
amountTotal,375.0
suppName,PORK FARMS QUEENS DRIVE


Predicted Headers


0,1
invNo,090633842
invDate,2024-02-03
orderNo,996991/0
amountNet,375.0
amountVat,0.0
amountTotal,375.0
suppName,Pork Farms Ltd


Groundtruth Line Items


ItemPosition,ProductCode,Description,Quantity,UnitPrice,ItemVatRate,TotalAmount
1,268843,P/FARMS CRISPY BACON 50G 8,30.0,12.5,0.0,375.0


Predicted Line Items


ItemPosition,ProductCode,Description,Quantity,UnitPrice,ItemVatRate,TotalAmount
1,6030847,PF 50G MAPLE BACON STRIPS X8,30.0,12.5,0.0,375.0


In [42]:
from fuzzywuzzy import fuzz

# Header accuracy: every ground-truth field contributes a score in [0, 1] and
# the mean over all ground-truth fields is reported as a percentage.
#   * string ground truth  -> fuzzy similarity (fuzz.ratio / 100)
#   * other ground truth   -> 1 if both values are equal as floats, else 0;
#                             if either side cannot be cast to float, fall back
#                             to fuzzy similarity of the string forms
#   * field absent from the prediction -> contributes 0
correct = 0
total = len(gt_headers)

for key, gt_value in gt_headers.items():
    if key not in pred_headers:
        continue  # missing predicted field earns no credit
    pred_value = pred_headers[key]

    if isinstance(gt_value, str):
        correct += fuzz.ratio(gt_value, str(pred_value)) / 100
        continue

    try:
        correct += 1 if float(gt_value) == float(pred_value) else 0
    except (TypeError, ValueError):
        # ValueError: non-numeric text; TypeError: None or another type that
        # float() rejects (e.g. a null field in the model response).
        correct += fuzz.ratio(str(gt_value), str(pred_value)) / 100

# Guard against an empty ground truth, which would otherwise divide by zero.
header_accuracy = (correct / total) * 100 if total else 0.0
print(f"Header Accuracy: {header_accuracy:.2f}%")

Header Accuracy: 79.57%


In [43]:
# Line-item accuracy: items are paired by position; each pair is scored as the
# mean of five components (description similarity, quantity, unit price,
# VAT rate, total amount) and the result is averaged over the number of
# ground-truth items, then reported as a percentage.
# Ground-truth items without a predicted counterpart contribute 0.
# NOTE(review): predicted items beyond len(gt_items) are dropped by zip() and
# therefore not penalised -- confirm this is intended.

# Largest absolute ItemVatRate difference still counted as a match.
VAT_RATE_TOLERANCE = 3

correct_items = 0
total_items = len(gt_items)

for gt, pred in zip(gt_items, pred_items):
    # A missing or null predicted field scores 0 for that component instead of
    # aborting the whole evaluation with KeyError / TypeError.
    pred_desc = pred.get("Description")
    if pred_desc is None:
        desc_score = 0
    else:
        desc_score = fuzz.ratio(gt["Description"], str(pred_desc)) / 100

    qty_match = 1 if gt["Quantity"] == pred.get("Quantity") else 0
    price_match = 1 if gt["UnitPrice"] == pred.get("UnitPrice") else 0

    try:
        vat_gap = abs(float(gt["ItemVatRate"]) - float(pred.get("ItemVatRate")))
        vat_match = 1 if vat_gap <= VAT_RATE_TOLERANCE else 0
    except (TypeError, ValueError):
        vat_match = 0  # VAT rate absent or not numeric

    total_match = 1 if gt["TotalAmount"] == pred.get("TotalAmount") else 0

    item_score = (desc_score + qty_match + price_match + vat_match + total_match) / 5
    correct_items += item_score

# Guard against an empty ground truth, which would otherwise divide by zero.
line_item_accuracy = (correct_items / total_items) * 100 if total_items else 0.0
print(f"Line Item Accuracy: {line_item_accuracy:.2f}%")

Line Item Accuracy: 88.80%
