In [4]:
from config.settings import SRC_DIR, DATA_DIR
import os
from pathlib import Path
from pdf2image import convert_from_path

import torch

# Change working directory
os.chdir(SRC_DIR)

# Define file path
file_dir = DATA_DIR / 'invoices' / 'valid_invoices'

# Keep only real PDF files and sort them: os.listdir() returns entries in
# arbitrary order and also lists sub-directories, so taking element 0 of the
# raw listing did not guarantee an invoice (or the same invoice on every run).
files = sorted(
    name
    for name in os.listdir(file_dir)
    if name.lower().endswith('.pdf') and (file_dir / name).is_file()
)

# Fail with an explicit message instead of an IndexError on an empty folder.
if not files:
    raise FileNotFoundError(f"No PDF invoices found in {file_dir}")

file_path = file_dir / files[0]

In [5]:
# NOTE(review): these absolute paths replace the file_path selected in the
# previous cell and tie the notebook to one machine. They look like they live
# under DATA_DIR / 'invoices' / 'valid_invoices' -- confirm and derive them
# from DATA_DIR instead.
file_path = "C:/Users/ai.users/imscanCS/data/invoices/valid_invoices/PerfectMatch.pdf"
output_path = "C:/Users/ai.users/imscanCS/data/invoices/valid_invoices/output"
import os
import fitz  # pymupdf

# Create the target folder first; pix.save() does not create missing directories.
os.makedirs(output_path, exist_ok=True)

# Render every page of the PDF to <output_path>/page_<index>.png.
# The context manager closes the document even if rendering a page fails.
# NOTE(review): images left in output_path by an earlier run are not removed
# here, and get_pixmap() is called with its default resolution -- consider
# passing a higher dpi if small print is misread downstream.
with fitz.open(str(file_path)) as doc:
    for i, page in enumerate(doc):
        pix = page.get_pixmap()
        pix.save(f"{output_path}/page_{i}.png")

In [6]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers import BitsAndBytesConfig

# Hugging Face checkpoint shared by the model and its processor.
model_name = "Qwen/Qwen2.5-VL-7B-Instruct"

# Processor built from the same checkpoint name as the model.
processor = AutoProcessor.from_pretrained(model_name)

# bitsandbytes settings: request 8-bit weight loading.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Quantized model; device_map="auto" leaves layer placement to the library.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
# Instruction text sent alongside each invoice image. It shows the JSON
# structure the reply should follow and the output rules for the model.
_INVOICE_EXTRACTION_PROMPT = """
    You are an **invoice extractor**. Your task is to extract structured invoice details from the provided image and return **only the JSON output** in the following format:

    {
        "line_items": [
            {
                "product_code": "string",
                "description": "string",
                "quantity": "string",
                "price_per_unit": "string",
                "vat_percent": "string",
                "total_price": "string"
            }
        ],
        "total_amount": {
            "total_items": number,
            "total_tax": "string",
            "total_price": "string"
        },
        "due_date": "YYYY-MM-DD",
        "payment_date": "YYYY-MM-DD",
        "invoice_date": "YYYY-MM-DD",
        "invoice_number": "string",
        "purchase_order": "string",
        "reference_numbers": ["string"],
        "locale": "string",
        "country": "string",
        "currency": "string",
        "payment_details": {
            "iban": "string",
            "swift": "string",
            "bic": "string",
            "account_number": "string"
        },
        "vat_number": "string",
        "supplier_name": "string",
        "taxes_details": [
            {
                "rate": "string",
                "amount": "string"
            }
        ],
        "total_amount_including_taxes": "string",
        "total_net_amount_excluding_taxes": "string",
        "customer_address": "string",
        "shipping_address": "string",
        "billing_address": "string",
        "customer_company_registrations": {
            "vat_number": "string"
        },
        "customer_name": "string",
        "supplier_address": "string"
    }

    **Strict Output Rules:**
    - **Only return the data in JSON format.**
    - **Do not include any explanation, additional text, or non-JSON output.**
    - Precisely capture dates such as invoice date, due date, and payment date. You may find this in many ways like `payment terms`, `payment conditions` etc saying like `within X days`, `X days from date of INVOICE` or similar. If a due date is not directly provided but 'due in X days' is indicated, calculate it by adding X days to the invoice date.**
    - **If any value is missing, set it to `null`. Do not infer missing details or hallucinate.**
    - **Strictly follow the data types and format specified.**
    - **The output must be a valid JSON object without any additional markdown or text.**

    ### Invoice image:

    """


def get_prompt():
    """Return the invoice-extraction instruction text.

    Returns:
        str: the fixed prompt (JSON template plus output rules); the same
        string is returned on every call.
    """
    return _INVOICE_EXTRACTION_PROMPT

In [8]:
import os
import re
from PIL import Image
import torch

# Define output path where images are saved
# NOTE(review): absolute, machine-specific path; it must match the folder the
# rendering cell writes to.
outputpath = "C:/Users/ai.users/imscanCS/data/invoices/valid_invoices/output/"


def _page_sort_key(file_name):
    """Sort key that orders page images by the first number in their name.

    Plain string ordering puts 'page_10.png' before 'page_2.png', and
    os.listdir() gives no ordering guarantee at all. Names without a number
    sort first; ties fall back to the name itself.
    """
    number = re.search(r"\d+", file_name)
    return (int(number.group()) if number else -1, file_name)


# List all images in the outputpath directory, in page order, so that the
# "Page N" label printed below matches the page that was actually processed.
image_files = sorted(
    (f for f in os.listdir(outputpath) if f.endswith(('.jpg', '.png'))),
    key=_page_sort_key,
)

# Decoded model response for every page, in processing order. `output_text`
# is overwritten on each iteration, so without this list only the last page's
# response survives the loop.
page_outputs = []

# Process each image file
for i, image_file in enumerate(image_files):
    # Load the image fully into memory and release the file handle right away.
    image_path = os.path.join(outputpath, image_file)
    with Image.open(image_path) as opened_image:
        image = opened_image.copy()

    # Prepare the messages: one user turn holding the page image and the prompt
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": get_prompt()},
            ],
        }
    ]

    # Prepare inputs
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(model.device)

    # Run inference
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=1024)
        # Drop the prompt tokens so only the newly generated tokens are decoded.
        generated_ids_trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)

    page_outputs.append(output_text[0])
    print(f"Page {i + 1} Output: {output_text}")

# NOTE(review): `output_text` still holds only the last processed page, which
# is what the following cell parses via output_text[0]; use `page_outputs` to
# handle multi-page invoices.

Page 1 Output: ['```json\n{\n    "line_items": [\n        {\n            "product_code": "K37F/40mm Punnets + *100% PAPER* Black Soak Pad x 950 pcs w2",\n            "description": "K37F/40mm Punnets + *100% PAPER* Black Soak Pad x 950 pcs w2",\n            "quantity": "286.00",\n            "price_per_unit": "45.39",\n            "vat_percent": "20.00",\n            "total_price": "13,072.32"\n        }\n    ],\n    "total_amount": {\n        "total_items": 1,\n        "total_tax": "2,614.46",\n        "total_price": "15,686.78"\n    },\n    "due_date": "2023-09-29",\n    "payment_date": null,\n    "invoice_date": "2023-08-30",\n    "invoice_number": "1025991",\n    "purchase_order": "PO# 12183",\n    "reference_numbers": ["D0T 82303766", "D0T 82305223"],\n    "locale": "en",\n    "country": "GB",\n    "currency": "GBP",\n    "payment_details": {\n        "iban": "GB22 BARC 2022 7550 7848 34",\n        "swift": "BARCGB22",\n        "bic": "BARCGB22",\n        "account_number": "507848

In [9]:
import re, json
def extract(response_text):
    '''Extracts JSON data from model response.

    Decodes the JSON value that starts at the first "{" in the text. Anything
    before it (for example a ```json fence) and anything after the end of that
    value (closing fence, trailing commentary, stray braces) is ignored. The
    previous greedy regex spanned from the first "{" to the LAST "}" in the
    text, so any brace in trailing text made parsing fail.

    Args:
        response_text: decoded model output as a string.

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: if the text contains no "{" at all.
        json.JSONDecodeError: (a ValueError subclass) if the text starting at
            the first "{" is not valid JSON, e.g. a truncated response.
    '''
    start = response_text.find("{")
    if start == -1:
        raise ValueError("No valid JSON found in response")
    # raw_decode parses exactly one JSON value beginning at `start` and
    # tolerates arbitrary text after it.
    extracted_data, _ = json.JSONDecoder().raw_decode(response_text, start)
    return extracted_data

from IPython.display import display, HTML
from json2html import json2html

# Parse the first decoded string in output_text into a Python object.
result = extract(output_text[0])

# Convert the parsed data to HTML markup and show it in the cell output.
html_response = json2html.convert(json=result)
display(HTML(html_response))

product_code,description,quantity,price_per_unit,vat_percent,total_price
rate,amount,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
K37F/40mm Punnets + *100% PAPER* Black Soak Pad x 950 pcs w2,K37F/40mm Punnets + *100% PAPER* Black Soak Pad x 950 pcs w2,286.0,45.39,20.0,13072.32
20.00%,2614.46,,,,
line_items,"product_codedescriptionquantityprice_per_unitvat_percenttotal_priceK37F/40mm Punnets + *100% PAPER* Black Soak Pad x 950 pcs w2K37F/40mm Punnets + *100% PAPER* Black Soak Pad x 950 pcs w2286.0045.3920.0013,072.32",,,,
total_amount,"total_items1total_tax2,614.46total_price15,686.78",,,,
due_date,2023-09-29,,,,
payment_date,,,,,
invoice_date,2023-08-30,,,,
invoice_number,1025991,,,,
purchase_order,PO# 12183,,,,
reference_numbers,D0T 82303766D0T 82305223,,,,

product_code,description,quantity,price_per_unit,vat_percent,total_price
K37F/40mm Punnets + *100% PAPER* Black Soak Pad x 950 pcs w2,K37F/40mm Punnets + *100% PAPER* Black Soak Pad x 950 pcs w2,286.0,45.39,20.0,13072.32

0,1
total_items,1.0
total_tax,2614.46
total_price,15686.78

0,1
iban,GB22 BARC 2022 7550 7848 34
swift,BARCGB22
bic,BARCGB22
account_number,50784834

rate,amount
20.00%,2614.46

0,1
vat_number,700088278


In [18]:
from config.settings import ROOT_DIR
import os
import json
from pathlib import Path

# One output folder per model, named after the last segment of the model id.
output_folder = ROOT_DIR / 'invoice_outputs'/ model_name.split('/')[-1]
# parents=True: also create 'invoice_outputs' itself when it does not exist yet.
output_folder.mkdir(parents=True, exist_ok=True)

# Base name of the invoice without its extension. Path.stem works whether
# file_path is a str or a Path, with either slash style, and for any
# extension length (the previous split('/')[-1][:-4] assumed forward slashes
# and a three-letter extension).
file_name = Path(file_path).stem

# Define full file path
json_file_path = output_folder / f"{file_name}.json"

# Save JSON file
with open(json_file_path, "w", encoding="utf-8") as json_file:
    json.dump(result, json_file, indent=4)

print(f"JSON saved at: {json_file_path}")

JSON saved at: C:\Users\ai.users\imscanCS\invoice_outputs\Qwen2.5-VL-7B-Instruct\PerfectMatch.json
