# Get pdf

In [1]:
from pathlib import Path
# Project root is taken to be the parent of the resolved current working directory
# (assumes the kernel is started from a direct subfolder of the project — TODO confirm).
ROOT_DIR = Path().resolve().parent

# Location of the sample invoice PDF processed in this notebook.
pdf_path = ROOT_DIR.joinpath('data', 'pure_pdfs', "ISS.pdf")

# Extract text

In [2]:
import pdfplumber

def extract_text_from_pdf(pdf_path, page_separator="\n"):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path (str or path-like) passed to ``pdfplumber.open``.
        page_separator: String inserted between the texts of consecutive
            pages so the last word of one page is not fused with the first
            word of the next. A single-page PDF is unaffected.

    Returns:
        The page texts joined by ``page_separator``. Pages without
        extractable text contribute an empty string; a PDF with no pages
        yields ``""``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Some pdfplumber releases return None (not "") for a page with no
        # text layer; normalise to "" so joining never raises TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    return page_separator.join(page_texts)

# Extract the raw text of the sample invoice; `text` is used later to build the model prompt.
text = extract_text_from_pdf(pdf_path)
# Echo the extracted text so it can be compared with the structured output further down.
print("Extracted Text:\n", text)

Extracted Text:
 INTEGRATED SERVICE SOLUTIONS
Invoice To:
Direct Produce Supplies LTD
57-63 Church Street
Wimbledon
SW19 5DQ
Invoice Date: 22/11/2023 Invoice No:22806
Quantity Service Description Rate (£) Net Value (£)
178,082 ISS SP - Picking: 12/11/2023 - 18/11/2023 Teynham 0.090 10,641.96
105,084 ISS SP - Picking: 12/11/2023 - 18/11/2023 Linton 0.090 2,856.15
36,065 ISS SP - Picking: 12/11/2023 - 18/11/2023 Sittingbourne 0.090 1,762.74
Net Total (£): 15,260.85
VAT (£): 3,052.17
Bank Details: HSBC UK Bank Plc Payment Terms: 28 days from invoice date Gross Total (£): 18,313.02
Sort Code: 40-03-21
Account No: 2236 6789
Integrated Service Solutions Limited, C/O Fowler Welch, London Road, Teynham, Sittingbourne, ME9 9PR
VAT Registration No: 184 6959 49 Company Registration No: 08332191
Contact
T 01795 523 310
E accounts@issproduce.co.uk
W www.issproduce.co.uk


# Using a very small model due to Colab limitations

In [3]:
import os

# Must be set BEFORE transformers / huggingface_hub is imported: the hub
# library reads this variable once, at import time.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Load the weights directly in float16. This halves the memory taken by the
# parameters (the parameter count is unchanged) and avoids first materialising
# a full-precision copy that a later `model.half()` would have to convert.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
# `use_cache` is a model/generation option, not a tokenizer one, so it is not
# passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
print("Loaded model and tokenizer")

Loading model and tokenizer...
Loaded model and tokenizer


In [4]:
from jsonformer.main import Jsonformer
from jsonformer.format import highlight_values
import sys
# Add the parent directory to the import path so the `configs` package below
# can be resolved; this line must run before the `configs` import.
sys.path.append('..')
from configs.configs import invoice_schema, get_prompt

# Build a schema-constrained generator: the model fills in values for the
# fields of `invoice_schema`, prompted with the text extracted from the PDF.
# NOTE(review): max_number_tokens=3072 looks very large for a per-number token
# budget — confirm against Jsonformer's defaults; possibly meant for another limit.
builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=invoice_schema,
    prompt=get_prompt(text),
    temperature=0.05,
    max_array_length=100,
    max_number_tokens=3072,
    max_string_token_length=80
)

print("Generating...")
# Calling the builder runs generation; the result is reused by the rendering cell.
output = builder()

# Print the generated structure with the model-produced values highlighted.
highlight_values(output)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating...
{
  invoice_number: [32m"22806"[0m,
  invoice_date: [32m"22/11/2023"[0m,
  supplier_name: [32m"Direct Produce Supplies LTD"[0m,
  customer_name: [32m"Integrated Service Solutions Limited"[0m,
  total_net_amount: [32m"15,260.85"[0m,
  total_vat_amount: [32m"3,052.17"[0m,
  total_invoice_amount: [32m"18,313.02"[0m,
  line_items: [
    {
      product_code: [32m"ISS SP - Picking"[0m,
      description: [32m"Service Description"[0m,
      quantity: [32m"178,082"[0m,
      price_per_unit: [32m"0.090"[0m,
      vat_percent: [32m"9.0%"[0m,
      net_price: [32m"10,641.96"[0m
    },
    {
      product_code: [32m"ISS SP - Picking"[0m,
      description: [32m"Service Description"[0m,
      quantity: [32m"105,084"[0m,
      price_per_unit: [32m"0.090"[0m,
      vat_percent: [32m"9.0%"[0m,
      net_price: [32m"2,856.15"[0m
    }
  ],
  due_date: [32m"22/11/2023"[0m,
  payment_date: [32m"22/11/2023"[0m,
  currency: [32m"GBP"[0m,
  payment_

In [5]:
from json2html import json2html
from IPython.display import display, HTML
# Turn the generated structure into HTML table markup and render it in the cell output.
response = json2html.convert(json=output)
display(HTML(data=response))

product_code,description,quantity,price_per_unit,vat_percent,net_price
ISS SP - Picking,Service Description,178082.0,0.09,9.0%,10641.96
ISS SP - Picking,Service Description,105084.0,0.09,9.0%,2856.15
invoice_number,22806,,,,
invoice_date,22/11/2023,,,,
supplier_name,Direct Produce Supplies LTD,,,,
customer_name,Integrated Service Solutions Limited,,,,
total_net_amount,15260.85,,,,
total_vat_amount,3052.17,,,,
total_invoice_amount,18313.02,,,,
line_items,"product_codedescriptionquantityprice_per_unitvat_percentnet_priceISS SP - PickingService Description178,0820.0909.0%10,641.96ISS SP - PickingService Description105,0840.0909.0%2,856.15",,,,

product_code,description,quantity,price_per_unit,vat_percent,net_price
ISS SP - Picking,Service Description,178082,0.09,9.0%,10641.96
ISS SP - Picking,Service Description,105084,0.09,9.0%,2856.15

0,1
iban,HSBC UK Bank Plc
swift,40-03-21
account_number,2236 6789
