# Get PDF

In [1]:
from pathlib import Path
# Parent of the current working directory, resolved to an absolute path.
ROOT_DIR = Path().resolve().parent

# PDF report that the following cells read.
pdf_path = ROOT_DIR.joinpath('data', 'pure_pdfs', "PO166939-204865.pdf")

# Extract text

In [2]:
import pdfplumber

def extract_text_from_pdf(pdf_path, max_pages=1):
    """Return the text of the leading page(s) of a PDF.

    Args:
        pdf_path: Location of the PDF; passed straight to ``pdfplumber.open``.
        max_pages: Number of pages to read, counted from the first page.
            Defaults to 1 (first page only); ``None`` reads every page.

    Returns:
        The extracted text as a single string, pages joined by a newline.
        An empty string when the PDF has no pages or no extractable text.

    Raises:
        ValueError: If ``max_pages`` is neither ``None`` nor at least 1.
    """
    if max_pages is not None and max_pages < 1:
        raise ValueError("max_pages must be a positive integer or None")

    with pdfplumber.open(pdf_path) as pdf:
        # Guard against extract_text() returning None for a page with no text
        # (reported for some pdfplumber versions) so the join below cannot fail.
        page_texts = [page.extract_text() or "" for page in pdf.pages[:max_pages]]

    return "\n".join(page_texts)

# Extract the report text and echo it so the result can be checked by eye.
text = extract_text_from_pdf(pdf_path)
print("Extracted Text:\n", text, sep=" ")

Extracted Text:
 Intake QC Inspection Summary Report
Customer DPS PO 166939 / ISS PO 204865
Supplier Code : GUIM€
04/09/2024
Supplier : Guimera Fruits
Vehicle No :
COO : Spain Vessel : Expected ETA : 03/09/2024 00:00:00
Category : Stone Fruit Haulier : Received : 03/09/2024 09:59:25
DP : 7 - ISS Linton Temperature : Min 4.20; Max 5.10; Avg 4.58 Recorder (s) : NO Inspection Date : 04/09/2024 06:35:59
ISS Cust Supplier Pallet GGN Number Harvest Date End Brix Pressure Maturity% Total Est RAG
Product Details Pallet Pallet ID /Grower D Beis sp tl a By e fU on reti l Customer Packhouse Organic PLU Minor Major Waste QA Comments Avg % Avg Kg 1 2 3 4 5 Defects Yield
Apricots 20x320g 7761770 5808239 5808239 30/08/2024 Tesco NO 3.3% 1.67% 0.00% Dry splits. Scarring. Isolated 16.04 0.00 0, 0, 0, 0, 0 5.0% 100% BLUE
Punnet (FARDA) RAH 00:00:00 puncture. Isolated 80
40/45 10/09/2024 condensation punnets. Isolated
00:00:00 underweight punnets.
10/09/2024 .
00:00:00
Apricots 20x320g 7763099 7763099 Te

# Using a very small model due to Colab limitations

In [3]:
import os

# Set this BEFORE transformers (and therefore huggingface_hub) is imported:
# huggingface_hub evaluates the flag when it is imported, so assigning it
# afterwards does not silence the symlink warning.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


print("Loading model and tokenizer...")
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Load the weights directly in float16 rather than loading in the default
# precision and calling .half() afterwards; the end result is the same
# half-precision model without the transient full-precision copy in memory.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_cache=True,
    device_map="auto",
    torch_dtype=torch.float16,
)

# NOTE(review): use_cache looks like a model/generation option rather than a
# tokenizer option - confirm it is needed here before removing it.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_cache=True)
print("Loaded model and tokenizer")

Loading model and tokenizer...
Loaded model and tokenizer


In [4]:
from jsonformer.main import Jsonformer
from jsonformer.format import highlight_values
import sys

# Make the `configs` package importable. Use the absolute ROOT_DIR computed in
# the first cell instead of the relative '..' so the import does not depend on
# the working directory at import time, and skip the append when the entry is
# already present so re-running this cell does not grow sys.path.
project_root = str(ROOT_DIR)
if project_root not in sys.path:
    sys.path.append(project_root)
from configs.report_config import qc_report_schema, get_qc_prompt

# Build a schema-driven generator: the output is filled in field by field
# according to qc_report_schema, prompted with the text extracted from the PDF.
# NOTE(review): max_number_tokens=3072 is far larger than
# max_string_token_length=100 - confirm this budget is intentional.
builder = Jsonformer(
    model=model,
    tokenizer=tokenizer,
    json_schema=qc_report_schema,
    prompt=get_qc_prompt(text),
    temperature=0.05,
    max_array_length=100,
    max_number_tokens=3072,
    max_string_token_length=100
)

print("Generating...")
output = builder()

highlight_values(output)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating...
{
  customer_dps_po: [32m"166939"[0m,
  iss_po: [32m"204865"[0m,
  supplier: [32m"Guimera Fruits"[0m,
  received_date: [32m"2024-04-09"[0m,
  inspection_date: [32m"2024-04-09"[0m,
  category: [32m"Stone Fruit Haulier"[0m,
  coo: [32m"Spain Vessel"[0m,
  line_items: [
    {
      iss_pallet: [32m"204865"[0m,
      suplier_pallet: [32m"Guimera Fruits"[0m,
      organic: [32m"5.0%"[0m,
      brix_avg_%: [32m"4.58"[0m,
      total_defects: [32m"7"[0m,
      EST_yield: [32m"3.3%"[0m,
      RAG: [32m"BLUE"[0m
    },
    {
      iss_pallet: [32m"204865"[0m,
      suplier_pallet: [32m"Guimera Fruits"[0m,
      organic: [32m"5.0%"[0m,
      brix_avg_%: [32m"4.58"[0m,
      total_defects: [32m"7"[0m,
      EST_yield: [32m"3.3%"[0m,
      RAG: [32m"BLUE"[0m
    },
    {
      iss_pallet: [32m"204865"[0m,
      suplier_pallet: [32m"Guimera Fruits"[0m,
      organic: [32m"5.0%"[0m,
      brix_avg_%: [32m"4.58"[0m,
      total_defects: 

In [5]:
from json2html import json2html
from IPython.display import display, HTML
# Convert the generated report to HTML and render it inline in the notebook.
response = json2html.convert(json=output)
display(HTML(data=response))

iss_pallet,suplier_pallet,organic,brix_avg_%,total_defects,EST_yield,RAG
204865,Guimera Fruits,5.0%,4.58,7.0,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7.0,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7.0,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7.0,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7.0,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7.0,3.3%,BLUE
customer_dps_po,166939,,,,,
iss_po,204865,,,,,
supplier,Guimera Fruits,,,,,
received_date,2024-04-09,,,,,

iss_pallet,suplier_pallet,organic,brix_avg_%,total_defects,EST_yield,RAG
204865,Guimera Fruits,5.0%,4.58,7,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7,3.3%,BLUE
204865,Guimera Fruits,5.0%,4.58,7,3.3%,BLUE
