# Get pdf

In [1]:
from pathlib import Path
ROOT_DIR = Path().resolve().parent

pdf_dir = ROOT_DIR / 'data' / 'qc_templates'

import os

files = os.listdir(pdf_dir)
pdfs = [file for file in files if file.endswith('.pdf')]
pdf_path = pdf_dir / pdfs[-1]

# Extract text

In [2]:
import pdfplumber
from tqdm.auto import tqdm

def extract_text_and_tables(pdf_path):
    all_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number in tqdm(range(len(pdf.pages)), desc="Processing Pages", unit="page"):
            page = pdf.pages[page_number]

            # Extract text
            text = page.extract_text()
            all_text.append(f"\n--- Page {page_number + 1} ---\n" + (text if text else ""))

    return all_text

all_text = extract_text_and_tables(pdf_path)
all_text

Processing Pages:   0%|          | 0/30 [00:00<?, ?page/s]

['\n--- Page 1 ---\nIntake QC Inspection Summary Report 19/01/2025\nCustomer GNK€ PO 177193\nSupplier Code :GNK€\nSupplier :G.GIANNAKAKIS SA\nCOO : Greece\nVehicle No : Expected ETA : 18/01/2025 00:00:00\nVessel : DELAPORTAS Haulier : Received : 19/01/2025 08:15:09\nDP : 51 - Glinwell Temperature : Min 0.00; Max 0.00; Avg 0.00 Recorder (s) : NO Inspection Date : 19/01/2025 12:32:00\nProduct Details Pallet Cust Supplier Pallet GGN Number Harvest End Packhouse Organic PLU Minor Major Waste QA Comments Brix Pressure Maturity% Total Est RAG\nID Pallet ID /Grower Date Customer Avg % Avg Kg 1 2 3 4 5 Defects Yield\nCucumbers 18 Loose 6194840 6194840 Tesco NO 5.6% 0.00% 0.00% No major issues, flowering 0.00 0.00 0, 0, 0, 0, 0 5.6% 100.00 BLUE\n(N/A) LSEEX and untidy wrap present. 150\nCucumbers 18 Loose 6194843 6194843 Tesco NO 5.6% 0.00% 0.00% No major issues, flowering 0.00 0.00 0, 0, 0, 0, 0 5.6% 100.00 BLUE\n(N/A) LSEEX and untidy wrap present. 150\nCucumbers 28 Loose 6194842 6194842 Tesc

In [3]:
import re
# Regular expression pattern to match "Intake Pallet QC Inspection Report" with a page header
pattern = re.compile(r"^\n--- Page \d+ ---\nIntake Pallet QC Inspection Report", re.MULTILINE)

# Filtering the list
filtered_blocks = [block for block in all_text if pattern.search(block)]
last_block_len = len(filtered_blocks[-1])

# remove last pages with only images
final_items = [block for block in filtered_blocks if len(block)>last_block_len]

print(final_items[0])


--- Page 4 ---
Intake Pallet QC Inspection Report
Customer GNK€ PO 177193
Supplier Code: GNK€
Supplier : G.GIANNAKAKIS SA
COO: Greece
Vehicle No : Received : 19/01/2025 08:15:09
Vessel: DELAPORTAS Inspection Date : 19/01/2025 12:32:00 Print date : 19/01/2025
Cucumbers 18 Loose
Pallet ID : 6194840 Freshness Technology :
Supplier Pallet ID : 6194840 Punnet / Pad Type : / N/A
Customer Pallet ID : Outer : / Tray Type :
Variety : Not Available Brand : LSEEX / Class : 1
Grower : / Organic? : NO Does Pallet Meet Spec? : YES
GGN : PLU? :
BLUE Expected Qty 150
Orchard/Farm : End Customer : Tesco
Received Qty 150
Harvest Date : DP : 51
Total Defects : 5.56%
Size/Calibre : Packhouse :
Estimated Yield : 100.00%
Lot Number : 676779 Inspector : Marzena.Buslowicz
BB/DU Date : 27/01/2025 Ribon/Promo : N/A / N/A
Alphanumeric Date Code : A27
Minor : Miswrap/Untidy Wrap: 4.63%
Minor : Flowering: 0.93%
BB/DU Printed match on Pack and Label? : YES Weight Printed: 260/360 Label Colour : YELLOW
COO Printed 

# Using very small model due to collab limitations

In [4]:
import os
import json
import torch
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import sys

# Ensure the parent directory is in the path
sys.path.append('..')
from configs.report_config import get_qc_prompt

# Load 4-bit Quantized Model
print("Loading model and tokenizer...")

model_name = "Qwen/Qwen2.5-14B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_compute_dtype=torch.bfloat16  # Set computation dtype
)

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,  # Apply quantization during loading
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

print("Model loaded successfully!")

# Create output directory
output_folder = "qc_reports_14b"
os.makedirs(output_folder, exist_ok=True)

# Function to generate JSON output directly from the model
def generate_json_output(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    with torch.no_grad():
        outputs = model.generate(**inputs)
    
    # Decode and clean up response
    response = tokenizer.decode(outputs[0])
    return response

responses = []
# Process each page
for i, page in enumerate(tqdm(final_items, desc="Processing Pages", unit="page")):
    prompt = get_qc_prompt(page)  # Get structured prompt
    response = generate_json_output(prompt)  # Ask model directly for JSON
    responses.append(response)
    if i > 1:
        break
    # Save JSON output
    # output_path = os.path.join(output_folder, f"qc_report_page_{i}.json")
    # with open(output_path, "w", encoding="utf-8") as json_file:
    #     json.dump(response, json_file, indent=4, ensure_ascii=False)

    # print(f"Saved: {output_path}")

Loading model and tokenizer...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Model loaded successfully!


Processing Pages:   0%|          | 0/26 [00:00<?, ?page/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [5]:
for response in responses:
    print(response[len(prompt):])

5 ---
Intake Pallet QC Inspection Report
Customer GNK€ PO 177193
Supplier Code: GNK€
Supplier : G.GIANNAKAKIS SA
COO: Greece
Vehicle No : Received : 19/01/2025 08:15:09
Vessel: DELAPORTAS Inspection Date : 19/01/2025 12:32:00 Print date : 19/01/2025
Cucumbers 18 Loose
Pallet ID : 6194840 Freshness Technology :
Supplier Pallet ID : 6194840 Punnet / Pad Type : / N/A
Customer Pallet ID : Outer : / Tray Type :
Variety : Not Available Brand : LSEEX / Class : 1
Grower : / Organic? : NO Does Pallet Meet Spec? : YES
GGN : PLU? :
BLUE Expected Qty 150
Orchard/Farm : End Customer : Tesco
Received Qty 150
Harvest Date : DP : 51
Total Defects : 5.56%
Size/Calibre : Packhouse :
Estimated Yield : 100.00%
Lot Number : 676779 Inspector : Marzena.Buslowicz
BB/DU Date : 27/01/2025 Ribon/Promo : N/A / N/A
Alphanumeric Date Code : A27
Minor : Miswrap/Untidy Wrap: 4.63%
Minor : Flowering: 0.93%
BB/DU Printed match on Pack and Label? : YES Weight Printed: 260/360 Label Colour : YELLOW
COO Printed match on P

In [11]:
for response in responses:
    print(response[len(prompt):])

xes Inspected : 108
--- Page 5 ---
Intake Pallet QC Inspection Report
Customer GNK€ PO 177193
Supplier Code: GNK€
Supplier : G.GIANNAKAKIS SA
COO: Greece
Vehicle No : Received : 19/01/2025 08:15:09
Vessel: DELAPORTAS Inspection Date : 19/01/2025 12:32:00 Print date : 19/01/2025
Cucumbers 18 Loose
Pallet ID : 6194840 Freshness Technology :
Supplier Pallet ID : 6194840 Punnet / Pad Type : / N/A
Customer Pallet ID : Outer : / Tray Type :
Variety : Not Available Brand : LSEEX / Class : 1
Grower : / Organic? : NO Does Pallet Meet Spec? : YES
GGN : PLU? :
BLUE Expected Qty 150
Orchard/Farm : End Customer : Tesco
Received Qty 150
Harvest Date : DP : 51
Total Defects : 5.56%
Size/Calibre : Packhouse :
Estimated Yield : 100.00%
Lot Number : 676779 Inspector : Marzena.Buslowicz
BB/DU Date : 27/01/2025 Ribon/Promo : N/A / N/A
Alphanumeric Date Code : A27
Minor : Miswrap/Untidy Wrap: 4.63%
Minor : Flowering: 0.93%
BB/DU Printed match on Pack and Label? : YES Weight Printed: 260/360 Label Colour : 

In [7]:
def extract_json_objects(text):
    json_pattern = r'\{.*?\}'
    json_matches = re.findall(json_pattern, text, re.DOTALL)  # Find all JSON objects
    json_objects = []
    
    for match in json_matches:
        try:
            json_obj = json.loads(match)  # Convert to dictionary
            json_objects.append(json_obj)
        except json.JSONDecodeError:
            continue  # Ignore invalid JSON
    
    return json_objects

extract_json_objects(responses[0])

[]

In [10]:
print(responses[0])

You are an advanced quality control report extractor. Extract structured quality control details from the provided text and return a **valid JSON object** that strictly follows this schema. **Do not return extra text, explanations, or comments.**

### **Schema:**
{
  "product_name": "string",
  "RAG": "string",
  "expected_qty": "string",
  "received_qty": "string",
  "supplier_code": "string",
  "supplier": "string",
  "coo": "string",
  "received_date": "YYYY-MM-DD HH:MM:SS",
  "inspection_date": "YYYY-MM-DD HH:MM:SS",
  "print_date": "YYYY-MM-DD",
  "iss_pallet_id": "integer",
  "supplier_pallet_id": "integer",
  "customer_pallet_id": "integer",
  "variety": "string",
  "brand": "string",
  "organic": "YES" or "NO",
  "does_pallet_meet_spec": "YES" or "NO",
  "end_customer": "string",
  "harvest_date": "YYYY-MM-DD",
  "dp": "string",
  "total_defects": "integer",
  "size_calibre": "integer",
  "lot_number": "string",
  "inspector": "string",
  "estimated_yield": "string",
  "defects