In [1]:
from openai import OpenAI
from config.open_ai.dps_qc_report import get_qc_prompt

prompt = get_qc_prompt()

client = OpenAI()

assistant = client.beta.assistants.create(
  name="DPS QC Report Assistant",
  instructions=prompt,
  model="gpt-4o",
)

In [2]:
from config.settings import SRC_DIR
import os
os.chdir(SRC_DIR)

In [3]:
from document_processing_system.components.data_ingestion import DataIngestion
from document_processing_system.components.data_extraction import DataExtraction

pdf = 'PO166939-204865'

data_ingester = DataIngestion(f'qc_data/qc_templates/{pdf}.pdf')
data_extracter = DataExtraction(data_ingester)
pages, tables, images = data_extracter.extract_data()
all_items = data_extracter.filter_pages(pages)
data_extracter.display_data(all_items)

Extracting pages:   0%|          | 0/15 [00:00<?, ?pages/s]

Filtering pages:   0%|          | 0/15 [00:00<?, ?pages/s]


--------Page: 2-----------
Intake Pallet QC Inspection Report
Customer DPS PO 166939 / ISS PO 204865
Supplier Code : GUIM€
Supplier : Guimera Fruits
COO : Spain
Vehicle No : Received : 03/09/2024 09:59:25
Vessel : Inspection Date : 04/09/2024 06:35:59 Print date : 04/09/2024
Apricots 20x320g Punnet
ISS Pallet ID : 7761770 Freshness Technology :
Supplier Pallet ID : 5808239 Punnet / Pad Type : N/A /
Customer Pallet ID : 5808239 Outer :
Variety : Fardao Brand : CORE
Grower : / Organic? : NO Does Pallet Meet Spec? : YES
GGN : / PLU? :
BLUE Expected Qty 80
Orchard/Farm : End Customer : Tesco
Received Qty 80
Harvest Date : 30/08/2024 DP : 7
Total Defects : 5.00%
Size/Calibre : 40/45 Packhouse :
Estimated Yield : 100%
Lot Number : 508164 Inspector : Hanna.Dziuba
Minor : Dry Splits: 1.67%
Minor : Puncture: 1.67%
Major : Scarring: 1.67%
Defects Tot : 0.00% Defects Fruit Total : 0 Packs With Defects : 0.00%
Waste Tot : 0.00% Waste Fruit Total : 0 Packs With Waste : 0%
Minor Defects Tot : 3.33%

In [4]:
data = ''
for item in all_items:
    data += item

data

'\n--------Page: 2-----------\nIntake Pallet QC Inspection Report\nCustomer DPS PO 166939 / ISS PO 204865\nSupplier Code : GUIM€\nSupplier : Guimera Fruits\nCOO : Spain\nVehicle No : Received : 03/09/2024 09:59:25\nVessel : Inspection Date : 04/09/2024 06:35:59 Print date : 04/09/2024\nApricots 20x320g Punnet\nISS Pallet ID : 7761770 Freshness Technology :\nSupplier Pallet ID : 5808239 Punnet / Pad Type : N/A /\nCustomer Pallet ID : 5808239 Outer :\nVariety : Fardao Brand : CORE\nGrower : / Organic? : NO Does Pallet Meet Spec? : YES\nGGN : / PLU? :\nBLUE Expected Qty 80\nOrchard/Farm : End Customer : Tesco\nReceived Qty 80\nHarvest Date : 30/08/2024 DP : 7\nTotal Defects : 5.00%\nSize/Calibre : 40/45 Packhouse :\nEstimated Yield : 100%\nLot Number : 508164 Inspector : Hanna.Dziuba\nMinor : Dry Splits: 1.67%\nMinor : Puncture: 1.67%\nMajor : Scarring: 1.67%\nDefects Tot : 0.00% Defects Fruit Total : 0 Packs With Defects : 0.00%\nWaste Tot : 0.00% Waste Fruit Total : 0 Packs With Waste :

In [18]:
import openai
from pydantic import BaseModel
from typing import List, Optional
import json

# Define the Pydantic model
class QualityControlReportExtraction(BaseModel):
    product_name: str
    RAG: str
    expected_qty: str
    received_qty: str
    supplier_code: str
    supplier: str
    coo: str
    received_date: str
    inspection_date: str
    print_date: str
    iss_pallet_id: str
    supplier_pallet_id: str
    customer_pallet_id: str
    variety: str
    brand: str
    organic: str
    does_pallet_meet_spec: str
    end_customer: str
    harvest_date: str
    dp: str
    total_defects: int
    size_calibre: str
    lot_number: str
    inspector: str
    estimated_yield: str
    defects_tot: int
    defects_fruit_total: int
    packs_with_defects: int
    waste_tot: int
    waste_fruit_total: int
    packs_with_waste: int
    minor_defects_tot: int
    minor_fruit_total: int
    major_defects_tot: int
    major_fruit_total: int
    packs_with_major: int
    box_pack_weights: str
    weight_readings: str
    fruit_weights: str
    qa_comments: Optional[str] = None
    packs_fruits_inspected_sample_size: int
    boxes_inspected: int

# Define the prompt
REPORT_PROMPT = """
You are an advanced Quality Control Report Extractor. Your task is to extract structured quality control details from 
the provided PDF and return a **list of valid JSON objects**, each strictly conforming to the schema below.

### **Instructions:**
1. **Skip the first few pages** containing summary reports.
2. **Each relevant page contains exactly one product's details.**
3. **Ignore pages that contain only images** or do not have structured text data.
4. **Extract only what is explicitly mentioned**; do not infer missing values.

### **Schema for Each Product:**
Each extracted product must contain the following fields exactly as they appear in the document:

- `product_name` (string): Name of the product.
- `RAG` (string): RAG value (e.g., "BLUE", "AMBER", "GREEN").
- `expected_qty` (string): Expected quantity.
- `received_qty` (string): Received quantity.
- `supplier_code` (string): Supplier code.
- `supplier` (string): Supplier name.
- `coo` (string): Country of Origin.
- `received_date` (string): Date/time when received, format: `"YYYY-MM-DD HH:MM:SS"`.
- `inspection_date` (string): Date/time of inspection, format: `"YYYY-MM-DD HH:MM:SS"`.
- `print_date` (string): Print date, format: `"YYYY-MM-DD"`.
- `iss_pallet_id` (string): ISS Pallet ID.
- `supplier_pallet_id` (string): Supplier Pallet ID.
- `customer_pallet_id` (string): Customer Pallet ID.
- `variety` (string): Variety of the product.
- `brand` (string): Brand name.
- `end_customer` (string): End customer.
- `harvest_date` (string): Date harvested, format: `"YYYY-MM-DD"`.
- `dp` (string): DP code.
- `total_defects` (integer): Total number of defects.
- `size_calibre` (string): Size calibre.
- `lot_number` (string): Lot number.
- `inspector` (string): Inspector's name.
- `estimated_yield` (string): Estimated yield.
- `defects_tot` (integer): Total defects count.
- `defects_fruit_total` (integer): Total defects in the fruit.
- `packs_with_defects` (integer): Number of packs with defects.
- `waste_tot` (integer): Total waste.
- `waste_fruit_total` (integer): Fruit waste count.
- `fruit_weights` (string): Fruit weights.
- `qa_comments` (string): Any QA comments.
- `packs_fruits_inspected_sample_size` (integer): Sample size for inspection.
- `boxes_inspected` (integer): Number of boxes inspected.

### **Extraction Rules:**
- **DO NOT infer missing values.** If a field is absent or empty in the document, return `null`.
- **Extract values exactly as they appear** without modification.
- **Do not generate assumptions** about missing data.
- **Ensure valid JSON output** where all extracted products are structured as a list of JSON objects.
"""

# Define a function to send the request for extraction
def get_qc_prompt(prompt: str = REPORT_PROMPT):
    return prompt

# Requesting extraction from OpenAI API using the new method
response = openai.beta.chat.completions.parse(
    model="gpt-4",
    messages=[
        {"role": "user", "content": data[:5000]}
    ],
    max_tokens=3000,
    temperature=0.5,
    stop=["\n"],
    response_format=QualityControlReportExtraction
)

# Parse the JSON response
response_text = response['choices'][0]['message']['content']
# Convert text to JSON
try:
    qc_reports = json.loads(response_text)
    products = [QualityControlReportExtraction(**report) for report in qc_reports]  # Convert to Pydantic models
    # Example: Iterate over the extracted list
    for product in products:
        print(product.json())
except Exception as e:
    print(f"Error parsing the response: {e}")


BadRequestError: Error code: 400 - {'error': {'message': "Invalid parameter: 'response_format' of type 'json_schema' is not supported with this model. Learn more about supported models at the Structured Outputs guide: https://platform.openai.com/docs/guides/structured-outputs", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [14]:
response

ParsedChatCompletion[NoneType](id='chatcmpl-AzMAtWyRLt66tDFPSuekraUMxpdzx', choices=[ParsedChoice[NoneType](finish_reason='stop', index=0, logprobs=None, message=ParsedChatCompletionMessage[NoneType](content='[', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=None))], created=1739186611, model='gpt-4-0613', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=1, prompt_tokens=2610, total_tokens=2611, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

In [None]:

# Example: Iterate over the extracted list
for product in qc_reports:
    print(product.json())

QualityControlReportExtraction(product_name='Apricots 20x320g Punnet', RAG='BLUE', expected_qty='80', received_qty='80', supplier_code='GUIM€', supplier='Guimera Fruits', coo='Spain', received_date='2024-09-03 09:59:25', inspection_date='2024-09-04 06:35:59', print_date='2024-09-04', iss_pallet_id='7761770', supplier_pallet_id='5808239', customer_pallet_id='5808239', variety='Fardao', brand='CORE', organic='NO', does_pallet_meet_spec='YES', end_customer='Tesco', harvest_date='2024-08-30', dp='7', total_defects=5, size_calibre='40/45', lot_number='508164', inspector='Hanna.Dziuba', estimated_yield='100%', defects_tot=0, defects_fruit_total=0, packs_with_defects=0, waste_tot=0, waste_fruit_total=0, packs_with_waste=0, minor_defects_tot=3, minor_fruit_total=2, major_defects_tot=1, major_fruit_total=1, packs_with_major=0, box_pack_weights='Avg 340g ; Min 326g; Max 367g Underweight 0%', weight_readings='326 326 328 330 330 331 332 333 334 334 342 343 343 347 349 350 352 353 353 367', fruit_