In [1]:
from pydantic import BaseModel
from openai import OpenAI
import json
import base64
from typing import List, Optional
from dotenv import load_dotenv
load_dotenv()


class InvoiceLineItem(BaseModel):
    # Schema for a single entry of `Invoice.line_items`.
    # Documented with `#` comments rather than a docstring on purpose: pydantic
    # copies a model's docstring into its JSON schema `description`, which
    # would alter the schema generated from this class.
    service_description: str  # free-text description of the billed service
    amount_without_vat: float  # net amount; presumably per unit — TODO confirm
    quantity: int  # integer quantity of the item
    total_amount: float  # NOTE(review): unclear whether net or gross — confirm

class InvoiceHeader(BaseModel):
    # Schema for the identifying / addressing details of an invoice; used as
    # the `header` field of `Invoice`.
    # Documented with `#` comments rather than a docstring on purpose: pydantic
    # copies a model's docstring into its JSON schema `description`, which
    # would alter the schema generated from this class.
    #
    # All values are kept as plain strings (including `date` and
    # `invoice_period`); no date parsing or format validation happens here.
    invoice_no: str
    customer_no: str
    invoice_period: str
    date: str
    customer_name: str
    customer_address: str
    # The remaining fields are optional and default to None when absent.
    vat_no: Optional[str] = None
    contact_person_name: Optional[str] = None
    contact_person_phone: Optional[str] = None

class Invoice(BaseModel):
    # Top-level schema for one parsed invoice. This is the model handed to
    # `client.responses.parse(..., text_format=Invoice)` further down.
    # Documented with `#` comments rather than a docstring on purpose: pydantic
    # copies a model's docstring into its JSON schema `description`, which
    # would alter the schema sent with the request.
    header: InvoiceHeader  # identifying / addressing details
    line_items: List[InvoiceLineItem]  # itemised entries
    # Invoice totals. No cross-field consistency check is performed here
    # (e.g. that net + VAT equals gross).
    total_without_vat: float
    vat_amount: float
    gross_amount_incl_vat: float
    # Payment details, stored as unvalidated strings.
    terms_of_payment: str
    bank_iban: str
    bank_bic: str

# Instantiate the OpenAI client. No API key is passed explicitly, so the SDK
# falls back to its environment-based configuration (load_dotenv() was called
# above to populate the environment from a local .env file).
client = OpenAI()

# Load the PDF and base64-encode it so it can be embedded inline in the request.
PDF_PATH = "sample-invoice.pdf"
with open(PDF_PATH, "rb") as pdf_file:
    pdf_b64 = base64.b64encode(pdf_file.read()).decode()

# Call the Responses API and have the reply parsed against the `Invoice` schema.
# The PDF has to be sent as an `input_file` content part. Passing the data URL
# as a plain string would make the model see the base64 characters as ordinary
# text instead of a document.
response = client.responses.parse(
    model="gpt-4.1-mini",
    input=[
        {"role": "system", "content": "You are a precise invoice parser. you will receive a PDF invoice and must extract structured data from it."},
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "filename": PDF_PATH,
                    "file_data": f"data:application/pdf;base64,{pdf_b64}",
                },
            ],
        },
    ],
    text_format=Invoice,
)


In [22]:
# Decode the model's JSON reply into a plain dict.
# `output_text` gathers the text parts of the response, so this no longer
# assumes the message is the first item in `response.output` (nor that the
# text is its first content part).
data = json.loads(response.output_text)

In [23]:

# Write the extracted invoice dict to disk as 2-space-indented JSON.
with open("invoice_output.json", mode="w") as out_file:
    print("Saving structured invoice data... ")
    serialized = json.dumps(data, indent=2)
    out_file.write(serialized)

Saving structured invoice data... 
