This notebook assumes that the Landing AI and LangSmith API keys have already been exported as environment variables (the code below reads the Landing AI key from `LANDING_AI_API_KEY`).

In [30]:
import logging

# Configure the root logger at DEBUG level so that debug records emitted by
# libraries (e.g. urllib3's connection logs for the HTTP request made below)
# are shown in the cell output.
logging.basicConfig(level=logging.DEBUG)

In [32]:
import os
import json
import requests
from langsmith import traceable
# , add_tags, set_custom_metadata

@traceable(name="Landing AI PDF Schema Extraction")
def extract_schema_from_pdf(pdf_path: str, schema_path: str) -> dict:
    """Extract schema-defined fields from a PDF via Landing AI's document-analysis endpoint.

    The JSON schema stored at ``schema_path`` is sent as the ``fields_schema``
    form field together with the PDF upload. The ``extracted_schema`` entry of
    the response's ``data`` object is returned.

    Args:
        pdf_path: Path of the PDF file to upload.
        schema_path: Path of the JSON file describing the fields to extract.

    Returns:
        The value of ``data["extracted_schema"]`` from the JSON response.

    Raises:
        KeyError: If ``LANDING_AI_API_KEY`` is not set in the environment, or
            the response JSON lacks ``data`` / ``extracted_schema``.
        OSError: If the schema or PDF file cannot be opened.
        json.JSONDecodeError: If the schema file is not valid JSON.
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # Load Landing AI key from env; fail before touching any file if it is missing.
    landing_ai_api_key = os.environ["LANDING_AI_API_KEY"]

    headers = {"Authorization": f"Basic {landing_ai_api_key}"}
    url = "https://api.va.landing.ai/v1/tools/agentic-document-analysis"

    pdf_name = os.path.basename(pdf_path)
    schema_name = os.path.basename(schema_path)

    print(pdf_name)
    print(schema_name)

    # Load the schema and re-serialise it: this validates the JSON locally and
    # sends it as a single form-field string.
    with open(schema_path, "r", encoding="utf-8") as schema_file:
        schema = json.load(schema_file)
    payload = {"fields_schema": json.dumps(schema)}

    # Keep the PDF handle inside a context manager so it is closed even when
    # the request raises; the upload is fully sent before post() returns.
    with open(pdf_path, "rb") as pdf_file:
        files = [
            ("pdf", (pdf_name, pdf_file, "application/pdf")),
        ]
        response = requests.post(url, headers=headers, files=files, data=payload)

    # NOTE(review): raise_for_status() only rejects 4xx/5xx responses; other
    # 2xx codes such as 206 pass through — confirm whether partial results
    # need separate handling.
    response.raise_for_status()

    # Parse the body once and reuse it for both the debug print and the result.
    output_data = response.json()["data"]
    print(output_data)
    return output_data["extracted_schema"]

# === Usage ===
# Run the extraction once against a sample document and pretty-print the result.
if __name__ == "__main__":
    # Sample inputs, given relative to the current working directory.
    schema_path = "../schema/main_usage_schema.json"
    pdf_path = "../landing-ai-sandbox-data/input/test_pdfs/document_0.pdf"

    extracted_info = extract_schema_from_pdf(
        pdf_path=pdf_path,
        schema_path=schema_path,
    )

    # Two-space indented JSON keeps the nested result readable.
    print(json.dumps(extracted_info, indent=2))

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.va.landing.ai:443


document_0.pdf
main_usage_schema.json


DEBUG:urllib3.connectionpool:https://api.va.landing.ai:443 "POST /v1/tools/agentic-document-analysis HTTP/1.1" 206 234059


{'markdown': 'Mansfield Power and Gas, LLC\nMaster Commercial Gas Sales Agreement <!-- marginalia, from page 0 (l=0.117,t=0.040,r=0.453,b=0.073), with ID 72ca7bd2-c083-4bfb-9bf5-173e0a8f572b -->\n\nSummary : This image is a corporate logo for Mansfield, featuring stylized initials and a tagline.\n\nlogo: Mansfield Energy Simplified\n\nLogo Elements :\n  • Stylized "M" and "E" initials form a geometric, interlocking design on the left.\n  • To the right, the company name "Mansfield" appears in bold, sans-serif font.\n  • Below "Mansfield," the tagline reads: "Energy. Simplified."\n  • The entire logo is rendered in grayscale.\n\nDimensions & Placement :\n  • The logo is horizontally oriented, with the icon on the left and text on the right.\n  • Tagline is smaller and lighter than the company name, aligned beneath it.\n\nAnalysis :\n  • The logo uses a clean, modern design to convey professionalism and simplicity, aligning with the tagline "Energy. Simplified." <!-- figure, from page 0 