# Test usage and effect of Landing-AI: Agentic Doc - Directly from .pdf to expected json file

## Installation

In [10]:
pip install agentic-doc

Collecting agentic-doc
  Downloading agentic_doc-0.3.0-py3-none-any.whl.metadata (19 kB)
Collecting boto3<2.0.0,>=1.38.23 (from agentic-doc)
  Downloading boto3-1.39.4-py3-none-any.whl.metadata (6.6 kB)
Collecting google-api-python-client<3.0.0,>=2.170.0 (from agentic-doc)
  Downloading google_api_python_client-2.176.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth<3.0.0,>=2.40.2 (from agentic-doc)
  Downloading google_auth-2.40.3-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting google-auth-oauthlib<2.0.0,>=1.2.2 (from agentic-doc)
  Downloading google_auth_oauthlib-1.2.2-py3-none-any.whl.metadata (2.7 kB)
Collecting jsonschema<5.0.0,>=4.24.0 (from agentic-doc)
  Downloading jsonschema-4.24.0-py3-none-any.whl.metadata (7.8 kB)
Collecting opencv-python-headless<5.0.0.0,>=4.11.0.86 (from agentic-doc)
  Downloading opencv_python_headless-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl.metadata (19 kB)
Collecting pillow-heif>=0.17.0 (from agentic-doc)
  Downloading pillow_heif-1.0.0-

Assume that the `VISION_AGENT_API_KEY` was properly requested and added to env

In [1]:
import os
api_key = os.getenv("VISION_AGENT_API_KEY")

----

## Sample Code From Github

In [None]:
from pydantic import BaseModel, Field
from agentic_doc.parse import parse

class ExtractedFields(BaseModel):
    employee_name: str = Field(description="the full name of the employee")
    employee_ssn: str = Field(description="the social security number of the employee")
    gross_pay: float = Field(description="the gross pay of the employee")
    employee_address: str = Field(description="the address of the employee")

results = parse("mydoc.pdf", extraction_model=ExtractedFields)
fields = results[0].extraction
metadata = results[0].extraction_metadata
print(f"Field value: {fields.employee_name}, confidence: {metadata.employee_name.confidence}")

## Single File .pdf to .json extraction

In [5]:
import json
import requests

headers = {"Authorization": f"Basic {api_key}"}
url = "https://api.va.landing.ai/v1/tools/agentic-document-analysis"

base_pdf_path = "../landing-ai-sandbox-data/input/test_pdfs/"  # Replace with the path to the file
pdf_name = "document_0.pdf"  # Replace the file
pdf_path = f"{base_pdf_path}/{pdf_name}"

schema_path = "../schema/"
schema_name = "main_usage_schema.json"  # Replace with the JSON schema
schema_path = f"{schema_path}/{schema_name}"

with open(schema_path, "r") as file:
    schema = json.load(file)

files = [
    ("pdf", (pdf_name, open(pdf_path, "rb"), "application/pdf")),
]

payload = {"fields_schema": json.dumps(schema)}

response = requests.request("POST", url, headers=headers, files=files, data=payload)

output_data = response.json()["data"]
extracted_info = output_data["extracted_schema"]
print(extracted_info)

{'documentId': 'Mansfield Power and Gas, LLC Master Commercial Gas Sales Agreement', 'documentType': 'contract', 'statementDate': '2019-06-27', 'contractStart': '2019-08-01', 'contractEnd': '2020-07-31', 'customerName': 'Pangea Ventures, LLC', 'commodity': 'natural_gas', 'unit': 'therms', 'deliveryCharge': 0, 'supplyCharge': 0, 'taxCharge': 0, 'totalUsage': 2460000, 'deliveryRate': 0, 'supplyRate': 0.279, 'taxRate': 0, 'usageHistory': [{'month': 'Jan', 'usage': 500000}, {'month': 'Feb', 'usage': 450000}, {'month': 'Mar', 'usage': 300000}, {'month': 'Apr', 'usage': 200000}, {'month': 'May', 'usage': 100000}, {'month': 'June', 'usage': 65000}, {'month': 'July', 'usage': 35000}, {'month': 'Aug', 'usage': 35000}, {'month': 'Sept', 'usage': 55000}, {'month': 'Oct', 'usage': 75000}, {'month': 'Nov', 'usage': 250000}, {'month': 'Dec', 'usage': 400000}], 'locations': [{'accountNumber': '0603935855-00006', 'serviceAddress': '5250 W JACKSON BLVD, CHICAGO, IL 60644', 'meterNumber': None, 'commodi

In [24]:
print(json.dumps(response.json(), indent=2))

{
  "data": {
    "markdown": "Mansfield Power and Gas, LLC  \nMaster Commercial Gas Sales Agreement <!-- marginalia, from page 0 (l=0.117,t=0.040,r=0.453,b=0.073), with ID 5aa9c10a-4f9c-488d-a56e-bd22750a2614 -->\n\nlogo: Mansfield Energy, Simplified\n\nVisible Elements :\n  \u2022 Stylized \"M\" graphic on the left.\n  \u2022 Text \"Mansfield\" in bold to the right of the graphic.\n  \u2022 Tagline below \"Mansfield\" reads \"Energy, Simplified\".\n\nDesign Details :\n  \u2022 Monochrome (grayscale) color scheme.\n  \u2022 Horizontal layout: logo graphic on the left, text on the right.\n  \u2022 Tagline is in a lighter, smaller font beneath the main company name.\n\nAnalysis :\n  \u2022 The logo combines a bold, geometric \"M\" with clear, modern typography to convey a professional and streamlined brand identity for Mansfield, emphasizing simplicity in energy solutions. <!-- figure, from page 0 (l=0.746,t=0.037,r=0.886,b=0.073), with ID b9315488-b66b-4990-9bce-f4446541b939 -->\n\nTHI

In [9]:
!mkdir "../landing-ai-sandbox-data/direct_pdf_to_json"

In [17]:
with open("../landing-ai-sandbox-data/direct_pdf_to_json/document_0_extracted.json", "w") as file:
    json.dump(extracted_info, file, indent=4)

----

## Multiple files .pdf to .json extraction

In [31]:
def list_pdf_files(directory_path):
    pdf_files = [file for file in os.listdir(directory_path)
                 if file.lower().endswith(".pdf") and os.path.isfile(os.path.join(directory_path, file))]
    return pdf_files

Initial attempt

In [42]:
import random 

headers = {"Authorization": f"Basic {api_key}"}
url = "https://api.va.landing.ai/v1/tools/agentic-document-analysis"

base_pdf_path = "../landing-ai-sandbox-data/input/test_pdfs/"  # Replace with the path to the file
pdf_names = random.sample(list_pdf_files("../landing-ai-sandbox-data/input/test_pdfs/"), 10)  # Replace the file
pdf_path = f"{base_pdf_path}/{pdf_name}"

schema_path = "../schema/"
schema_name = "main_usage_schema.json"  # Replace with the JSON schema
schema_path = f"{schema_path}/{schema_name}"

with open(schema_path, "r") as file:
    schema = json.load(file)

files = [
    ("pdf", (pdf_name, open(pdf_path, "rb"), "application/pdf")) for pdf_name in pdf_names
]

payload = {"fields_schema": json.dumps(schema)}

response = requests.request("POST", url, headers=headers, files=files, data=payload)

output_data = response.json()["data"]
extracted_info = output_data["extracted_schema"]
print(extracted_info)

KeyError: 'data'

In [46]:
response.json()

{'message': 'Multiple PDF files detected (10). Please provide only one PDF file.'}

Unable to send 1 request for multiple PDF, need to split up the requests into individual file and send them per file

In [63]:
headers = {"Authorization": f"Basic {api_key}"}
url = "https://api.va.landing.ai/v1/tools/agentic-document-analysis"

base_pdf_path = "../landing-ai-sandbox-data/input/test_pdfs/"  # Replace with the path to the file
pdf_names = random.sample(list_pdf_files("../landing-ai-sandbox-data/input/test_pdfs/"), 10)  # Replace the file
pdf_path = f"{base_pdf_path}/{pdf_name}"

schema_path = "../schema/"
schema_name = "main_usage_schema.json"  # Replace with the JSON schema
schema_path = f"{schema_path}/{schema_name}"

with open(schema_path, "r") as file:
    schema = json.load(file)

In [73]:
master_files = [
    ("pdf", (pdf_name, open(f"{base_pdf_path}/{pdf_name}", "rb"), "application/pdf")) for pdf_name in pdf_names
]

In [77]:
import os

for file in master_files:
    files = [file] # only get one entry per loop

    payload = {"fields_schema": json.dumps(schema)}

    response = requests.request("POST", url, headers=headers, files=files, data=payload)

    output_data = response.json()["data"]
    extracted_info = output_data["extracted_schema"]
    
    file_name = os.path.splitext(file[1][0])[0]
    
    with open(f"../landing-ai-sandbox-data/direct_pdf_to_json/{file_name}_extracted.json", "w") as file:
        json.dump(extracted_info, file, indent=4)

KeyError: 'data'

In [85]:
response.json()

{'message': 'Failed to open PDF. Ensure it is a valid PDF file.'}

In [87]:
def list_exported_json_files(directory_path):
    json_files = [file for file in os.listdir(directory_path)
                 if file.lower().endswith(".json") and os.path.isfile(os.path.join(directory_path, file))]
    return json_files

list_exported_json_files('../landing-ai-sandbox-data/direct_pdf_to_json/')

['document_734_extracted.json',
 'document_0_extracted.json',
 'document_246_extracted.json',
 'document_760_extracted.json',
 'document_703_extracted.json',
 'document_287_extracted.json']

In [107]:
processed_file = list_exported_json_files('../landing-ai-sandbox-data/direct_pdf_to_json/')
processed_file = [os.path.splitext(file)[0] for file in processed_file]
processed_file = [f'{"_".join(file.split("_")[:2])}.pdf' for file in processed_file]
processed_file

['document_734.pdf',
 'document_0.pdf',
 'document_246.pdf',
 'document_760.pdf',
 'document_703.pdf',
 'document_287.pdf']

In [117]:
master_files = [
    ("pdf", (pdf_name, open(f"{base_pdf_path}/{pdf_name}", "rb"), "application/pdf")) for pdf_name in list(set(pdf_names) - set(processed_file))
]

In [119]:
for file in master_files:
    files = [file] # only get one entry per loop

    payload = {"fields_schema": json.dumps(schema)}

    response = requests.request("POST", url, headers=headers, files=files, data=payload)

    output_data = response.json()["data"]
    extracted_info = output_data["extracted_schema"]
    
    file_name = os.path.splitext(file[1][0])[0]
    
    with open(f"../landing-ai-sandbox-data/direct_pdf_to_json/{file_name}_extracted.json", "w") as file:
        json.dump(extracted_info, file, indent=4)

KeyError: 'data'

In [124]:
response.json()

{'message': 'Failed to open PDF. Ensure it is a valid PDF file.'}

---

## Cleanup

In [40]:
!rm -rf '../landing-ai-sandbox-data/input/test_pdfs'