In [1]:
# Before calling the API, replace filename and ensure sdk is installed: "pip install unstructured-client"
# See https://docs.unstructured.io/api-reference/api-services/sdk for more details

import os
import json
import random
from unstructured_client.models import operations, shared
from dotenv import find_dotenv, load_dotenv
import unstructured_client

load_dotenv(find_dotenv())
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL")
LOCAL_FILE_INPUT_DIR = os.getenv("LOCAL_FILE_INPUT_DIR")
LOCAL_FILE_OUTPUT_DIR = os.getenv("LOCAL_FILE_OUTPUT_DIR")


print(UNSTRUCTURED_API_KEY, UNSTRUCTURED_API_URL)
print(LOCAL_FILE_INPUT_DIR, LOCAL_FILE_OUTPUT_DIR)
client = unstructured_client.UnstructuredClient(
    api_key_auth=UNSTRUCTURED_API_KEY,
    server_url=UNSTRUCTURED_API_URL,
)

J8GaQYE095R5ptsWmcvlrdTz7ycE8g https://api.unstructuredapp.io/general/v0/general
pdf_input_files pdf_json_output


In [2]:
filename = "pdf_input_files/test.pdf"
with open(filename, "rb") as f:
    data = f.read()
    print(data)

b'%PDF-1.7\n%\xbf\xf7\xa2\xfe\n1 0 obj\n<< /MarkInfo << /Marked true >> /Metadata 3 0 R /Names 4 0 R /OpenAction [ 5 0 R /FitH 804 ] /Outlines 6 0 R /PageLabels << /Nums [ 0 << /S /D /St 4 >> ] >> /PageLayout /SinglePage /PageMode /UseOutlines /Pages 7 0 R /StructTreeRoot 9 0 R /Threads 12 0 R /Type /Catalog >>\nendobj\n2 0 obj\n<< /Author (Omotayo Sanni) /CreationDate (D:20181229014113+05\'30\') /Creator (Elsevier) /CrossMarkDomains#5b1#5d (elsevier.com) /CrossMarkDomains#5b2#5d (sciencedirect.com) /CrossmarkDomainExclusive (true) /CrossmarkMajorVersionDate (2010-04-23) /ElsevierWebPDFSpecifications (6.5) /Keywords (Corrosion; Stainless steel; Inhibitor; Sulphuric acid) /ModDate (D:20181229014113+05\'30\') /Subject (Data in Brief, 22 \\(2018\\) 451-457. doi:10.1016/j.dib.2018.11.134) /Title (Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment) /doi (10.1016/j.dib.2018.11.134) /robots (noindex) >>\nendobj\n3 0 obj\n<< /Subtype /XML /Type 

In [3]:
req = operations.PartitionRequest(
    partition_parameters=shared.PartitionParameters(
        files=shared.Files(
            content=data,
            file_name=filename,
            strategy="hi_res",
            extract_images_in_pdf=True,
            infer_table_structure=True,
        ),
        # --- Other partition parameters ---
        # Note: Defining 'strategy', 'chunking_strategy', and 'output_format'
        # parameters as strings is accepted, but will not pass strict type checking. It is
        # advised to use the defined enum classes as shown below.
        strategy=shared.Strategy.HI_RES,
        languages=["eng"],
        infer_table_structure=True,
        extract_element_types=["Table"],
        extract_image_block_types=["Image", "Table"],
        extract_images_in_pdf=True,
    ),
)

output = []

try:
    res = client.general.partition(request=req)

except Exception as e:
    print(e)

INFO: HTTP Request: POST https://api.unstructuredapp.io/general/v0/general "HTTP/1.1 200 OK"


In [None]:
# convert str with ' to dict then json.loads()

In [4]:
def convert_single_to_double_quotes(s):
    return s.replace("'", '"')

In [None]:
def sanitize_unescaped_quotes_and_load_json_str(s: str, strict=False) -> dict:
    js_str = s
    prev_pos = -1
    curr_pos = 0
    while curr_pos > prev_pos:
        # after while check, move marker before we overwrite it
        prev_pos = curr_pos
        try:
            return json.loads(js_str, strict=strict)
        except json.JSONDecodeError as err:
            curr_pos = err.pos
            if curr_pos <= prev_pos:
                # previous change didn't make progress, so error
                raise err

            # find the previous " before e.pos
            prev_quote_index = js_str.rfind('"', 0, curr_pos)
            # escape it to \"
            js_str = js_str[:prev_quote_index] + "\\" + js_str[prev_quote_index:]

In [5]:
num_el = len(res.elements)

print(f"Number of items: {num_el}")

json_array = []

file = f"{LOCAL_FILE_OUTPUT_DIR}/unstructured_{random.randint(1000,9999)}.json"

with open(file, "a") as f:

    f.write("[")

    for i in range(num_el):

        item = convert_single_to_double_quotes(str(res.elements[i]))
        # need to remove trailing comma
        try:
            if i == num_el - 1:
                f.write(item)
            else:
                f.write(item + ",")
        except ValueError as ex:
            print(ex)
            f.write(",")

    f.write("]")

print(f"File saved as: {file}")

Number of items: 10
File saved as: pdf_json_output/unstructured_4563.json
