Set up a local unstructured-api environment by running it in a Docker container:

`docker run -p 8000:8000 -d --rm --name unstructured-api downloads.unstructured.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0`

In [1]:
from unstructured_client import UnstructuredClient
from unstructured_client.models import shared

# Client pointed at the unstructured-api server listening on localhost:8000.
client = UnstructuredClient(
    server_url="http://localhost:8000",
    # No API key is required here because the hosted SaaS API is not being used.
    api_key_auth="",
)

In [2]:
# Path of the document to partition, relative to the notebook's working directory.
filename = "example/multi-column.pdf"

# Read the whole file into memory and wrap the bytes, together with the file
# name, in the payload object passed to the partition request.
with open(filename, "rb") as f:
    files = shared.Files(
        content=f.read(),
        file_name=filename,
    )

In [3]:
# Partition request. The values below are just an example configuration.
req = shared.PartitionParameters(
    files=files,
    chunking_strategy="by_title",
    # The hi_res strategy is recommended when split_pdf_page is enabled.
    strategy='hi_res',
    split_pdf_page=True,
    coordinates=True,
)

In [4]:
# Send the partition request and report how many elements came back.
# NOTE(review): any exception is only printed, so on a fresh kernel a failure
# leaves `response` undefined and the later cell that saves `response` will
# raise NameError.
try:
    response = client.general.partition(req)
    print("Handled results :", len(response.elements))
except Exception as e:
    print("Exception :", e)

INFO: Splitting PDF by page on client. Using 5 threads when calling API.
INFO: Set UNSTRUCTURED_CLIENT_SPLIT_CALL_THREADS env var if you want to change that.
Handled results : 171


In [17]:
## pipeline data and caching
import os
import pickle

def save_data_cache(filename, data, path="pkl"):
    """Pickle ``data`` into ``path`` under a name derived from ``filename``.

    The cache file name is the base name of ``filename`` with its extension
    replaced by ``.pkl`` (``example/multi-column.pdf`` -> ``multi-column.pkl``).
    An existing cache file with the same name is overwritten.

    Args:
        filename: Path or name of the source file; only its stem is used.
        data: Any picklable object.
        path: Directory that receives the cache file. It is created if it
            does not exist; an empty string means the current directory.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
        pickle.PicklingError: If ``data`` cannot be pickled (``pickle.dump``
            may also raise other exception types for unsupported objects).
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    full_path = os.path.join(path, stem + '.pkl')

    # exist_ok=True avoids the check-then-create race of a separate exists()
    # test. An empty path is skipped because os.makedirs("") raises.
    if path:
        os.makedirs(path, exist_ok=True)

    with open(full_path, 'wb') as file:
        pickle.dump(data, file)
    print(f"Data saved to {full_path}")

In [20]:
def load_data_cache(filename, path="pkl"):
    """Load a pickled object from the cache directory.

    ``filename`` is first looked up as-is inside ``path``. If that file does
    not exist, the name is treated like the source file name given to
    ``save_data_cache``: its base name with the extension replaced by ``.pkl``
    is tried instead, so the same ``filename`` works for saving and loading.

    Args:
        filename: Cache file name (e.g. ``multi-column.pkl``) or the source
            file name/path the cache was created from.
        path: Directory that holds the cache files.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If neither the literal nor the derived cache file
            exists in ``path``.
    """
    full_path = os.path.join(path, filename)

    if not os.path.exists(full_path):
        # Fall back to the naming rule used when saving: <stem>.pkl.
        stem = os.path.splitext(os.path.basename(filename))[0]
        derived_path = os.path.join(path, stem + '.pkl')
        if not os.path.exists(derived_path):
            raise FileNotFoundError(f"No such file or directory: '{full_path}'")
        full_path = derived_path

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load cache files that were written by a trusted source.
    with open(full_path, 'rb') as file:
        data = pickle.load(file)
    print(f"Data loaded from {full_path}")
    return data

In [18]:
# Pickle the partition response under a cache name derived from `filename`.
save_data_cache(filename=filename, data=response)

Data saved to pkl\multi-column.pkl


In [22]:
# Reload the pickled object from the default "pkl" cache directory.
data = load_data_cache(filename="multi-column.pkl")

Data loaded from pkl\multi-column.pkl
