In [1]:
import os

from dotenv import find_dotenv, load_dotenv
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.processes.connectors.local import (
    LocalConnectionConfig,
    LocalDownloaderConfig,
    LocalIndexerConfig,
    LocalUploaderConfig,
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig

In [None]:
# Load settings from the nearest .env file (if any) into the process environment,
# then read the values this notebook needs. os.getenv returns None for unset names.
load_dotenv(find_dotenv())
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL")
LOCAL_FILE_INPUT_DIR = os.getenv("LOCAL_FILE_INPUT_DIR")
LOCAL_FILE_OUTPUT_DIR = os.getenv("LOCAL_FILE_OUTPUT_DIR")

# Never echo the API key itself: cell outputs are stored inside the notebook file
# and would leak the secret as soon as the notebook is committed or shared.
print("UNSTRUCTURED_API_KEY:", "<set>" if UNSTRUCTURED_API_KEY else "<missing>")
print("UNSTRUCTURED_API_URL:", UNSTRUCTURED_API_URL)
print(LOCAL_FILE_INPUT_DIR, LOCAL_FILE_OUTPUT_DIR)

In [3]:
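# Candidate extraction options, currently unused (presumably partitioner
# arguments -- confirm against the unstructured-ingest docs before enabling):
# extract_element_types=["Table"],
# extract_image_block_types=["Image", "Table"],
# extract_image_block_output_dir=LOCAL_FILE_OUTPUT_DIR,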


def generate_json_from_local(
    input_path: str,
    output_dir: str,
    parition_by_api: bool = True,
    api_key: str = None,
    partition_endpoint: str = None,
    split_pdf_page: bool = True,
    split_pdf_allow_failed: bool = True,
    split_pdf_concurrency_level: int = 15,
) -> None:
    """Run a local-to-local unstructured-ingest pipeline over ``input_path``.

    Builds a ``Pipeline`` from a local indexer/downloader, a partitioner using
    the ``hi_res`` strategy with ``by_title`` chunking, and a local uploader,
    then runs it. Results are written by the uploader under ``output_dir``
    (presumably as JSON, per the function name -- confirm against the
    unstructured-ingest docs).

    Args:
        input_path: Local path handed to ``LocalIndexerConfig``.
        output_dir: Directory handed to ``LocalUploaderConfig``.
        parition_by_api: Forwarded as ``partition_by_api``. The parameter name
            is misspelled, but it is kept as-is so existing keyword callers
            continue to work.
        api_key: Forwarded to the partitioner as ``api_key``.
        partition_endpoint: Forwarded to the partitioner as
            ``partition_endpoint``.
        split_pdf_page: Forwarded in ``additional_partition_args``.
        split_pdf_allow_failed: Forwarded in ``additional_partition_args``.
        split_pdf_concurrency_level: Forwarded in ``additional_partition_args``.

    Returns:
        None. The function is called for the pipeline run's side effects.
    """
    partitioner_config = PartitionerConfig(
        strategy="hi_res",
        chunking_strategy="by_title",
        infer_table_structure=True,
        unique_element_ids=True,
        extract_images_in_pdf=False,
        partition_by_api=parition_by_api,
        api_key=api_key,
        partition_endpoint=partition_endpoint,
        additional_partition_args={
            "split_pdf_page": split_pdf_page,
            "split_pdf_allow_failed": split_pdf_allow_failed,
            "split_pdf_concurrency_level": split_pdf_concurrency_level,
        },
    )

    # Use the caller-supplied paths. Reading notebook-level globals here would
    # silently ignore the arguments and break the function on a fresh kernel.
    Pipeline.from_configs(
        context=ProcessorConfig(),
        indexer_config=LocalIndexerConfig(input_path=input_path),
        downloader_config=LocalDownloaderConfig(),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=partitioner_config,
        uploader_config=LocalUploaderConfig(output_dir=output_dir),
    ).run()

In [None]:
# Partition the sample PDF via the partition API, using the key and endpoint
# read from the environment in the configuration cell.
run_options = {
    "input_path": "embedded-images-tables.pdf",
    "output_dir": ".",
    "parition_by_api": True,  # spelling matches the function's parameter name
    "api_key": UNSTRUCTURED_API_KEY,
    "partition_endpoint": UNSTRUCTURED_API_URL,
}
generate_json_from_local(**run_options)