In [1]:
import os
from typing import Optional

from dotenv import find_dotenv, load_dotenv
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.processes.connectors.local import (
    LocalConnectionConfig,
    LocalDownloaderConfig,
    LocalIndexerConfig,
    LocalUploaderConfig,
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig

In [2]:
# Load variables from the .env file located by find_dotenv() and read the
# settings this notebook uses from the process environment.
load_dotenv(find_dotenv())
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
UNSTRUCTURED_API_URL = os.getenv("UNSTRUCTURED_API_URL")
LOCAL_FILE_INPUT_DIR = os.getenv("LOCAL_FILE_INPUT_DIR")
LOCAL_FILE_OUTPUT_DIR = os.getenv("LOCAL_FILE_OUTPUT_DIR")

# Never echo the API key itself: cell output is stored with the notebook and
# would leak the secret. Print a fixed mask when the key is set, None otherwise.
print("**********" if UNSTRUCTURED_API_KEY else None, UNSTRUCTURED_API_URL)
print(LOCAL_FILE_INPUT_DIR, LOCAL_FILE_OUTPUT_DIR)

********** https://api.unstructuredapp.io/general/v0/general
pdf_input_files pdf_json_output


In [3]:
# Partition options not passed by generate_json_from_local, kept for reference:
#   extract_element_types=["Table"],
#   extract_image_block_types=["Image", "Table"],


def generate_json_from_local(
    input_path: str,
    output_dir: str,
    parition_by_api: bool = True,
    api_key: Optional[str] = None,
    partition_endpoint: Optional[str] = None,
    split_pdf_page: bool = True,
    split_pdf_allow_failed: bool = True,
    split_pdf_concurrency_level: int = 15,
    image_output_dir: Optional[str] = None,
) -> None:
    """Run an Unstructured ingest pipeline over a local input path.

    Builds a ``Pipeline`` from local indexer, downloader, connection and
    uploader configs plus a ``hi_res`` ``PartitionerConfig``, then runs it.

    Args:
        input_path: Forwarded to ``LocalIndexerConfig(input_path=...)``.
        output_dir: Forwarded to ``LocalUploaderConfig(output_dir=...)``.
        parition_by_api: Forwarded as ``partition_by_api``. The misspelled
            parameter name is kept so existing keyword callers keep working.
        api_key: Forwarded as ``api_key`` to the partitioner config.
        partition_endpoint: Forwarded as ``partition_endpoint``.
        split_pdf_page: Sent as ``additional_partition_args["split_pdf_page"]``.
        split_pdf_allow_failed: Sent as
            ``additional_partition_args["split_pdf_allow_failed"]``.
        split_pdf_concurrency_level: Sent as
            ``additional_partition_args["split_pdf_concurrency_level"]``.
        image_output_dir: Forwarded as ``extract_image_block_output_dir``.
            When ``None`` (the default), the module-level
            ``LOCAL_FILE_OUTPUT_DIR`` is used, which is what this function
            always used before the parameter existed.

    Returns:
        None. The value returned by ``Pipeline.run()`` is discarded.
    """
    if image_output_dir is None:
        # Backward-compatible default: fall back to the notebook-level setting.
        image_output_dir = LOCAL_FILE_OUTPUT_DIR

    # NOTE(review): the "created partition with configs" log line from this
    # notebook's run does not list infer_table_structure, unique_element_ids,
    # extract_image_block_output_dir or extract_images_in_pdf. They may be
    # silently ignored by PartitionerConfig -- TODO confirm, and if so pass them
    # through additional_partition_args instead.
    Pipeline.from_configs(
        context=ProcessorConfig(),
        indexer_config=LocalIndexerConfig(input_path=input_path),
        downloader_config=LocalDownloaderConfig(),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(
            strategy="hi_res",
            infer_table_structure=True,
            unique_element_ids=True,
            extract_image_block_output_dir=image_output_dir,
            extract_images_in_pdf=True,
            partition_by_api=parition_by_api,
            api_key=api_key,
            partition_endpoint=partition_endpoint,
            additional_partition_args={
                "split_pdf_page": split_pdf_page,
                "split_pdf_allow_failed": split_pdf_allow_failed,
                "split_pdf_concurrency_level": split_pdf_concurrency_level,
            },
        ),
        uploader_config=LocalUploaderConfig(output_dir=output_dir),
    ).run()

In [4]:
# Partition a single PDF with the API settings loaded from the environment;
# the second positional argument is output_dir ("." here).
generate_json_from_local(
    "TMCB_43_2256640.pdf",
    ".",
    parition_by_api=True,
    api_key=UNSTRUCTURED_API_KEY,
    partition_endpoint=UNSTRUCTURED_API_URL,
)

2024-12-25 14:33:15,743 MainProcess INFO     created index with configs: {"input_path": "TMCB_43_2256640.pdf", "recursive": false}, connection configs: {"access_config": "**********"}
2024-12-25 14:33:15,744 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-12-25 14:33:15,745 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_allow_failed": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "element_exclude": [], "metadata_include": [], "partition_endpoint": "https://api.unstructuredapp.io/general/v0/general", "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-12-25 14:33:15,746 MainProcess INFO     