In [1]:
import os
import sys
sys.path.append('..')
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig,
    LocalUploaderConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from src.utils.paths import get_project_path

In [None]:
def _find_pdf_files(directory: str) -> list:
    """Return the paths of all PDF files under ``directory``, searched recursively.

    The extension check is case-insensitive, so ``scan.PDF`` is picked up as
    well as ``scan.pdf``. The result is sorted so that repeated runs process
    files in the same order (``os.walk`` itself gives no ordering guarantee).
    """
    return sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.lower().endswith('.pdf')
    )


def local_parse_data(
    directory_with_pdfs: str,
    directory_with_results: str,
    *,
    num_processes: int = 30,
    device: str = "cuda",
    languages=("eng", "rus"),
) -> None:
    """Run one local ingest pipeline per PDF found under ``directory_with_pdfs``.

    Every ``*.pdf`` file (any letter case) in the directory tree is fed to its
    own ``Pipeline`` configured with the ``ocr_only`` partitioning strategy.

    Args:
        directory_with_pdfs: Root directory that is walked recursively for PDFs.
        directory_with_results: Passed to ``LocalUploaderConfig`` as
            ``output_dir``.
        num_processes: Forwarded to ``ProcessorConfig(num_processes=...)``.
        device: Forwarded to ``ProcessorConfig(device=...)``.
        languages: OCR language codes forwarded to ``PartitionerConfig``.

    Returns:
        None. If no PDF is found, no pipeline is created.
    """
    for pdf_file in _find_pdf_files(directory_with_pdfs):
        Pipeline.from_configs(
            context=ProcessorConfig(
                tqdm=True,
                num_processes=num_processes,
                # NOTE(review): confirm that the installed ProcessorConfig
                # actually accepts/uses `device`.
                device=device,
            ),
            indexer_config=LocalIndexerConfig(input_path=pdf_file),
            downloader_config=LocalDownloaderConfig(),
            source_connection_config=LocalConnectionConfig(),
            partitioner_config=PartitionerConfig(
                strategy="ocr_only",
                # Fresh list per pipeline so the default tuple is never shared/mutated.
                languages=list(languages),
                additional_partition_args={
                    "preserve_formatting": True,
                    "split_pdf_page": True,
                    "split_pdf_concurrency_level": 15,
                    "include_page_breaks": True,
                    "max_partition": 1500,
                },
            ),
            uploader_config=LocalUploaderConfig(output_dir=directory_with_results),
        ).run()

In [None]:
# Source PDFs and parsed output both live under the project's `data` folder.
data_root = os.path.join(get_project_path(), 'data')
directory_with_pdfs = os.path.join(data_root, 'Статьи для базы данных')
directory_with_results = os.path.join(data_root, 'parsed_pages')

# Parse every PDF found under the source directory into the results directory.
local_parse_data(
    directory_with_pdfs=directory_with_pdfs,
    directory_with_results=directory_with_results,
)