In [None]:
import time
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorOptions
from docling.document_converter import PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
import warnings


warnings.filterwarnings("ignore", category=RuntimeWarning, module="numpy")
warnings.filterwarnings("ignore", category=RuntimeWarning, module="docling")

In [2]:
pipeline_options = PdfPipelineOptions(
    # do not touch this, we are simulating an azure function compute through this
    accelerator_options=AcceleratorOptions(device="cpu"),
    # Every optional stage listed here is switched off (set to False).
    **dict.fromkeys(
        (
            "do_ocr",                     # heaviest thing that adds most latency
            "do_table_structure",         # main heavy processing (uses tableformer)
            "do_picture_description",
            "do_picture_classification",
            "generate_parsed_pages",
            "generate_page_images",
        ),
        False,
    ),
)

# With the options above, docling is essentially left to use its LayoutModel, which detects the layout of each page: headings, subheadings, tables, etc.


In [3]:
# PDF inputs are read through the pypdfium2 backend and processed with the
# `pipeline_options` object.
pdf_format_option = PdfFormatOption(
    pipeline_options=pipeline_options,
    backend=PyPdfiumDocumentBackend,
)
converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

# NOTE(review): presumably this sets up the PDF pipeline ahead of time so the
# first timed convert() call does not pay the initialization cost - confirm
# against the docling documentation.
converter.initialize_pipeline(InputFormat.PDF)

In [None]:
# Simple one-off conversion (not a benchmark); uncomment the lines below to run it.

# res = converter.convert("image_document.pdf")
# doc = res.document

In [5]:
def benchmark(pdf_name, tries=30):
    """Time repeated conversions of one document and print summary statistics.

    Calls ``converter.convert(pdf_name)`` on the module-level ``converter``
    ``tries`` times, printing the wall-clock duration of every run followed by
    the average, minimum and maximum duration.

    Args:
        pdf_name: Document source, passed unchanged to ``converter.convert``.
        tries: Number of timed conversions to perform. Must be at least 1.

    Returns:
        None. Results are printed only.

    Raises:
        ValueError: If ``tries`` is less than 1 (no runs means no statistics).
    """
    if tries < 1:
        raise ValueError(f"tries must be at least 1, got {tries}")

    times = []

    for run in range(tries):
        # perf_counter() is the clock intended for measuring short intervals;
        # unlike time.time() it is not affected by system clock adjustments.
        start_time = time.perf_counter()
        converter.convert(pdf_name)
        processing_time = time.perf_counter() - start_time

        times.append(processing_time)
        print(f"Run {run + 1}: {processing_time:.2f} seconds")

    # Calculate statistics
    avg_time = sum(times) / len(times)
    min_time = min(times)
    max_time = max(times)

    print(f"\nBenchmark Results ({tries} runs):")
    print(f"Average time: {avg_time:.2f} seconds")
    print(f"Min time: {min_time:.2f} seconds")
    print(f"Max time: {max_time:.2f} seconds")

In [8]:
benchmark("text_images_1.docx")

# NOTE: the figure below was recorded with PDF input, not this .docx run
# (the .docx runs shown in this cell's output take about 0.08 s each):
# averaging 2.35 seconds / 5 pages with PDF = 470 ms / page

Run 1: 0.09 seconds
Run 2: 0.08 seconds
Run 3: 0.08 seconds
Run 4: 0.08 seconds
Run 5: 0.08 seconds
Run 6: 0.08 seconds
Run 7: 0.08 seconds
Run 8: 0.08 seconds
Run 9: 0.08 seconds
Run 10: 0.08 seconds
Run 11: 0.08 seconds
Run 12: 0.08 seconds
Run 13: 0.11 seconds
Run 14: 0.08 seconds
Run 15: 0.09 seconds
Run 16: 0.09 seconds
Run 17: 0.08 seconds
Run 18: 0.08 seconds
Run 19: 0.08 seconds
Run 20: 0.08 seconds
Run 21: 0.08 seconds
Run 22: 0.08 seconds
Run 23: 0.08 seconds
Run 24: 0.08 seconds
Run 25: 0.08 seconds
Run 26: 0.08 seconds
Run 27: 0.08 seconds
Run 28: 0.08 seconds
Run 29: 0.08 seconds
Run 30: 0.08 seconds

Benchmark Results (30 runs):
Average time: 0.08 seconds
Min time: 0.08 seconds
Max time: 0.11 seconds


In [None]:
benchmark("table_document.pdf")

# Observed: averaging 4.76 seconds / 11 pages with PDF, i.e. roughly 430 ms / page