In [1]:
import time
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorOptions
from docling.document_converter import PdfFormatOption
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
import warnings


warnings.filterwarnings("ignore", category=RuntimeWarning, module="numpy")
warnings.filterwarnings("ignore", category=RuntimeWarning, module="docling")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# pipeline_options = PdfPipelineOptions(
#     do_ocr=False,                         # heaviest thing that adds most latency
#     do_table_structure=False,             # main heavy processing (uses tableformer)
#     do_picture_description=False,
#     do_picture_classification=False,
#     generate_parsed_pages=False,
#     generate_page_images=False,
#     accelerator_options=AcceleratorOptions(device="cpu"), # do not touch this, we are simulating an azure function compute through this
# )

# we are basically allowing docling here to use LayoutModel which detects the layout of the page - headings, subheadings, tables etc.


In [6]:
pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    do_table_structure=True,
    do_picture_description=True,
    do_picture_classification=True,
    generate_parsed_pages=True,
    generate_page_images=True,
    accelerator_options=AcceleratorOptions(device="mps"), # do not touch this, we are simulating an azure function compute through this
)

# we are basically allowing docling here to use LayoutModel which detects the layout of the page - headings, subheadings, tables etc.


In [7]:
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            backend=PyPdfiumDocumentBackend,
            pipeline_options=pipeline_options,
        )
    }
)
converter.initialize_pipeline(InputFormat.PDF)

In [8]:
res = converter.convert("robinhood.pdf")
robinhood_result = res.document.export_to_markdown()

In [9]:
print(robinhood_result)
with open("robinhood.md", "w") as f:
    f.write(robinhood_result)

## Pending

NVIDIA Market Buy

Individual · Jul 3 , 2025

Pending

## Recent

| UnitedHealth Market Buy Individual · Jun 24         | $8 . 84 029325 shares at $301 . 44               |
|-----------------------------------------------------|--------------------------------------------------|
| DOGE Market Buy Jun 23                              | $2 , 434 . 44 000 . 00 Dogecoin at $0 . 161167   |
| PEPE Market Buy Jun 23                              | $977. 7. 80 00 Pepe at $0 . 00000971             |
| SOL Market Buy Jun 23                               | $2 , 885 . 26 00 Solana at $143 . 26             |
| PEPE Market Sell Jun 23                             | $947. 7. 32 00 Pepe at $0 00000954               |
| SOL Market Sell Jun 23                              | $2 , 801 . 25 00 Solana at $141 05               |
| DOGE Market Sell Jun 23                             | $2 , 367. 7. 03 000 . 00 Dogecoin at $0 . 158915 |
| Alphabet Class A Market Buy Individual · Jun 16     | $1 . 26 00

In [17]:
# simple run (non benchmark)
import json


res = converter.convert("image_document.pdf")
doc = res.document.export_to_dict()
# print(doc)
json.dumps(doc, indent=2)

'{\n  "schema_name": "DoclingDocument",\n  "version": "1.3.0",\n  "name": "image_document",\n  "origin": {\n    "mimetype": "application/pdf",\n    "binary_hash": 17013456081102711072,\n    "filename": "image_document.pdf"\n  },\n  "furniture": {\n    "self_ref": "#/furniture",\n    "children": [],\n    "content_layer": "furniture",\n    "name": "_root_",\n    "label": "unspecified"\n  },\n  "body": {\n    "self_ref": "#/body",\n    "children": [\n      {\n        "$ref": "#/texts/0"\n      },\n      {\n        "$ref": "#/texts/1"\n      },\n      {\n        "$ref": "#/texts/2"\n      },\n      {\n        "$ref": "#/texts/3"\n      },\n      {\n        "$ref": "#/pictures/0"\n      },\n      {\n        "$ref": "#/pictures/1"\n      },\n      {\n        "$ref": "#/texts/4"\n      },\n      {\n        "$ref": "#/texts/5"\n      },\n      {\n        "$ref": "#/texts/6"\n      },\n      {\n        "$ref": "#/texts/7"\n      },\n      {\n        "$ref": "#/groups/0"\n      },\n      {\n    

In [5]:
def benchmark(pdf_name, tries=30):
    times = []
    num_runs = 30
    
    for run in range(num_runs):
        start_time = time.time()
        res = converter.convert(
            pdf_name,
        )
        end_time = time.time()
        
        processing_time = end_time - start_time
        times.append(processing_time)
        print(f"Run {run + 1}: {processing_time:.2f} seconds")
        
    # Calculate statistics
    avg_time = sum(times) / len(times)
    min_time = min(times)
    max_time = max(times)
    
    print(f"\nBenchmark Results ({num_runs} runs):")
    print(f"Average time: {avg_time:.2f} seconds")
    print(f"Min time: {min_time:.2f} seconds")
    print(f"Max time: {max_time:.2f} seconds")

In [None]:
benchmark("text_images_1.docx")

# i am averaging 2.35 seconds / 5 pages with pdf = 470ms / page

In [None]:
benchmark("table_document.pdf")

# i am averaging 4.76 seconds / 11 pages with pdf = 430ms / page