In [20]:
import logging
import time
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, EasyOcrOptions, TesseractOcrOptions, OcrMacOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode

In [21]:
# Directory that receives the exported markdown files and their image artifacts.
output_dir = Path("/home/nurshed/Desktop/python/project/RAG Study/docling/adv_docling")

# Scale factor for generated images (scale=1 corresponds to a standard 72 DPI image).
IMAGE_RESOLUTION_SCALE = 2.0

# PDF pipeline configuration, grouped by concern: images, OCR, tables.
pipeline_options = PdfPipelineOptions(
    # --- image export ---
    images_scale=IMAGE_RESOLUTION_SCALE,
    generate_page_images=True,
    generate_picture_images=True,
    # --- OCR ---
    do_ocr=True,
    # Full-page OCR in English via EasyOCR. The other engines imported in this
    # notebook can be swapped in instead, e.g.
    #   TesseractOcrOptions(force_full_page_ocr=True, lang=["eng"])
    #   OcrMacOptions(force_full_page_ocr=True, lang=["en-US"])
    ocr_options=EasyOcrOptions(lang=["en"], force_full_page_ocr=True),
    # --- table structure ---
    do_table_structure=True,
    table_structure_options={
        # Use the more accurate TableFormer model.
        "mode": TableFormerMode.ACCURATE,
        # Use the text cells predicted by the table structure model.
        "do_cell_matching": False,
    },
)

# Shared converter, reused by every conversion cell; only PDF input is configured.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter_global = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [26]:
# Turn on pipeline profiling so the time spent in conversion is measured.
settings.debug.profile_pipeline_timings = True

# Single source of truth for the input document; used both for conversion
# and for deriving the output file name.
source_pdf = "/home/nurshed/Desktop/python/project/RAG Study/docling/cv_demo.pdf"
result = doc_converter_global.convert(source_pdf)

# Make sure the export directory exists before writing into it.
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = Path(source_pdf).stem

# Export markdown next to an artifacts folder holding the images.
# Other image modes tried here:
#   - default (no image_mode): only an "<!-- image -->" placeholder is written
#   - ImageRefMode.EMBEDDED:   images are embedded as base64
#   - ImageRefMode.REFERENCED: an artifacts folder is created (used below)
md_filename = output_dir / f"{doc_filename}-with-images.md"
result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

# Logging of the saved path is left disabled: no `_log` is defined in the cells shown here.
# _log.info(f"Markdown content has been saved to {md_filename}")

2025-10-23 17:38:51,749 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-23 17:38:51,753 - INFO - Going to convert document batch...
2025-10-23 17:38:51,754 - INFO - Processing document cv_demo.pdf
2025-10-23 17:40:49,518 - INFO - Finished converting document cv_demo.pdf in 117.77 sec.


In [19]:
# Convert the "Attention Is All You Need" PDF and export it as markdown with
# referenced images.
#
# Fix: `_log` was used at the end of this cell without being defined anywhere,
# which raises NameError once the conversion finishes. It is now bound here, so
# the cell works on a fresh kernel.
_log = logging.getLogger(__name__)

# Turn on pipeline profiling so the time spent in conversion is measured.
settings.debug.profile_pipeline_timings = True

# Single source of truth for the input document; used both for conversion
# and for deriving the output file name.
source_pdf = "/home/nurshed/Desktop/python/project/RAG Study/docling/NIPS-2017-attention-is-all-you-need-Paper.pdf"
result = doc_converter_global.convert(source_pdf)

# Make sure the export directory exists before writing into it.
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = Path(source_pdf).stem

# Export markdown next to an artifacts folder holding the images.
# Other image modes tried here:
#   - default (no image_mode): only an "<!-- image -->" placeholder is written
#   - ImageRefMode.EMBEDDED:   images are embedded as base64
#   - ImageRefMode.REFERENCED: an artifacts folder is created (used below)
md_filename = output_dir / f"{doc_filename}-with-images.md"
result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

_log.info(f"Markdown content has been saved to {md_filename}")

2025-10-23 17:29:50,648 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-23 17:29:50,655 - INFO - Going to convert document batch...
2025-10-23 17:29:50,656 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 6722b4d67bf82aafc8858d5663f840fd
2025-10-23 17:29:50,658 - INFO - Accelerator device: 'cpu'
2025-10-23 17:29:53,224 - INFO - Accelerator device: 'cpu'
2025-10-23 17:29:55,023 - INFO - Accelerator device: 'cpu'
2025-10-23 17:29:56,036 - INFO - Processing document NIPS-2017-attention-is-all-you-need-Paper.pdf


KeyboardInterrupt: 

In [17]:
# Convert the KONE sustainability PDF and export it as markdown with
# referenced images.
#
# Fix: `_log` was used at the end of this cell without being defined anywhere,
# which raises NameError once the conversion finishes. It is now bound here, so
# the cell works on a fresh kernel.
_log = logging.getLogger(__name__)

# Turn on pipeline profiling so the time spent in conversion is measured.
settings.debug.profile_pipeline_timings = True

# Single source of truth for the input document; used both for conversion
# and for deriving the output file name.
source_pdf = "/home/nurshed/Desktop/python/project/RAG Study/docling/KONE Sustainability_tcm117-105566.pdf"
result = doc_converter_global.convert(source_pdf)

# Make sure the export directory exists before writing into it.
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = Path(source_pdf).stem

# Export markdown next to an artifacts folder holding the images.
# Other image modes tried here:
#   - default (no image_mode): only an "<!-- image -->" placeholder is written
#   - ImageRefMode.EMBEDDED:   images are embedded as base64
#   - ImageRefMode.REFERENCED: an artifacts folder is created (used below)
md_filename = output_dir / f"{doc_filename}-with-images.md"
result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

_log.info(f"Markdown content has been saved to {md_filename}")

2025-10-23 17:16:11,637 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-23 17:16:11,687 - INFO - Going to convert document batch...
2025-10-23 17:16:11,688 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 6722b4d67bf82aafc8858d5663f840fd
2025-10-23 17:16:11,689 - INFO - Accelerator device: 'cpu'
2025-10-23 17:17:01,135 - INFO - Download complete
2025-10-23 17:17:04,473 - INFO - Download complete.
2025-10-23 17:17:06,588 - INFO - Accelerator device: 'cpu'
2025-10-23 17:17:08,240 - INFO - Accelerator device: 'cpu'
2025-10-23 17:17:08,758 - INFO - Processing document KONE Sustainability_tcm117-105566.pdf


KeyboardInterrupt: 