In [None]:
!pip install docling transformers

In [None]:
import os

# huggingface_hub documents HF_HUB_DISABLE_SYMLINKS_WARNING as the switch that
# silences the Windows "symlinks are not supported" cache warning.
# NOTE(review): the library appears to read it when it is first imported, so
# this cell should run before anything imports docling/transformers - confirm.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# Kept from the original cell in case other tooling reads this name.
os.environ["HF_HUB_DISABLE_SYMLINKS"] = "1"

In [2]:
from docling.document_converter import DocumentConverter
from pathlib import Path

def pdf_to_markdown(pdf_path: str, output_path: str) -> Path:
    """Convert a PDF to Markdown with a default ``DocumentConverter`` and save it.

    Args:
        pdf_path: Source document, passed unchanged to ``DocumentConverter.convert``.
        output_path: Destination Markdown file, written as UTF-8. Missing
            parent directories are created.

    Returns:
        The ``Path`` of the Markdown file that was written.
    """
    destination = Path(output_path)

    # Run the conversion with the converter's default settings.
    converter = DocumentConverter()
    result = converter.convert(pdf_path)
    markdown = result.document.export_to_markdown()

    # Create the target folder only once there is something to write, so a
    # failed conversion does not leave empty directories behind.
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")
    print(f"✅ Converted {pdf_path} -> {output_path}")
    return destination

# Runs the conversion only when this code executes as the top-level module.
if __name__ == "__main__":
    pdf_to_markdown(pdf_path="sample.pdf", output_path="output.md")


2025-09-18 15:11:10,110 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-09-18 15:11:18,532 - INFO - Going to convert document batch...
2025-09-18 15:11:18,533 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e647edf348883bed75367b22fbe60347
2025-09-18 15:11:18,534 - INFO - Accelerator device: 'cpu'
2025-09-18 15:11:20,228 - INFO - Accelerator device: 'cpu'
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
2025-09-18 15:11:36,306 - INFO - Accelerator device: 'cpu'
2025-09-18 15:11:36,669 - INFO - Processing document sample.pdf
2025-09-18 15:13:16,425 - INFO - Finished converting document sample.pdf in 126.33 sec.


✅ Converted sample.pdf -> output.md


In [4]:
import torch

In [9]:
import os
from pathlib import Path
import torch

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

# Silence huggingface_hub's Windows symlink warning. The variable the library
# documents for this is HF_HUB_DISABLE_SYMLINKS_WARNING.
# NOTE(review): huggingface_hub appears to read it when first imported, so on a
# fresh kernel this should be set before the docling imports run - confirm.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# Kept from the original cell in case other tooling reads this name.
os.environ["HF_HUB_DISABLE_SYMLINKS"] = "1"


def pdf_to_markdown(
    pdf_path: str,
    output_path: str,
    do_ocr: bool = True,
    device: "str | None" = None,
) -> Path:
    """Convert a PDF to Markdown with explicit pipeline options and save it.

    Args:
        pdf_path: Source document, passed unchanged to ``DocumentConverter.convert``.
        output_path: Destination Markdown file, written as UTF-8. Missing
            parent directories are created.
        do_ocr: Value assigned to ``PdfPipelineOptions.do_ocr``. Defaults to
            ``True`` (the previous hard-coded value); pass ``False`` for PDFs
            that already carry embedded text.
        device: Value assigned to the pipeline's accelerator device. When
            ``None``, ``"cuda"`` is used if ``torch.cuda.is_available()``
            reports a GPU, otherwise ``"cpu"``.

    Returns:
        The ``Path`` of the Markdown file that was written.
    """
    # Auto-detect only when the caller did not choose a device; this falls
    # back to the CPU when no CUDA device is visible to torch.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    opts = PdfPipelineOptions()
    opts.do_ocr = do_ocr
    opts.accelerator_options.device = device

    # Register the options for PDF inputs only.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=opts)
        }
    )

    result = converter.convert(pdf_path)
    md = result.document.export_to_markdown()

    # Create the target folder only once there is something to write, so a
    # failed or interrupted conversion does not leave empty directories behind.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(md, encoding="utf-8")
    print(f"✔️ Done: {pdf_path} -> {output_path}")
    return destination


# Runs the conversion only when this code executes as the top-level module.
if __name__ == "__main__":
    pdf_to_markdown(pdf_path="sample.pdf", output_path="output.md")

2025-09-18 15:16:21,602 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]


Using device: cpu


2025-09-18 15:16:30,506 - INFO - Going to convert document batch...
2025-09-18 15:16:30,507 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44750bf936770741b0779c5d88b7b475
2025-09-18 15:16:30,508 - INFO - Accelerator device: 'cpu'
2025-09-18 15:16:32,220 - INFO - Accelerator device: 'cpu'
2025-09-18 15:16:33,499 - INFO - Accelerator device: 'cpu'
2025-09-18 15:16:34,047 - INFO - Processing document sample.pdf


KeyboardInterrupt: 