In [32]:
import json
import logging
from pathlib import Path
import yaml
import logging
import time
import os
from glob import glob

from rich.console import Console
from rich.panel import Panel

In [33]:
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)

from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.pipeline.vlm_pipeline import VlmPipeline

from docling.datamodel.base_models import InputFormat
from docling.datamodel import vlm_model_specs
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    PdfPipelineOptions,

)

from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

from docling_core.transforms.serializer.html import HTMLDocSerializer
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer

from docling.chunking import HybridChunker


In [34]:
_log = logging.getLogger(__name__)

In [35]:
import os
from pathlib import Path

def get_single_input_file(folder_path, filename, extension=[".docx", ".pdf"]):
    """
    Fetch a single input file by (partial) name.
    Tries exact match first, then partial match.
    """
    # Exact match check
    for ext in extension:
        file_path = os.path.join(folder_path, filename + ext)
        if os.path.exists(file_path):
            return file_path
    
    # Partial match fallback
    folder = Path(folder_path)
    for f in folder.iterdir():
        if f.suffix.lower() in extension and filename.lower() in f.stem.lower():
            return str(f)
    
    return None


In [36]:

def get_output_path(input_file, base_output="Output"):
    parent_folder = os.path.basename(os.path.dirname(input_file))   # e.g., "DOCX Files"
    file_stem = os.path.splitext(os.path.basename(input_file))[0]   # e.g., "Sample-1"
    
    output_dir = Path(base_output) / parent_folder / file_stem
    output_dir.mkdir(parents=True, exist_ok=True)
    
    return output_dir


In [37]:
def SimpleConversion():
    source = "Master Approval Ltr (1).pdf"

    converter = DocumentConverter()
    result = converter.convert(source)
    doc = result.document

    json_output = json.dumps(doc.export_to_dict(), indent=4)
    print(json_output)

SimpleConversion()


INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 130b240947e51886114a732a44356305
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.pipeline.base_pipeline:Processing document Master Approval Ltr (1).pdf
INFO:docling.document_converter:Finished converting document Master Approval Ltr (1).pdf in 3.40 sec.


{
    "schema_name": "DoclingDocument",
    "version": "1.5.0",
    "name": "Master Approval Ltr (1)",
    "origin": {
        "mimetype": "application/pdf",
        "binary_hash": 2077827355875865154,
        "filename": "Master Approval Ltr (1).pdf"
    },
    "furniture": {
        "self_ref": "#/furniture",
        "children": [],
        "content_layer": "furniture",
        "name": "_root_",
        "label": "unspecified"
    },
    "body": {
        "self_ref": "#/body",
        "children": [
            {
                "$ref": "#/texts/0"
            },
            {
                "$ref": "#/texts/1"
            },
            {
                "$ref": "#/texts/2"
            },
            {
                "$ref": "#/groups/0"
            },
            {
                "$ref": "#/pictures/0"
            },
            {
                "$ref": "#/pictures/1"
            },
            {
                "$ref": "#/texts/9"
            },
            {
                "$r

In [10]:
def MultiFormat():
    input_paths = [
        get_single_input_file("Data/DOCX Files", "Master Approval Ltr (1)"),
    ]

    # Filter out None values (e.g., if file not found or user canceled selection)
    input_paths = [p for p in input_paths if p is not None]

    if not input_paths:
        raise FileNotFoundError("No valid input files were selected!")

    doc_converter = DocumentConverter(
        allowed_formats=[InputFormat.DOCX, InputFormat.PDF],
        format_options={
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline),
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline,
                backend=PyPdfiumDocumentBackend
            ),
        },
    )

    conv_results = doc_converter.convert_all(input_paths)

    for res in conv_results:
        file_ext = res.input.file.suffix.lower().lstrip('.')
        
        if file_ext == 'docx':
            base_output = "DOCX Files"
        elif file_ext == 'pdf':
            base_output = "PDF Files"
        else:
            base_output = "Output"

        out_path = get_output_path(res.input.file, base_output="Output/" + base_output)

        print(
            f"Document {res.input.file.name} converted."
            f"\nSaved markdown output to: {out_path!s}"
        )
        
        _log.debug(res.document._export_to_indented_text(max_text_len=16))
        
        json_path = out_path / f"{res.input.file.stem}.json"
        with json_path.open("w") as fp:
            fp.write(json.dumps(res.document.export_to_dict(), indent=4))

MultiFormat()

Document Master Approval Ltr (1).docx converted.
Saved markdown output to: Output/DOCX Files/Master Approval Ltr (1)


In [21]:
def FigureExport():
    IMAGE_RESOLUTION_SCALE = 2.0
    logging.basicConfig(level=logging.INFO)

    input_doc_path = get_single_input_file("Data/PDF Files", "Master Approval Ltr (1)")
    output_dir = get_output_path(input_doc_path, base_output="Output")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline) # Will try with PdfPipelineOptions()
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Right now removed saving as REFERENCED as it was creating RECURSIVE FOLDER
    # https://docling-project.github.io/docling/examples/export_figures/
    
    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(
        md_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Save json with embedded pictures
    json_filename = output_dir / f"{doc_filename}-with-images.json"
    conv_res.document.save_as_json(
        json_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Save HTML with embedded pictures
    html_filename = output_dir / f"{doc_filename}-with-images.html"
    conv_res.document.save_as_html(
        html_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")

FigureExport()

INFO:docling.datamodel.document:detected formats: [<InputFormat.DOCX: 'docx'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for SimplePipeline with options hash 4cc01982ae99b46a2a63fcda46c47c35
INFO:docling.pipeline.base_pipeline:Processing document Master Approval Ltr (1).docx
INFO:docling.document_converter:Finished converting document Master Approval Ltr (1).docx in 0.05 sec.


AttributeError: 'NoneType' object has no attribute 'save'

In [19]:
import time
import logging
from pathlib import Path

def FigureExport():
    IMAGE_RESOLUTION_SCALE = 2.0
    logging.basicConfig(level=logging.INFO)

    input_doc_path = get_single_input_file("Data/PDF Files", "Master Approval Ltr (1)")
    output_dir = get_output_path(input_doc_path, base_output="Output-2")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline) # Will try with PdfPipelineOptions()
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Right now removed saving as REFERENCED as it was creating RECURSIVE FOLDER
    # https://docling-project.github.io/docling/examples/export_figures/
    
    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(
        md_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Save json with embedded pictures
    json_filename = output_dir / f"{doc_filename}-with-images.json"
    conv_res.document.save_as_json(
        json_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Save HTML with embedded pictures
    html_filename = output_dir / f"{doc_filename}-with-images.html"
    conv_res.document.save_as_html(
        html_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Then post-process to add your custom CSS
    with open(html_filename, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # Define custom CSS
    custom_css = '''
<style>
    /* Custom CSS for Molina Healthcare Authorization Letter */
    body {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 11pt !important;
        line-height: 1.2 !important;
        margin: 0 !important;
        padding: 0.5in !important;
        background-color: #ffffff !important;
        color: #000000 !important;
        max-width: 8.5in !important;
        margin: 0 auto !important;
    }

    /* Page setup */
    .page, main, .docling-document {
        width: 100% !important;
        max-width: 8.5in !important;
        min-height: 11in !important;
        margin: 0 !important;
        padding: 0 !important;
        page-break-after: always;
        position: relative;
    }

    .page:last-child, main:last-child {
        page-break-after: auto !important;
    }

    /* Header section with recipient info */
    .header-section {
        margin-bottom: 20px !important;
        font-size: 10pt !important;
        line-height: 1.1 !important;
    }

    /* Reset and override default styles */
    h1, h2, h3, h4, h5, h6 {
        font-family: Arial, Helvetica, sans-serif !important;
        margin-top: 15px !important;
        margin-bottom: 10px !important;
        font-weight: bold !important;
        color: #000000 !important;
    }

    h1 {
        text-align: center !important;
        font-size: 14pt !important;
        text-transform: uppercase !important;
        margin: 20px 0 !important;
    }

    p {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 11pt !important;
        line-height: 1.4 !important;
        margin-bottom: 12px !important;
        margin-top: 0 !important;
        text-align: justify !important;
        color: #000000 !important;
    }

    /* First few paragraphs are typically addresses */
    p:first-child, p:nth-child(2), p:nth-child(3), p:nth-child(4) {
        font-size: 10pt !important;
        line-height: 1.1 !important;
        margin-bottom: 5px !important;
        text-align: left !important;
    }

    /* ATTN line formatting */
    p:first-child {
        font-weight: bold !important;
        margin-bottom: 8px !important;
    }

    /* Service table */
    table {
        width: 100% !important;
        border-collapse: collapse !important;
        margin: 15px 0 !important;
        font-size: 10pt !important;
        font-family: Arial, Helvetica, sans-serif !important;
    }

    th {
        background-color: #f0f0f0 !important;
        border: 1px solid #000 !important;
        padding: 8px 6px !important;
        text-align: left !important;
        font-weight: bold !important;
        font-size: 10pt !important;
        font-family: Arial, Helvetica, sans-serif !important;
    }

    td {
        border: 1px solid #000 !important;
        padding: 8px 6px !important;
        vertical-align: top !important;
        font-size: 10pt !important;
        font-family: Arial, Helvetica, sans-serif !important;
    }

    /* List formatting */
    ul, ol {
        margin: 10px 0 !important;
        padding-left: 30px !important;
    }

    li {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 11pt !important;
        line-height: 1.4 !important;
        margin-bottom: 5px !important;
    }

    /* Images */
    img {
        max-width: 100% !important;
        height: auto !important;
        display: block !important;
        margin: 10px auto !important;
    }

    /* Figure containers */
    figure {
        text-align: center !important;
        margin: 20px 0 !important;
        page-break-inside: avoid !important;
    }

    figcaption {
        font-style: italic !important;
        font-size: 10pt !important;
        color: #666 !important;
        margin-top: 5px !important;
    }

    /* Merge field placeholders */
    .merge-field {
        background-color: #ffffcc !important;
        padding: 1px 3px !important;
        border: 1px dashed #ccc !important;
        font-style: italic !important;
    }

    /* Contact information styling */
    p:contains("Member Services"), p:contains("TTY:") {
        font-size: 10pt !important;
        margin: 15px 0 !important;
    }

    /* Footer elements */
    p:contains("UM_CCT_") {
        font-size: 8pt !important;
        color: #666 !important;
        margin: 5px 0 !important;
        padding-top: 10px !important;
        border-top: 1px solid #ccc !important;
    }

    /* Print-specific styles */
    @media print {
        body {
            margin: 0 !important;
            padding: 0.5in !important;
            font-size: 10pt !important;
        }
        
        .page, main {
            page-break-after: always !important;
            margin: 0 !important;
        }
        
        .page:last-child, main:last-child {
            page-break-after: auto !important;
        }

        * {
            -webkit-print-color-adjust: exact !important;
            color-adjust: exact !important;
        }
    }

    /* Responsive adjustments */
    @media screen and (max-width: 8.5in) {
        body {
            padding: 0.25in !important;
            font-size: 10pt !important;
        }
    }

    /* Override any default Docling styles */
    * {
        box-sizing: border-box !important;
    }

    div {
        font-family: Arial, Helvetica, sans-serif !important;
    }
</style>
'''

    # Insert custom CSS or replace existing styles
    enhanced_html = html_content.replace('</head>', f'{custom_css}</head>')
    
    # Highlight merge fields in curly braces
    import re
    enhanced_html = re.sub(r'\{([^}]+)\}', r'<span class="merge-field">{\1}</span>', enhanced_html)

    # Save the enhanced HTML
    with open(html_filename, 'w', encoding='utf-8') as f:
        f.write(enhanced_html)
    
    end_time = time.time() - start_time

    logging.info(f"Document converted and figures exported in {end_time:.2f} seconds.")

# Call the function
FigureExport()

INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 45e823ad9aa4b6fa53c56667a4a8e97c
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.pipeline.base_pipeline:Processing document Master Approval Ltr (1).pdf
INFO:docling.document_converter:Finished converting document Master Approval Ltr (1).pdf in 3.55 sec.
INFO:root:Document converted and figures exported in 3.79 seconds.


In [21]:
console = Console(width=210)  # for preventing Markdown table wrapped rendering
DOC_SOURCE = get_single_input_file("Data/DOCX Files", "Master Approval Ltr (1)")

def print_in_console(text):
    console.print(Panel(text))

converter = DocumentConverter()
doc = converter.convert(source=DOC_SOURCE).document

# Serialize to HTML
# serializer = HTMLDocSerializer(doc=doc)

# Serialize to Markdown
serializer = MarkdownDocSerializer(doc=doc)
ser_result = serializer.serialize()
ser_text = ser_result.text

# we here only print an excerpt to keep the output brief:
print_in_console(ser_text)

INFO:docling.datamodel.document:detected formats: [<InputFormat.DOCX: 'docx'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for SimplePipeline with options hash 4cc01982ae99b46a2a63fcda46c47c35
INFO:docling.pipeline.base_pipeline:Processing document Master Approval Ltr (1).docx
INFO:docling.document_converter:Finished converting document Master Approval Ltr (1).docx in 0.05 sec.


In [22]:
import time
import logging
import json
import re
from pathlib import Path

def json_to_html_converter(json_file_path, output_html_path):
    """
    Convert JSON file to HTML with custom Molina Healthcare styling
    """
    # Load the JSON file
    with open(json_file_path, 'r', encoding='utf-8') as f:
        doc_data = json.load(f)
    
    # Custom CSS for Molina Healthcare format
    custom_css = '''
<style>
    /* Custom CSS for Molina Healthcare Authorization Letter */
    body {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 11pt !important;
        line-height: 1.2 !important;
        margin: 0 !important;
        padding: 0.5in !important;
        background-color: #ffffff !important;
        color: #000000 !important;
        max-width: 8.5in !important;
        margin: 0 auto !important;
    }

    /* Page setup */
    .page {
        width: 100% !important;
        max-width: 8.5in !important;
        min-height: 11in !important;
        margin: 0 !important;
        padding: 0 !important;
        page-break-after: always;
        position: relative;
    }

    .page:last-child {
        page-break-after: auto !important;
    }

    /* Headers */
    h1, h2, h3, h4, h5, h6 {
        font-family: Arial, Helvetica, sans-serif !important;
        margin-top: 15px !important;
        margin-bottom: 10px !important;
        font-weight: bold !important;
        color: #000000 !important;
    }

    h1 {
        text-align: center !important;
        font-size: 14pt !important;
        text-transform: uppercase !important;
        margin: 20px 0 !important;
    }

    /* Paragraphs */
    p {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 11pt !important;
        line-height: 1.4 !important;
        margin-bottom: 12px !important;
        margin-top: 0 !important;
        text-align: justify !important;
        color: #000000 !important;
    }

    /* First few paragraphs (addresses) */
    .address-block {
        font-size: 10pt !important;
        line-height: 1.1 !important;
        margin-bottom: 5px !important;
        text-align: left !important;
    }

    .attn-line {
        font-weight: bold !important;
        margin-bottom: 8px !important;
    }

    /* Tables */
    table {
        width: 100% !important;
        border-collapse: collapse !important;
        margin: 15px 0 !important;
        font-size: 10pt !important;
        font-family: Arial, Helvetica, sans-serif !important;
    }

    th {
        background-color: #f0f0f0 !important;
        border: 1px solid #000 !important;
        padding: 8px 6px !important;
        text-align: left !important;
        font-weight: bold !important;
        font-size: 10pt !important;
    }

    td {
        border: 1px solid #000 !important;
        padding: 8px 6px !important;
        vertical-align: top !important;
        font-size: 10pt !important;
    }

    /* Lists */
    ul, ol {
        margin: 10px 0 !important;
        padding-left: 30px !important;
    }

    li {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 11pt !important;
        line-height: 1.4 !important;
        margin-bottom: 5px !important;
    }

    /* Images */
    img {
        max-width: 100% !important;
        height: auto !important;
        display: block !important;
        margin: 10px auto !important;
    }

    /* Figure containers */
    .figure-container {
        text-align: center !important;
        margin: 20px 0 !important;
        page-break-inside: avoid !important;
    }

    .figure-caption {
        font-style: italic !important;
        font-size: 10pt !important;
        color: #666 !important;
        margin-top: 5px !important;
    }

    /* Merge field highlighting */
    .merge-field {
        background-color: #ffffcc !important;
        padding: 1px 3px !important;
        border: 1px dashed #ccc !important;
        font-style: italic !important;
    }

    /* Print styles */
    @media print {
        body {
            margin: 0 !important;
            padding: 0.5in !important;
            font-size: 10pt !important;
        }
        
        .page {
            page-break-after: always !important;
            margin: 0 !important;
        }
        
        .page:last-child {
            page-break-after: auto !important;
        }

        * {
            -webkit-print-color-adjust: exact !important;
            color-adjust: exact !important;
        }
    }
</style>
'''
    
    def highlight_merge_fields(text):
        """Highlight merge fields in curly braces"""
        return re.sub(r'\{([^}]+)\}', r'<span class="merge-field">{\1}</span>', text)
    
    def process_element(element):
        """Process individual document elements from JSON"""
        element_type = element.get('type', '')
        text = element.get('text', '').strip()
        
        if not text and element_type not in ['picture', 'table']:
            return ''
        
        # Apply merge field highlighting
        text = highlight_merge_fields(text)
        
        # Handle different element types
        if element_type == 'title':
            level = element.get('level', 1)
            return f'<h{level}>{text}</h{level}>'
        
        elif element_type == 'paragraph':
            # Check if it's an address line (first few paragraphs usually)
            if text.startswith('ATTN:'):
                return f'<p class="address-block attn-line">{text}</p>'
            elif len(text) < 100 and any(keyword in text.lower() for keyword in ['street', 'ave', 'road', 'suite', 'ca ', 'member name:', 'member id']):
                return f'<p class="address-block">{text}</p>'
            else:
                return f'<p>{text}</p>'
        
        elif element_type == 'list_item':
            return f'<li>{text}</li>'
        
        elif element_type == 'picture':
            # Handle embedded images
            image_data = element.get('image', {})
            if image_data:
                image_b64 = image_data.get('image', '')
                if image_b64:
                    caption = element.get('caption', {}).get('text', '') if element.get('caption') else ''
                    return f'''
                    <div class="figure-container">
                        <img src="data:image/png;base64,{image_b64}" alt="{caption}">
                        {f'<div class="figure-caption">{caption}</div>' if caption else ''}
                    </div>
                    '''
        
        elif element_type == 'table':
            # Handle tables
            table_data = element.get('data', {})
            if table_data:
                html_table = '<table>'
                
                # Check if table has proper structure
                if isinstance(table_data, dict) and 'table_cells' in table_data:
                    cells = table_data['table_cells']
                    # Process table cells (this is a simplified approach)
                    html_table += '<tr>'
                    for cell in cells:
                        cell_text = cell.get('text', '')
                        html_table += f'<td>{highlight_merge_fields(cell_text)}</td>'
                    html_table += '</tr>'
                
                html_table += '</table>'
                return html_table
        
        else:
            # Default paragraph formatting
            return f'<p>{text}</p>'
        
        return ''
    
    # Start building HTML
    html_content = f'''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Molina Healthcare - Notice of Authorization</title>
        {custom_css}
    </head>
    <body>
        <div class="page">
    '''
    
    # Process main text elements
    main_text = doc_data.get('main_text', [])
    current_list_type = None
    
    for element in main_text:
        element_html = process_element(element)
        
        if element_html:
            element_type = element.get('type', '')
            
            # Handle list grouping
            if element_type == 'list_item':
                if current_list_type != 'list':
                    html_content += '<ul>'
                    current_list_type = 'list'
                html_content += element_html
            else:
                if current_list_type == 'list':
                    html_content += '</ul>'
                    current_list_type = None
                html_content += element_html
    
    # Close any open lists
    if current_list_type == 'list':
        html_content += '</ul>'
    
    html_content += '''
        </div>
    </body>
    </html>
    '''
    
    # Save the HTML file
    with open(output_html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    
    print(f"HTML generated from JSON and saved to: {output_html_path}")
    return output_html_path

def FigureExport():
    IMAGE_RESOLUTION_SCALE = 2.0
    logging.basicConfig(level=logging.INFO)
    input_doc_path = get_single_input_file("Data/PDF Files", "Master Approval Ltr (1)")
    output_dir = get_output_path(input_doc_path, base_output="Output-new")
    
    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline) # Will try with PdfPipelineOptions()
        }
    )
    
    start_time = time.time()
    conv_res = doc_converter.convert(input_doc_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem
    
    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")
    
    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, *level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")
        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")
    
    # Right now removed saving as REFERENCED as it was creating RECURSIVE FOLDER
    # https://docling-project.github.io/docling/examples/export_figures/
    
    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(
        md_filename,
        image_mode=ImageRefMode.EMBEDDED
    )
    
    # Save json with embedded pictures
    json_filename = output_dir / f"{doc_filename}-with-images.json"
    conv_res.document.save_as_json(
        json_filename,
        image_mode=ImageRefMode.EMBEDDED
    )
    
    # Generate HTML from JSON instead of directly from document
    html_filename = output_dir / f"{doc_filename}-from-json.html"
    json_to_html_converter(json_filename, html_filename)
    
    end_time = time.time() - start_time
    logging.info(f"Document converted and HTML generated from JSON in {end_time:.2f} seconds.")

# Call the function
FigureExport()

INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 45e823ad9aa4b6fa53c56667a4a8e97c
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.pipeline.base_pipeline:Processing document Master Approval Ltr (1).pdf
INFO:docling.document_converter:Finished converting document Master Approval Ltr (1).pdf in 3.75 sec.
INFO:root:Document converted and HTML generated from JSON in 3.96 seconds.


HTML generated from JSON and saved to: Output-new/PDF Files/Master Approval Ltr (1)/Master Approval Ltr (1)-from-json.html


In [27]:
import time
import logging
import json
import re
from pathlib import Path

def debug_json_structure(json_file_path):
    """
    Debug function to examine the JSON structure
    """
    with open(json_file_path, 'r', encoding='utf-8') as f:
        doc_data = json.load(f)
    
    print("=== JSON Structure Debug ===")
    print(f"Top-level keys: {list(doc_data.keys())}")
    
    # Check different possible locations for content
    for key in doc_data.keys():
        print(f"\n--- {key} ---")
        value = doc_data[key]
        if isinstance(value, list):
            print(f"List with {len(value)} items")
            if len(value) > 0:
                print(f"First item type: {type(value[0])}")
                if isinstance(value[0], dict):
                    print(f"First item keys: {list(value[0].keys())}")
        elif isinstance(value, dict):
            print(f"Dict with keys: {list(value.keys())}")
        else:
            print(f"Type: {type(value)}, Value preview: {str(value)[:100]}...")
    
    # Check for main_text specifically
    if 'main_text' in doc_data:
        main_text = doc_data['main_text']
        print(f"\n=== main_text analysis ===")
        print(f"Length: {len(main_text)}")
        for i, item in enumerate(main_text[:5]):  # First 5 items
            print(f"Item {i}: {item}")
    
    return doc_data

def improved_json_to_html_converter(json_file_path, output_html_path):
    """
    Improved JSON to HTML converter with better structure detection
    Supports Docling keys: texts, pictures, tables
    """
    # First debug the structure
    print("Debugging JSON structure...")
    doc_data = debug_json_structure(json_file_path)

    # Custom CSS
    custom_css = '''
<style>
    body {
        font-family: Arial, Helvetica, sans-serif !important;
        font-size: 11pt !important;
        line-height: 1.4 !important;
        margin: 0 auto !important;
        padding: 0.5in !important;
        max-width: 8.5in !important;
        background: #fff !important;
        color: #000 !important;
    }

    .page {
        width: 100% !important;
        max-width: 8.5in !important;
        min-height: 11in !important;
        margin: 0 auto !important;
        page-break-after: always;
    }

    h1, h2, h3, h4, h5, h6 {
        font-weight: bold !important;
        text-align: center !important;
        margin: 12px 0 !important;
    }

    p {
        margin: 6px 0 !important;
    }

    .merge-field {
        background-color: #ffffcc !important;
        padding: 1px 3px !important;
        border: 1px dashed #ccc !important;
        font-style: italic !important;
    }

    table {
        width: 100% !important;
        border-collapse: collapse !important;
        margin: 15px 0 !important;
        font-size: 10pt !important;
    }
    th, td {
        border: 1px solid #000 !important;
        padding: 6px !important;
        text-align: left !important;
    }
    th {
        background: #f0f0f0 !important;
    }

    img {
        max-width: 100% !important;
        height: auto !important;
        display: block !important;
        margin: 10px auto !important;
    }
</style>
'''

    def highlight_merge_fields(text):
        """Highlight merge fields in curly braces"""
        if not text:
            return text
        return re.sub(r'\{([^}]+)\}', r'<span class="merge-field">{\1}</span>', text)

    def extract_text_content(doc_data):
        """Extract text, picture captions, and tables"""
        content = []
        possible_keys = ['main_text', 'content', 'text', 'elements', 'body', 'texts', 'pictures', 'tables']

        for key in possible_keys:
            if key in doc_data:
                data = doc_data[key]
                print(f"Found content in '{key}' with {len(data) if isinstance(data, list) else 'non-list'} items")

                if isinstance(data, list):
                    if key == 'texts':
                        for item in data:
                            if isinstance(item, dict) and 'text' in item:
                                content.append({'type': 'paragraph', 'text': item['text']})

                    elif key == 'pictures':
                        for item in data:
                            if isinstance(item, dict):
                                caption = " ".join(item.get('captions', []))
                                content.append({
                                    'type': 'image',
                                    'text': caption,
                                    'image': item.get('image', "")
                                })

                    elif key == 'tables':
                        for item in data:
                            if isinstance(item, dict):
                                rows = item.get('data', [])
                                table_html = "<table>" + "".join(
                                    "<tr>" + "".join(f"<td>{cell}</td>" for cell in row) + "</tr>"
                                    for row in rows
                                ) + "</table>"
                                content.append({'type': 'table', 'text': table_html})

                    else:
                        content.extend(data)

                elif isinstance(data, str):
                    content.append({'type': 'paragraph', 'text': data})

        # Look deeper if still empty
        if not content:
            for key, value in doc_data.items():
                if isinstance(value, dict):
                    nested = extract_text_content(value)
                    if nested:
                        content.extend(nested)
                        break
        return content

    def process_text_item(item):
        """Process text, image, or table items"""
        if isinstance(item, str):
            return f'<p>{highlight_merge_fields(item)}</p>'

        elif isinstance(item, dict):
            text = highlight_merge_fields(item.get('text', ''))
            item_type = item.get('type', 'paragraph')

            if item_type == 'image':
                img_tag = f'<img src="data:image/png;base64,{item.get("image","")}" alt="embedded image"/>'
                caption = f"<p><em>{text}</em></p>" if text else ""
                return img_tag + caption

            elif item_type == 'table':
                return item['text']  # already HTML

            elif item_type in ['title', 'heading']:
                return f'<h2>{text}</h2>'

            else:
                return f'<p>{text}</p>'

        return ''

    # Start building HTML
    html_content = f'''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Converted Document</title>
    {custom_css}
</head>
<body>
    <div class="page">
'''

    # Extract structured content
    content = extract_text_content(doc_data)
    print(f"Extracted {len(content)} content items")

    if not content:
        # Fallback: deep search for "text" keys
        print("No structured content found, using fallback...")
        def extract_all_text(obj, texts=[]):
            if isinstance(obj, dict):
                for k, v in obj.items():
                    if k == 'text' and isinstance(v, str) and v.strip():
                        texts.append(v.strip())
                    else:
                        extract_all_text(v, texts)
            elif isinstance(obj, list):
                for i in obj:
                    extract_all_text(i, texts)
            return texts

        all_texts = extract_all_text(doc_data)
        for txt in all_texts:
            html_content += f'<p>{highlight_merge_fields(txt)}</p>\n'
    else:
        for item in content:
            processed = process_text_item(item)
            if processed:
                html_content += processed + "\n"

    html_content += '''
    </div>
</body>
</html>
'''

    # Save file
    with open(output_html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"HTML generated and saved to: {output_html_path}")
    return output_html_path

def FigureExport():
    IMAGE_RESOLUTION_SCALE = 2.0
    logging.basicConfig(level=logging.INFO)
    input_doc_path = get_single_input_file("Data/PDF Files", "Master Approval Ltr (1)")
    output_dir = get_output_path(input_doc_path, base_output="Output-new")
    
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline)
        }
    )
    
    start_time = time.time()
    conv_res = doc_converter.convert(input_doc_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem
    
    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")
    
    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, *level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")
        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")
    
    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(
        md_filename,
        image_mode=ImageRefMode.EMBEDDED
    )
    
    # Save json with embedded pictures
    json_filename = output_dir / f"{doc_filename}-with-images.json"
    conv_res.document.save_as_json(
        json_filename,
        image_mode=ImageRefMode.EMBEDDED
    )
    
    # Generate HTML from JSON with debugging
    html_filename = output_dir / f"{doc_filename}-from-json.html"
    improved_json_to_html_converter(json_filename, html_filename)
    
    end_time = time.time() - start_time
    logging.info(f"Document converted and HTML generated from JSON in {end_time:.2f} seconds.")

# Call the function
FigureExport()

INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 45e823ad9aa4b6fa53c56667a4a8e97c
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.pipeline.base_pipeline:Processing document Master Approval Ltr (1).pdf
INFO:docling.document_converter:Finished converting document Master Approval Ltr (1).pdf in 3.98 sec.
INFO:root:Document converted and HTML generated from JSON in 4.20 seconds.


Debugging JSON structure...
=== JSON Structure Debug ===
Top-level keys: ['schema_name', 'version', 'name', 'origin', 'furniture', 'body', 'groups', 'texts', 'pictures', 'tables', 'key_value_items', 'form_items', 'pages']

--- schema_name ---
Type: <class 'str'>, Value preview: DoclingDocument...

--- version ---
Type: <class 'str'>, Value preview: 1.5.0...

--- name ---
Type: <class 'str'>, Value preview: Master Approval Ltr (1)...

--- origin ---
Dict with keys: ['mimetype', 'binary_hash', 'filename']

--- furniture ---
Dict with keys: ['self_ref', 'children', 'content_layer', 'name', 'label']

--- body ---
Dict with keys: ['self_ref', 'children', 'content_layer', 'name', 'label']

--- groups ---
List with 2 items
First item type: <class 'dict'>
First item keys: ['self_ref', 'parent', 'children', 'content_layer', 'name', 'label']

--- texts ---
List with 35 items
First item type: <class 'dict'>
First item keys: ['self_ref', 'parent', 'children', 'content_layer', 'label', 'prov', 'ori

In [17]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
    OcrMacOptions,
    RapidOcrOptions,
    EasyOcrOptions

)
from docling.document_converter import DocumentConverter, PdfFormatOption

In [36]:
def main():
    data_folder = ""
    input_doc_path ="Data/PDF Files/Master Approval Ltr (1).pdf"

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only), RapidOcrOptions
    # ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    # ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
    # ocr_options = OcrMacOptions(force_full_page_ocr=True)
    # ocr_options = RapidOcrOptions(force_full_page_ocr=True)
    ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
    pipeline_options.ocr_options = ocr_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    doc = converter.convert(input_doc_path).document
    md = doc.export_to_dict()
    print(md)

In [48]:
import json
from pathlib import Path

def main():
    input_doc_path = "Data/PDF Files/Master Approval Ltr (1).pdf"

    # Set up output directory
    output_dir = Path("Output-ocr-rapid")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Configure pipeline
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.generate_page_images=True,       # Enable full-page images
    pipeline_options.generate_picture_images=True 
    pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
    pipeline_options.ocr_options = RapidOcrOptions(force_full_page_ocr=True)

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    doc = converter.convert(input_doc_path).document

    # Save JSON with embedded images
    json_path = output_dir / "document-with-images.json"
    doc.save_as_json(json_path, image_mode=ImageRefMode.EMBEDDED)
    print(f"✅ JSON with images saved to {json_path}")

    # Save HTML with embedded images
    html_path = output_dir / "document-with-images.html"
    doc.save_as_html(html_path, image_mode=ImageRefMode.EMBEDDED)
    print(f"✅ HTML with images saved to {html_path}")

    # (Optional) also save Markdown with images
    md_path = output_dir / "document-with-images.md"
    doc.save_as_markdown(md_path, image_mode=ImageRefMode.EMBEDDED)
    print(f"✅ Markdown with images saved to {md_path}")

if __name__ == "__main__":
    main()


INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
  PydanticSerializationUnexpectedValue(Expected `bool` - serialized value may not be as expected [input_value=(True,), input_type=tuple])
  return self.__pydantic_serializer__.to_python(
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 12edadb9462f377bd55b1cba98686c51
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
2025-08-19 07:26:53,076 - OrtInferSession - INFO: !!!Recommend to use rapidocr_paddle for inference on GPU.
INFO:OrtInferSession:!!!Recommend to use rapidocr_paddle for inference on GPU.
2025-08-19 07:26:53,077 - OrtInferSession - INFO: (For reference only) If you want to use GPU acceleration, you must do:
INFO:OrtInferSession:(For reference only) If you want to use GPU acceleration, you must do:
2025-08-19 07:26:53,077 - OrtInferSession - INFO: First, uninstall all onn

✅ JSON with images saved to Output-ocr-rapid/document-with-images.json
✅ HTML with images saved to Output-ocr-rapid/document-with-images.html
✅ Markdown with images saved to Output-ocr-rapid/document-with-images.md


In [18]:
def FigureExport():
    IMAGE_RESOLUTION_SCALE = 2.0
    logging.basicConfig(level=logging.INFO)

    input_doc_path = get_single_input_file("Data/PDF Files", "Master Approval Ltr (1)")
    output_dir = get_output_path(input_doc_path, base_output="Output-EasyOcrOptions")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
    # This is done by setting PdfPipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
    # The PdfPipelineOptions.generate_* are the selectors for the document elements which will be enriched
    # with the image field
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    # ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    # ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
    # ocr_options = OcrMacOptions(force_full_page_ocr=True)
    # ocr_options = RapidOcrOptions(force_full_page_ocr=True)
    # ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline) # Will try with PdfPipelineOptions(), SimplePipeline
        }
    )

    start_time = time.time()

    conv_res = doc_converter.convert(input_doc_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-table-{table_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = (
                output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            )
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Right now removed saving as REFERENCED as it was creating RECURSIVE FOLDER
    # https://docling-project.github.io/docling/examples/export_figures/
    
    # Save markdown with embedded pictures
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(
        md_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Save json with embedded pictures
    json_filename = output_dir / f"{doc_filename}-with-images.json"
    conv_res.document.save_as_json(
        json_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    # Save HTML with embedded pictures
    html_filename = output_dir / f"{doc_filename}-with-images.html"
    conv_res.document.save_as_html(
        html_filename,
        image_mode=ImageRefMode.EMBEDDED
    )

    end_time = time.time() - start_time

    _log.info(f"Document converted and figures exported in {end_time:.2f} seconds.")

FigureExport()

INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 7caed1e3295ed934b360d59e994b4762
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.pipeline.base_pipeline:Processing document Master Approval Ltr (1).pdf
INFO:docling.document_converter:Finished converting document Master Approval Ltr (1).pdf in 5.54 sec.
INFO:__main__:Document converted and figures exported in 5.77 seconds.


In [56]:
pip install tesserocr


Collecting tesserocr
  Downloading tesserocr-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (10 kB)
Downloading tesserocr-2.8.0-cp310-cp310-manylinux_2_28_x86_64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: tesserocr
Successfully installed tesserocr-2.8.0
Note: you may need to restart the kernel to use updated packages.


In [25]:
from docling_core.types.doc import TextItem

def enrich_document_with_styles(doc):
    page_width = 595  # PDF width in pt (A4 default). Replace with real if known.

    for element, _ in doc.iterate_items():
        if isinstance(element, TextItem) and element.prov:
            bbox = element.prov[0].bbox
            height = bbox.t - bbox.b

            # --- Estimate font size ---
            font_size = round(height)

            # --- Estimate alignment ---
            center_tolerance = 30
            text_mid = (bbox.l + bbox.r) / 2

            if abs(text_mid - page_width / 2) < center_tolerance:
                alignment = "center"
            elif bbox.l < 100:
                alignment = "left"
            elif page_width - bbox.r < 100:
                alignment = "right"
            else:
                alignment = "justify"

            # ✅ Store in .extra instead of .metadata
            element.extra["font_size"] = font_size
            element.extra["alignment"] = alignment

    return doc


In [28]:


def enrich_json_with_styles(json_file, page_width=595):
    """Add font_size and alignment fields to text objects in JSON."""
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for text_obj in data.get("texts", []):
        prov = text_obj.get("prov", [])
        if prov:
            bbox = prov[0].get("bbox", {})
            if bbox:
                # --- font size from bbox height ---
                height = bbox.get("t", 0) - bbox.get("b", 0)
                text_obj["font_size"] = round(height)

                # --- alignment from bbox position ---
                center_tolerance = 30
                mid = (bbox.get("l", 0) + bbox.get("r", 0)) / 2
                if abs(mid - page_width/2) < center_tolerance:
                    text_obj["alignment"] = "center"
                elif bbox.get("l", 0) < 100:
                    text_obj["alignment"] = "left"
                elif page_width - bbox.get("r", 0) < 100:
                    text_obj["alignment"] = "right"
                else:
                    text_obj["alignment"] = "justify"

    # overwrite JSON with enriched content
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    logging.info(f"✅ Enriched JSON saved with font_size & alignment → {json_file}")


def FigureExport():
    IMAGE_RESOLUTION_SCALE = 2.0
    logging.basicConfig(level=logging.INFO)

    input_doc_path = Path("Data/PDF Files/Master Approval Ltr (1).pdf")
    output_dir = Path("Output-withsize")
    output_dir.mkdir(parents=True, exist_ok=True)

    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True
    pipeline_options.ocr_options = EasyOcrOptions()   # ← set OCR if you want (EasyOcrOptions, TesseractOcrOptions etc.)

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.DOCX: WordFormatOption(pipeline_cls=SimplePipeline)
        }
    )

    start_time = time.time()
    conv_res = doc_converter.convert(input_doc_path)
    doc_filename = conv_res.input.file.stem

    # Save page images
    for page_no, page in conv_res.document.pages.items():
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")

    # Save images of figures and tables
    table_counter = 0
    picture_counter = 0
    for element, _level in conv_res.document.iterate_items():
        if isinstance(element, TableItem):
            table_counter += 1
            element_image_filename = output_dir / f"{doc_filename}-table-{table_counter}.png"
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

        if isinstance(element, PictureItem):
            picture_counter += 1
            element_image_filename = output_dir / f"{doc_filename}-picture-{picture_counter}.png"
            with element_image_filename.open("wb") as fp:
                element.get_image(conv_res.document).save(fp, "PNG")

    # Save JSON
    json_filename = output_dir / f"{doc_filename}-with-images.json"
    conv_res.document.save_as_json(json_filename, image_mode=ImageRefMode.EMBEDDED)

    # 🔥 Enrich JSON with font size & alignment
    enrich_json_with_styles(json_filename)

    # Save markdown
    md_filename = output_dir / f"{doc_filename}-with-images.md"
    conv_res.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

    # Save HTML
    html_filename = output_dir / f"{doc_filename}-with-images.html"
    conv_res.document.save_as_html(html_filename, image_mode=ImageRefMode.EMBEDDED)

    end_time = time.time() - start_time
    logging.info(f"Document converted and enriched in {end_time:.2f} seconds.")

FigureExport()


INFO:docling.datamodel.document:detected formats: [<InputFormat.PDF: 'pdf'>]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash 8c0b37dae868a9ee283b4e66a543d336


AttributeError: 'NoneType' object has no attribute 'kind'

In [31]:
from docx import Document
from docx.shared import Inches

# Create new Word document
doc = Document()

# ---------- PAGE 1 ----------
doc.add_paragraph("ATTN: ​Molina Healthcare|EMU Utilization Management​|Advanced Imaging​​|Transplant{Workflow}")
doc.add_paragraph("200 Oceangate, Suite 100")
doc.add_paragraph("Long Beach, CA 90802\n")

doc.add_paragraph("{MemFirstName} {MemMiddleName} {MemLastName}")
doc.add_paragraph("{MemAddress1} {MemAddress2}")
doc.add_paragraph("{MemCity}, {MemState} {MemZipCode}")
doc.add_paragraph("{ProvFirstName} {ProvLastName}")
doc.add_paragraph("{ProvAddress1} {ProvAddress2}")
doc.add_paragraph("{ProvCity}, {ProvState} {ProvZipCode}")

doc.add_page_break()

# Insert Logo


# ---------- PAGE 2 ----------
doc.add_picture("Screenshot 2025-08-19 163406.png", width=Inches(3.5))
doc.add_paragraph("Notice of Authorization\n")

doc.add_paragraph("Member Name:")
doc.add_paragraph("Optional (Care of):")
doc.add_paragraph("Member ID Number:")
doc.add_paragraph("Member DOB:")
doc.add_paragraph("Requesting Provider:")
doc.add_paragraph("Date of Request:")
doc.add_paragraph("Authorization Number:\n")

# Add table for Requested Service
table = doc.add_table(rows=2, cols=3)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Requested Service/Item'
hdr_cells[1].text = 'Quantity'
hdr_cells[2].text = 'Dates of Service'

row_cells = table.rows[1].cells
row_cells[0].text = 'Requested Service'
row_cells[1].text = 'Authorized Services'
row_cells[2].text = 'Start - End Date'

doc.add_paragraph("\nDear Member,\n")
doc.add_paragraph(
    "Thank you for being a valued member of our plan. "
    "We reviewed the request for the service(s) or item(s) listed above. "
    "We are pleased to inform you that we have approved the requested service(s) or item(s).\n\n"
    "You or your doctor or health care provider may call us if you need more of these service(s) or item(s). "
    "Additional service(s) or item(s) may require review and approval based on your plan, provider notes, "
    "and applicable rules at the time of service.\n\n"
    "Please call Member Services at 711 if you have any questions.\n\n"
    "Sincerely,"
)

doc.add_paragraph("UM_CCT_Reviewed_09/2023")

# Page Break
doc.add_page_break()
doc.add_picture("Screenshot 2025-08-19 163406.png", width=Inches(3.5))  # adjust size as needed

# Add Text
doc.add_paragraph("Short Plan Name:")
doc.add_paragraph("Alternate Format Statement")
doc.add_paragraph("Alternate Format Statement")
doc.add_paragraph("Federal Contracting Disclaimer\n")
doc.add_paragraph("UM_CCT_Reviewed_09/2023")

# Page Break
# ---------- PAGE 3 ----------
# Blank Page
doc.add_page_break()

# ---------- PAGE 4 ----------



doc.add_paragraph("Provider Name")
doc.add_paragraph("Provider Address Line 1")
doc.add_paragraph("Provider Address Line 2")
doc.add_paragraph("City, State Zip Code")

# Save the document
doc.save("Authorization_Letter_with_Logo.docx")

print("✅ Word file 'Authorization_Letter_with_Logo.docx' has been created!")

✅ Word file 'Authorization_Letter_with_Logo.docx' has been created!
