In [None]:
# Location of the PDF processed by this notebook, given relative to the working directory.
path = '../data/testdoc.pdf'

In [None]:
from docling_core.types.doc import BoundingBox

In [None]:
import logging
import os
import string
import time
from functools import lru_cache
from pathlib import Path
from typing import Optional

import fitz
import torch
from IPython.display import Markdown
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    EasyOcrOptions,
    PdfPipelineOptions,
    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem

In [None]:
def _get_pipeline_options(**kwargs):
    """
    Build the `PdfPipelineOptions` object used to configure PDF conversion.

    When `torch.cuda.is_available()` is true, OCR is configured with EasyOCR and
    the accelerator is set to CUDA with 4 threads. Otherwise OCR is configured
    with Tesseract and the accelerator options are left as created.

    Keyword Args:
        do_ocr (bool): Value assigned to `do_ocr` (default: True).
        use_gpu (bool): EasyOCR `use_gpu`; CUDA branch only (default: True).
        lang (list): EasyOCR languages; CUDA branch only (default: ['en', 'ne']).
        confidence_threshold (float): EasyOCR confidence threshold; CUDA branch
            only (default: 0.1).
        tess_lang (list): Tesseract languages; non-CUDA branch only
            (default: ['eng', 'nep']).
        do_table_structure (bool): Value assigned to `do_table_structure`
            (default: True).
        do_cell_matching (bool): Value assigned to
            `table_structure_options.do_cell_matching` (default: True).

    Returns:
        PdfPipelineOptions: The configured options object.
    """
    read = kwargs.get
    options = PdfPipelineOptions()
    options.do_ocr = read('do_ocr', True)

    if not torch.cuda.is_available():
        # No GPU: use Tesseract for OCR.
        options.ocr_options = TesseractOcrOptions(
            lang=read('tess_lang', ['eng', 'nep']),
        )
    else:
        # GPU present: use EasyOCR and run the pipeline on CUDA.
        options.ocr_options = EasyOcrOptions(
            use_gpu=read('use_gpu', True),
            lang=read('lang', ['en', 'ne']),
            confidence_threshold=read('confidence_threshold', 0.1),
        )
        options.accelerator_options = AcceleratorOptions(
            num_threads=4, device=AcceleratorDevice.CUDA
        )

    options.do_table_structure = read("do_table_structure", True)
    options.table_structure_options.do_cell_matching = read("do_cell_matching", True)

    # Image-generation settings are currently disabled; kept here for reference:
    # options.images_scale = kwargs.get("images_scale", 2.0)
    # options.generate_page_images = kwargs.get("generate_page_images", True)
    # options.generate_picture_images = kwargs.get("generate_picture_images", True)

    return options

In [None]:
# Build the pipeline options with OCR switched off; all other settings use the helper's defaults.
pipeline_options = _get_pipeline_options(do_ocr=False)

# PDF inputs are handled by the pypdfium2 backend with the options built above.
pdf_format_option = PdfFormatOption(
    backend=PyPdfiumDocumentBackend,
    pipeline_options=pipeline_options,
)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

In [None]:
# Convert the PDF and keep only the `.document` of the conversion result.
# Note that, despite its name, `conv_result` holds the document object itself.
conv_result = doc_converter.convert(path).document

In [None]:
def extract_items_by_bbox_and_page(docling_document, target_bbox, target_page_no):
    """
    Collect the text and table items whose bounding box lies inside a target box on one page.

    Only the first provenance entry (`prov[0]`) of each item is inspected. Items
    without any provenance entry are skipped, because they have no page number
    or box to compare against.

    Args:
        docling_document (object): Document exposing `texts` and `tables` lists,
            e.g. the object obtained from `doc_converter.convert(path).document`.
        target_bbox (tuple): The target bounding box as four numbers
            `(x0, y0, x1, y1)` with `x0 <= x1` and `y0 <= y1`.
        target_page_no (int): The page number compared against `prov.page_no`.

    Returns:
        list: Matching items, texts first and then tables, each group in its
        original order. Boxes touching the target's edge count as inside.

    Note:
        The comparison uses the raw numbers returned by `bbox.as_tuple()`; no
        conversion between coordinate origins is performed. `target_bbox` is
        therefore assumed to be expressed in the same coordinate system as the
        items' boxes -- TODO confirm for each caller.
    """
    x0, y0, x1, y1 = target_bbox
    items = []

    for element in docling_document.texts + docling_document.tables:
        # An item with no provenance cannot be located on a page; indexing
        # prov[0] on it would raise IndexError.
        if not element.prov:
            continue

        prov = element.prov[0]
        if prov.page_no != target_page_no:
            continue

        ex0, ey0, ex1, ey1 = prov.bbox.as_tuple()

        # Keep the item only when its box is fully contained in the target box.
        if x0 <= ex0 and y0 <= ey0 and ex1 <= x1 and ey1 <= y1:
            items.append(element)

    return items


In [None]:
from docling_core.types.doc import BoundingBox, CoordOrigin

In [None]:

# Region of interest used to select items below, declared with a top-left coordinate origin.
# The four numbers are presumably (left, top, right, bottom) -- confirm against `BoundingBox.from_tuple`.
box = BoundingBox.from_tuple((56.79999923706055, 236.43328857421875, 559.0999755859375, 720.9435424804688), origin=CoordOrigin.TOPLEFT)

In [None]:
# Select the text and table items on the target page that lie inside `box`.
# This reuses `extract_items_by_bbox_and_page` instead of repeating its loop inline.
TARGET_PAGE_NO = 4
items = extract_items_by_bbox_and_page(conv_result, box.as_tuple(), TARGET_PAGE_NO)


In [None]:
# NOTE(review): the name looks like a typo of `element_refs`, and nothing visible in this
# notebook reads it -- confirm before renaming or removing.
elemet_refs = []

In [None]:
# Inspect the direct children of the document body.
conv_result.body.children

In [None]:
def get_element_index(cref, document=None) -> Optional[int]:
    """
    Return the position of a reference among the direct children of a document body.

    Only direct children are searched; references nested inside a group child
    are not resolved here.

    Args:
        cref (str): The reference string to look up, compared against each
            child's `cref` attribute.
        document (object, optional): Document whose `body.children` are
            searched. Defaults to the notebook-level `conv_result`.

    Returns:
        Optional[int]: Zero-based index of the first child whose `cref` equals
        `cref`, or None when no child matches.
    """
    if document is None:
        document = conv_result

    for index, child in enumerate(document.body.children):
        if child.cref == cref:
            return index
    return None


In [None]:
# References of the body's direct children. Some of them are "#/groups/..." references,
# each of which stands for a group containing several child elements.
children_refs = [c.cref for c in conv_result.body.children]
def resolve_children(ref, conv_result):
    """
    Expand a reference into the non-group references it stands for.

    Args:
        ref (str): A reference string. References starting with "#/groups/"
            are expanded; the trailing path segment is used as an index into
            `conv_result.groups`.
        conv_result (object): Document exposing a `groups` sequence whose
            entries have `children` with a `cref` attribute.

    Returns:
        str | list: `ref` itself when it is not a group reference; otherwise a
        flat list of the references of the group's children, with nested
        groups expanded recursively.
    """
    if not ref.startswith("#/groups/"):
        return ref

    group = conv_result.groups[int(ref.rsplit("/", 1)[-1])]

    flattened = []
    for member in group.children:
        expanded = resolve_children(member.cref, conv_result)
        if isinstance(expanded, list):
            # Nested group: splice its references in place.
            flattened.extend(expanded)
        else:
            flattened.append(expanded)
    return flattened
    
# Flatten the top-level references: a group reference contributes all of the
# references it resolves to, any other reference contributes itself.
updated_children_refs = []
for ref in children_refs:
    resolved = resolve_children(ref, conv_result)
    updated_children_refs += resolved if isinstance(resolved, list) else [resolved]

In [None]:
# Show the reference object of the text item at index 6.
conv_result.texts[6].get_ref()

In [None]:
# Render a slice of the document (from_element=0, to_element=7) as Markdown.
# NOTE(review): `Markdown` comes from `IPython.display` and must be imported before this cell
# for a top-to-bottom run to succeed.
Markdown(conv_result.export_to_markdown(from_element=0, to_element=7))

In [None]:
# Reference string (`cref`) of the text item at index 5, used for the lookup below.
elem_ref = conv_result.texts[5].get_ref().cref

In [None]:
# Position of that reference among the body's direct children.
# `list.index` raises ValueError when the reference is not a direct child.
children_refs.index(elem_ref)

In [None]:
# Display the items selected inside the bounding box.
items

In [None]:
import npttf2utf

@lru_cache(maxsize=1)
def _english_dictionary():
    """Create the `en_US` spell-check dictionary once and reuse it on later calls."""
    # Imported lazily so the rest of the notebook runs without pyenchant as
    # long as English-word checking is never requested.
    import enchant

    return enchant.Dict("en_US")


def is_english_word(word: str) -> bool:
    """
    Check if a word is an English word.

    The word is lower-cased, stripped of surrounding whitespace and then of
    surrounding punctuation before being checked. Anything that is not purely
    alphabetic after that clean-up is rejected without consulting the
    dictionary.

    Args:
        word (str): The word to check.

    Returns:
        bool: True if the cleaned word is found in the `en_US` dictionary,
        False otherwise (including when the dictionary check raises ValueError).
    """
    candidate = word.lower().strip().strip(string.punctuation)
    if not candidate.isalpha():
        return False

    english_dict = _english_dictionary()
    try:
        return english_dict.check(candidate)
    except ValueError:
        return False


@lru_cache(maxsize=1)
def _font_mapper():
    """Load the font-mapping rules from `../assets/font_mapper.json` once and reuse them."""
    # The relative path is resolved against the working directory at the time
    # of the first call; later calls return the same mapper object.
    return npttf2utf.FontMapper(
        os.path.abspath(os.path.join("..", "assets", "font_mapper.json"))
    )


def map_to_unicode(text, check_english_words: bool = False) -> str:
    """
    Map the text to Unicode characters using the font mapper.

    Args:
        text (str): The text to map.
        check_english_words (bool): When True, the text is split on single
            spaces and every word recognised by `is_english_word` is kept
            as-is while the remaining words are mapped individually
            (default: False, which maps the whole text in one call).

    Returns:
        str: The text mapped to Unicode characters.
    """
    # Reuse one mapper: this function may be called once per text span, and
    # rebuilding the mapper would re-read the JSON rules on every call.
    mapper = _font_mapper()

    def _convert(fragment):
        return mapper.map_to_unicode(
            fragment,
            unescape_html_input=False,
            escape_html_output=False
        )

    if not check_english_words:
        return _convert(text)

    return " ".join(
        word if is_english_word(word) else _convert(word)
        for word in text.split(" ")
    )

In [None]:
@lru_cache(maxsize=1)
def _fonts_to_map():
    """Read the font names listed in `../assets/nepali_fonts.txt` once, one name per line."""
    fonts_file_path = os.path.abspath(
        os.path.join("..", "assets", "nepali_fonts.txt")
    )
    with open(fonts_file_path, "r", encoding="utf-8") as f:
        # Blank lines are dropped: an empty entry would otherwise match every
        # span that reports no font name (""), and its text would be mapped.
        return frozenset(line.strip() for line in f if line.strip())


def get_text_in_bbox(doc: fitz.Document, page: int, bbox: fitz.Rect) -> str:
    """
    Extract text within a bounding box on a given page and map to Unicode.

    Each text span found inside `bbox` is inspected; when its font name appears
    in the font list file, its text is passed through `map_to_unicode`.

    Args:
        doc (fitz.Document): The opened PDF; indexed as `doc[page]`.
        page (int): Index used to look up the page in `doc` (callers in this
            notebook pass the docling page number minus one).
        bbox (fitz.Rect): Clip rectangle handed to `get_text`; callers in this
            notebook pass a 4-number tuple.

    Returns:
        str: The span texts joined with single spaces, in the order returned
        by `get_text`. Empty string when no span is found.
    """
    fonts_to_map = _fonts_to_map()
    blocks = doc[page].get_text("dict", clip=bbox)["blocks"]

    extracted_text = []
    for block in blocks:
        # Blocks without a "lines" entry contribute nothing.
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span.get("text", "")
                if span.get("font", "") in fonts_to_map:
                    # Convert to Unicode when the span uses one of the listed fonts.
                    text = map_to_unicode(text)
                extracted_text.append(text)

    return " ".join(extracted_text)

In [None]:
# Replace the text of each selected item with the text PyMuPDF reads from the same
# region of the page, but only when PyMuPDF returns more than a handful of characters.
MIN_REPLACEMENT_CHARS = 10

fitz_doc = fitz.open(path)
for item in items:
    # `items` is built from texts AND tables. Only items that actually carry a
    # `text` attribute can have it replaced.
    # NOTE(review): docling table items appear to have no `text` field, so
    # assigning one would fail -- confirm.
    if not hasattr(item, "text"):
        continue

    prov = item.prov[0]
    page_no = prov.page_no

    # Convert the box to a top-left origin using the page height before passing it to fitz.
    page_height = conv_result.pages[page_no].size.height
    bbox = prov.bbox.to_top_left_origin(page_height)

    # The fitz page index is the docling page number minus one.
    fitz_text = get_text_in_bbox(fitz_doc, page=page_no - 1, bbox=bbox.as_tuple())
    if len(fitz_text) > MIN_REPLACEMENT_CHARS:
        item.text = fitz_text

In [None]:
# Work on the first table of the document.
table = conv_result.tables[0]

In [None]:
# Inspect the table's first provenance entry (its page number and bounding box are used below).
table.prov[0]

In [None]:
# Re-read the text of every table cell with PyMuPDF, using the cell's own bounding box.
# The page number and page height are the same for every cell, so they are looked up once.
page_no = table.prov[0].page_no
page_height = conv_result.pages[page_no].size.height

for cell in table.data.table_cells:
    # NOTE(review): a cell's bbox is presumably optional; a cell without one has no
    # region to read from, so it is left unchanged -- confirm.
    if cell.bbox is None:
        continue

    # A dedicated name is used here so the notebook-level `box` (the region of
    # interest defined earlier) is not overwritten by a cell box.
    cell_box = cell.bbox.to_top_left_origin(page_height)

    # The fitz page index is the docling page number minus one.
    cell.text = get_text_in_bbox(fitz_doc, page_no - 1, cell_box.as_tuple())


In [None]:
from IPython.display import Markdown

In [None]:
# Render the table as Markdown to check the replaced cell texts.
Markdown(table.export_to_markdown())

In [None]:
# Render a slice of the document (from_element=20, to_element=25) as Markdown.
Markdown(conv_result.export_to_markdown(from_element=20, to_element=25))