In [3]:
from docling.document_converter import DocumentConverter
import json
import time

# Sample PDF to convert.
source = r"/home/ubuntu/Desktop/OmniPDF/sample-files/NYJC 2021 H1 Physics 8867 P1 Answer.pdf"  # document per local path or URL

# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() differences can.
start_time = time.perf_counter()

converter = DocumentConverter()
doc = converter.convert(source).document

data = doc.export_to_dict()

# Remove the top-level 'body' and 'groups' entries (if present) before saving.
for ref in ("body", "groups"):
    data.pop(ref, None)

# Explicit encoding keeps the written file independent of the platform default.
with open("./output.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

print(f"Time taken: {time.perf_counter() - start_time}")

Time taken: 12.466126441955566


In [6]:
import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Image scale factor. Not referenced by any cell visible in this notebook —
# presumably intended for PdfPipelineOptions.images_scale (TODO confirm or remove).
IMAGE_RESOLUTION_SCALE = 2.0

In [11]:
from pathlib import Path
import time
from docling_core.types.doc import PictureItem, TableItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

def extract_images(source_pdf: Path, output_dir: Path, scale: float = 2.0) -> None:
    """Convert a PDF with docling and save every picture and table as a PNG.

    Args:
        source_pdf: PDF to convert; its stem is used as the prefix of every
            output file name.
        output_dir: Directory that receives the PNG files. It is created,
            including missing parent directories, when it does not exist.
        scale: Value assigned to ``PdfPipelineOptions.images_scale``.

    Returns:
        None. Files are written as ``<stem>-picture-<n>.png`` and
        ``<stem>-table-<n>.png`` (numbered in iteration order) and a summary
        line is printed.
    """
    opts = PdfPipelineOptions()
    opts.images_scale = scale
    # Ask the pipeline to produce page and picture images; get_image() below
    # relies on them being available on the converted document.
    opts.generate_page_images = True
    opts.generate_picture_images = True

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
    )

    res = converter.convert(source_pdf)

    # parents=True so a nested output path does not fail with FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)
    base = source_pdf.stem

    # Save each figure or table image
    pic_cnt, table_cnt = 0, 0
    for element, _ in res.document.iterate_items():
        if isinstance(element, PictureItem):
            pic_cnt += 1
            fname = output_dir / f"{base}-picture-{pic_cnt}.png"
        elif isinstance(element, TableItem):
            table_cnt += 1
            fname = output_dir / f"{base}-table-{table_cnt}.png"
        else:
            continue

        # get_image() may return None when no image is available for the
        # element; skip it instead of crashing on None.save().
        img = element.get_image(res.document)
        if img is None:
            print(f"Skipping {fname.name}: no image available for this element.")
            continue
        img.save(fname, "PNG")

    print(f"Extracted {pic_cnt} pictures and {table_cnt} tables.")


In [16]:
from pathlib import Path
import time
from docling_core.types.doc import PictureItem, TableItem
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

def extract_image(source_pdf: Path, output_dir: Path, scale: float = 2.0) -> dict:
    """Convert a PDF with docling, save picture/table PNGs and report their location.

    For every picture and table found while iterating the converted document,
    the page number and bounding box of its first provenance entry are printed
    and the element's image is written to ``output_dir``.

    Args:
        source_pdf: PDF to convert; its stem is used as the prefix of every
            output file name.
        output_dir: Directory that receives the PNG files. It is created,
            including missing parent directories, when it does not exist.
        scale: Value assigned to ``PdfPipelineOptions.images_scale``.

    Returns:
        The converted document as produced by ``export_to_dict()``, so that
        the result can be assigned and post-processed by the caller.
    """
    opts = PdfPipelineOptions()
    opts.images_scale = scale
    # Ask the pipeline to produce page and picture images; get_image() below
    # relies on them being available on the converted document.
    opts.generate_page_images = True
    opts.generate_picture_images = True

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
    )

    res = converter.convert(source_pdf)

    # parents=True so a nested output path does not fail with FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)
    base = source_pdf.stem

    pic_cnt, table_cnt = 0, 0
    for element, _ in res.document.iterate_items():
        if isinstance(element, PictureItem):
            pic_cnt += 1
            kind, index = "Picture", pic_cnt
            fname = output_dir / f"{base}-picture-{pic_cnt}.png"
        elif isinstance(element, TableItem):
            table_cnt += 1
            kind, index = "Table", table_cnt
            fname = output_dir / f"{base}-table-{table_cnt}.png"
        else:
            continue

        # Guard page number and bbox together: indexing prov[0] on an element
        # without provenance would raise IndexError.
        first_prov = element.prov[0] if element.prov else None
        page_no = first_prov.page_no if first_prov else None
        bbox_info = first_prov.bbox if first_prov else None
        print(f"[{kind} {index}] Page {page_no} BBox: {bbox_info}")

        # get_image() may return None when no image is available for the
        # element; skip it instead of crashing on None.save().
        img = element.get_image(res.document)
        if img is None:
            print(f"Skipping {fname.name}: no image available for this element.")
            continue
        img.save(fname, "PNG")

    print(f"Extracted {pic_cnt} pictures and {table_cnt} tables.")

    return res.document.export_to_dict()

In [17]:
# Input PDF and the folder that receives the extracted PNG files.
output_dir = Path("./img_test")
source = Path("/home/ubuntu/Desktop/OmniPDF/sample-files/NYJC 2021 H1 Physics 8867 P1 Answer.pdf")

# Run the extraction; whatever extract_image() returns is kept in `data`.
data = extract_image(source, output_dir, 2.0)

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Initializing pipeline for StandardPdfPipeline with options hash da04cf770b09a45fa6de4d8ea900b7bf
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.utils.accelerator_utils:Accelerator device: 'cuda:0'
INFO:docling.pipeline.base_pipeline:Processing document NYJC 2021 H1 Physics 8867 P1 Answer.pdf
INFO:docling.document_converter:Finished converting document NYJC 2021 H1 Physics 8867 P1 Answer.pdf in 13.62 sec.


[Picture 1] Page 1 BBox: l=56.9237060546875 t=772.1593780517578 r=118.49898529052734 b=706.8583221435547 coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>
[Picture 2] Page 4 BBox: l=112.28466033935547 t=665.0919952392578 r=533.45703125 b=359.3226623535156 coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>
[Picture 3] Page 5 BBox: l=152.28164672851562 t=698.5591583251953 r=460.92242431640625 b=561.9471435546875 coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>
[Picture 4] Page 5 BBox: l=102.35623931884766 t=472.9694519042969 r=531.9268188476562 b=376.6311950683594 coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>
[Table 1] Page 5 BBox: l=79.2287368774414 t=315.8590087890625 r=383.1485595703125 b=176.42694091796875 coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>
[Picture 5] Page 7 BBox: l=113.6047592163086 t=751.9551620483398 r=490.1370544433594 b=700.7430572509766 coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>
[Picture 6] Page 7 BBox: l=159.55690002441406 t=628.884323120

In [15]:
# Echo `data` so the notebook renders it as this cell's output.
data

{'schema_name': 'DoclingDocument',
 'version': '1.4.0',
 'name': 'NYJC 2021 H1 Physics 8867 P1 Answer',
 'origin': {'mimetype': 'application/pdf',
  'binary_hash': 15524432217299857479,
  'filename': 'NYJC 2021 H1 Physics 8867 P1 Answer.pdf'},
 'furniture': {'self_ref': '#/furniture',
  'children': [],
  'content_layer': 'furniture',
  'name': '_root_',
  'label': 'unspecified'},
 'texts': [{'self_ref': '#/texts/0',
   'parent': {'$ref': '#/groups/0'},
   'children': [],
   'content_layer': 'body',
   'label': 'text',
   'prov': [{'page_no': 1,
     'bbox': {'l': 145.2,
      't': 754.815,
      'r': 315.011,
      'b': 712.477,
      'coord_origin': 'BOTTOMLEFT'},
     'charspan': [0, 53]}],
   'orig': 'NANYANG JUNIOR COLLEGE JC 2 PRELIMINARY EXAM Higher 1',
   'text': 'NANYANG JUNIOR COLLEGE JC 2 PRELIMINARY EXAM Higher 1'},
  {'self_ref': '#/texts/1',
   'parent': {'$ref': '#/groups/0'},
   'children': [],
   'content_layer': 'body',
   'label': 'text',
   'prov': [{'page_no': 1,
  

In [1]:
import requests

# system_prompt = "Translate the following from english to chinese"


#################### Qwen setup ####################
import os

# Endpoint and bearer token of the chat-completions server. Both can be
# overridden with the LLM_URL / LLM_TOKEN environment variables so hosts and
# credentials do not have to be edited into the notebook; the defaults are the
# values that were previously hard-coded here.
# LLM_URL = "http://192.168.1.108:8000/v1/chat/completions" #chat/completions  
LLM_URL = os.environ.get("LLM_URL", "http://192.168.1.224:80/v1/chat/completions")

TOKEN = os.environ.get("LLM_TOKEN", "token-abc123")

def translate(prompt, input_lang=None, output_lang="English", timeout=120):
    """Translate ``prompt`` through the chat-completions endpoint at ``LLM_URL``.

    Args:
        prompt: Text to translate; sent as the user message.
        input_lang: Source language named in the system prompt. When falsy,
            the system prompt asks the model to detect the language itself.
        output_lang: Target language named in the system prompt.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled server would block the notebook
            indefinitely.

    Returns:
        The ``content`` of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    if input_lang:
        system_prompt = (
            f"You are a professional translator. Given the input language '{input_lang}', "
            f"think deeply and translate the following to '{output_lang}'. "
            f"Return only the translated text."
        )
    else:
        system_prompt = (
            f"You are a professional translator. Think deeply and translate the following to '{output_lang}'. "
            f"Detect the source language automatically and return only the translated text."
        )

    r = requests.post(
        LLM_URL,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {TOKEN}"
        },
        json={
            "model": "qwen2.5",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            "temperature": 0
        },
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as a KeyError on "choices".
    r.raise_for_status()

    return r.json()["choices"][0]["message"]["content"]

# Quick manual check of translate() with an English -> Malay sentence.
translate("I took the bus to school", input_lang="english", output_lang="malay")

'Saya membelakang ke sekolah.'

### Handle texts

In [17]:
input_lang = "english"
output_lang = "malay"

# Attach a "trans" field to every text item that has something to translate,
# preferring the "text" value and falling back to "orig".
for text_item in data["texts"]:
    source_text = text_item.get("text") or text_item.get("orig")
    if not source_text:
        continue
    text_item["trans"] = translate(source_text, input_lang=input_lang, output_lang=output_lang)

# Save the annotated document.
with open("translated.json", "w") as out_file:
    json.dump(data, out_file, indent=4)


### Handle tables (table cells)

In [19]:
# Walk every cell of every table lazily and attach a "trans" field to each
# cell whose "text" is non-empty.
all_cells = (cell for tbl in data["tables"] for cell in tbl["data"]["table_cells"])
for cell in all_cells:
    cell_text = cell.get("text")
    if not cell_text:
        continue
    cell["trans"] = translate(cell_text, input_lang=input_lang, output_lang=output_lang)

In [20]:
# Save the document again now that the table cells carry translations too.
with open("translated_more.json", "w") as out_file:
    out_file.write(json.dumps(data, indent=4))

In [1]:
import os

# Endpoint and bearer token of the chat-completions server. Both can be
# overridden with the LLM_URL / LLM_TOKEN environment variables so hosts and
# credentials do not have to be edited into the notebook; the defaults are the
# values that were previously hard-coded here.
LLM_URL = os.environ.get("LLM_URL", "http://192.168.1.108:80/v1/chat/completions")
# LLM_URL = "http://192.168.1.197:80/v1/chat/completions"
TOKEN = os.environ.get("LLM_TOKEN", "token-abc123")

def translate(prompt, source_lang=None, target_lang="English", timeout=120):
    """Translate ``prompt`` through the chat-completions endpoint at ``LLM_URL``.

    Args:
        prompt: Text to translate; sent as the user message.
        source_lang: Source language named in the system prompt. When falsy,
            the system prompt asks the model to detect the language itself.
        target_lang: Target language named in the system prompt.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled server would block the caller
            indefinitely.

    Returns:
        The ``content`` of the first choice in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    if source_lang:
        system_prompt = (
            f"You are a professional translator. Given the input language '{source_lang}', "
            f"think deeply and translate the following to '{target_lang}'. "
            f"Return only the translated text."
        )
    else:
        system_prompt = (
            f"You are a professional translator. Think deeply and translate the following to '{target_lang}'. "
            f"Detect the source language automatically and return only the translated text."
        )

    r = requests.post(
        LLM_URL,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {TOKEN}"
        },
        json={
            "model": "qwen2.5",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            "temperature": 0
        },
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as a KeyError on "choices".
    r.raise_for_status()

    return r.json()["choices"][0]["message"]["content"]

In [3]:
from models.translate import TranslateResponse, DoclingTranslationResponse
from fastapi import APIRouter, Query, Body

def doc_translate(payload: TranslateResponse = Body(...)):
    """Translate the texts and table cells carried by ``payload.message``.

    Every entry of ``message.texts`` with a non-empty ``"text"`` (or, failing
    that, ``"orig"``) and every table cell with a non-empty ``"text"`` gets a
    ``"trans"`` key holding the result of ``translate()``. Entries are accessed
    with ``.get``, so they are expected to be dict-like.

    Args:
        payload: Request model providing ``doc_id``, ``source_lang``,
            ``target_lang`` (falls back to ``"English"`` when falsy) and
            ``message``.

    Returns:
        A ``TranslateResponse`` carrying the annotated message; ``source_lang``
        is only passed to the constructor when it is truthy.
    """
    doc_id = payload.doc_id
    source_lang = payload.source_lang
    target_lang = payload.target_lang or "English"
    data = payload.message

    def _annotate(item, text):
        # Translate first, then make sure we write into a real dict.
        translation = translate(text, source_lang=source_lang, target_lang=target_lang)
        target = item if isinstance(item, dict) else dict(item)
        target["trans"] = translation
        return target

    # Free-standing text items: prefer "text", fall back to "orig".
    for idx, item in enumerate(data.texts):
        text = item.get("text") or item.get("orig")
        if text:
            data.texts[idx] = _annotate(item, text)  # update the list

    # Table cells: only the "text" field is considered.
    for table in data.tables:
        cells = table.get("data", {}).get("table_cells", [])
        for idx, cell in enumerate(cells):
            text = cell.get("text")
            if text:
                cells[idx] = _annotate(cell, text)

    # Only hand source_lang to the response model when one was supplied.
    response_fields = {"doc_id": doc_id, "target_lang": target_lang, "message": data}
    if source_lang:
        response_fields["source_lang"] = source_lang
    return TranslateResponse(**response_fields)

In [4]:
import json
from models.translate import TranslateResponse

# Read the sample request body from disk into a plain dict.
with open("./sample_json/input.json", "r") as sample_file:
    data = json.load(sample_file)

# Build the Pydantic request model from the dict.
payload = TranslateResponse(**data)

# Call the handler directly as a normal function and show what it returns.
result = doc_translate(payload=payload)
print(result)


FileNotFoundError: [Errno 2] No such file or directory: './docling_translation_service/sample_json/input.json'