# Preprocessing for Docling

Docling fails to parse large files because it enforces a timeout on each of its tasks. The PDFs therefore need to be split into smaller chunks before parsing; compressing them first is an additional improvement.

In [None]:
import ghostscript
import sys
import os
import fitz
import httpx
from typing import List, Dict
import json
from tqdm import tqdm

Compression function

In [17]:
def compress_pdf(input_path, output_path, quality="screen"):
    """
    Compress a PDF using the Ghostscript Python SDK and print a size report.

    Args:
        input_path: Path of the PDF to compress; must already exist.
        output_path: Path the compressed PDF is written to.
        quality: Name of the Ghostscript ``-dPDFSETTINGS`` preset to use; one
            of "screen", "ebook", "printer", "prepress" or "default".

    Raises:
        ValueError: If ``quality`` is not one of the known presets.
        FileNotFoundError: If ``input_path`` does not exist.

    Ghostscript failures, and any other error raised while compressing or
    while measuring the resulting files, are reported with ``print`` and not
    re-raised, so the function returns ``None`` in every other case.
    """
    quality_settings = {
        "screen": "/screen",
        "ebook": "/ebook",
        "printer": "/printer",
        "prepress": "/prepress",
        "default": "/default",
    }

    if quality not in quality_settings:
        raise ValueError(f"Invalid quality setting: {quality}")

    # Double check file exists
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    args = [
        "gs",  # Just a label; not actually used
        "-sDEVICE=pdfwrite",
        "-dCompatibilityLevel=1.4",
        f"-dPDFSETTINGS={quality_settings[quality]}",
        "-dNOPAUSE",
        "-dQUIET",
        "-dBATCH",
        f"-sOutputFile={output_path}",
        input_path,
    ]

    try:
        # Pass plain strings (no need to encode)
        with ghostscript.Ghostscript(*args):
            pass

        # Measure both files before announcing success, so the success line
        # is only printed once the output file is known to exist.
        output_size = os.path.getsize(output_path)
        input_size = os.path.getsize(input_path)

        print(f"✅ Compressed PDF saved to: {output_path}")
        print(f"📦 Original size: {input_size / 1024:.2f} KB")
        print(f"📉 Compressed size: {output_size / 1024:.2f} KB")
        if input_size > 0:
            reduction = 100 * (1 - output_size / input_size)
            print(f"🧮 Compression: {reduction:.2f}%")
        else:
            # A zero-byte input has no meaningful ratio; avoid dividing by zero.
            print("🧮 Compression: n/a (input file is empty)")
    except ghostscript.GhostscriptError as e:
        print(f"❌ Ghostscript compression failed: {e}")
    except Exception as ex:
        print(f"❌ Unexpected error: {ex}")

Parse Function

In [9]:
async def docling_parse(input_path: str) -> Dict:
    """
    Upload one file to the local Docling convert endpoint and return its reply.

    The file at ``input_path`` is sent as a multipart upload (declared as
    ``application/pdf``) together with a fixed set of conversion options.
    The HTTP client uses a 60 second timeout.

    Args:
        input_path: Path of the file to upload.

    Returns:
        The response body decoded from JSON.

    Raises:
        Whatever ``response.raise_for_status()`` raises when the server
        answers with an error status, plus any error from opening the file
        or performing the request.
    """
    endpoint = "http://localhost:5001/v1alpha/convert/file"
    upload_name = os.path.basename(input_path)

    # Conversion options sent as form fields alongside the uploaded file.
    form_fields = dict(
        from_formats=["docx", "pptx", "html", "image", "pdf", "asciidoc", "md", "xlsx"],
        to_formats=["md", "json", "html", "text", "doctags"],
        image_export_mode="placeholder",
        do_ocr=True,
        force_ocr=False,
        ocr_engine="easyocr",
        ocr_lang=["en"],
        pdf_backend="dlparse_v2",
        table_mode="fast",
        abort_on_error=False,
        return_as_file=False,
    )

    async with httpx.AsyncClient(timeout=60.0) as client:
        with open(input_path, "rb") as pdf_stream:
            response = await client.post(
                endpoint,
                files={"files": (upload_name, pdf_stream, "application/pdf")},
                data=form_fields,
            )

    # Checked only after the client and the file handle have been released.
    response.raise_for_status()
    payload = response.json()
    return payload

Split Function

In [10]:
def split_pdf_into_chunks(input_path: str, output_dir: str, chunk_size: int = 20) -> List[Dict]:
    """
    Split a PDF into consecutive files of at most ``chunk_size`` pages each.

    Each chunk is saved in ``output_dir`` (created if missing) as
    ``<input basename>_pages_<first>_to_<last>.pdf``, where the page numbers
    are 1-based and inclusive.

    Args:
        input_path: Path of the PDF to split.
        output_dir: Directory that receives the chunk files.
        chunk_size: Maximum number of pages per chunk; must be at least 1.

    Returns:
        One dict per chunk, in page order, with the keys ``"start_page"``,
        ``"end_page"`` (both 1-based, inclusive) and ``"file_path"``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Validate before touching the filesystem: 0 would otherwise fail late
    # inside range(), and a negative step would silently produce no chunks.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    os.makedirs(output_dir, exist_ok=True)
    base = os.path.splitext(os.path.basename(input_path))[0]

    chunks = []
    # Context managers close the source and every chunk document even if a
    # save fails part-way through.
    with fitz.open(input_path) as doc:
        page_count = len(doc)
        for start in range(0, page_count, chunk_size):
            end = min(start + chunk_size, page_count)
            out_path = os.path.join(output_dir, f"{base}_pages_{start+1}_to_{end}.pdf")
            with fitz.open() as chunk_doc:
                # insert_pdf takes 0-based, inclusive page indices.
                chunk_doc.insert_pdf(doc, from_page=start, to_page=end - 1)
                chunk_doc.save(out_path)
            chunks.append({
                "start_page": start + 1,
                "end_page": end,
                "file_path": out_path,
            })
    print(f"✅ Split into {len(chunks)} chunks")
    return chunks

Function that sends each chunk to the Docling parse function while preserving the original page numbers

In [None]:
async def process_all_pages(chunks, output_jsonl_path="output.jsonl"):
    """
    Parse every chunk with ``docling_parse`` and write the results as JSON Lines.

    The output file is opened in write mode, so an existing file is
    overwritten. For each chunk one line is written containing its
    ``start_page``, ``end_page`` and the parse result under ``"text"``.
    A chunk that fails is reported with ``print`` and skipped; processing
    continues with the next chunk.

    Args:
        chunks: Iterable of dicts with the keys ``"file_path"``,
            ``"start_page"`` and ``"end_page"``.
        output_jsonl_path: Destination path of the JSON Lines file.
    """
    with open(output_jsonl_path, "w", encoding="utf-8") as jsonl_out:
        progress = tqdm(chunks, desc="Parsing chunks")
        for chunk in progress:
            try:
                parsed = await docling_parse(chunk["file_path"])
                record = json.dumps({
                    "start_page": chunk["start_page"],
                    "end_page": chunk["end_page"],
                    "text": parsed,
                })
                jsonl_out.write(record + "\n")
            except Exception as err:
                # Best effort: one failing chunk must not abort the whole run.
                print(f"❌ Error parsing {chunk}: {err}")

Then we test.

In [None]:
import datetime
from dotenv import load_dotenv
# Presumably loads variables from a local .env file into the process
# environment so the os.getenv lookups in the next cell can see them —
# confirm against the python-dotenv documentation.
load_dotenv()

In [None]:
# Run configuration, read from the environment with fallbacks.
# The two file names are given without the ".pdf" extension.
INPUT_FILENAME = os.environ.get("INPUT_FILENAME", "input")
COMPRESSION_OUTPUT_FILENAME = os.environ.get("COMPRESSION_OUTPUT_FILENAME", "compressed_output")
COMPRESSION_QUALITY = os.environ.get("GHOSTSCRIPT_COMPRESSION_QUALITY", "ebook")
# Timestamp (YYYYMMDD_HHMMSS, local time) used to make the output name unique.
current_time = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"

In [None]:
# Build the concrete PDF paths from the configured base names.
# NOTE: `input` shadows the builtin of the same name; the name is kept
# because a later cell passes it to compress_pdf.
input = f"{INPUT_FILENAME}.pdf"
compression_output = f"{COMPRESSION_OUTPUT_FILENAME}{current_time}.pdf"

In [None]:
# Compress the source PDF into the timestamped output path. An unknown quality
# or a missing input file raises; Ghostscript failures are only printed.
compress_pdf(input, compression_output, quality=COMPRESSION_QUALITY)

In [3]:
# Split the compressed PDF (its path is held in `compression_output`) into
# files of at most 20 pages each, written to the "chunks" directory.
chunks = split_pdf_into_chunks(compression_output, "chunks", chunk_size=20)

NameError: name 'split_pdf_into_chunks' is not defined

In [None]:
# Parse every chunk and write one JSON line per chunk to the default
# "output.jsonl". NOTE(review): top-level `await` assumes this runs in a
# notebook/IPython cell; a plain script would need asyncio.run() — confirm.
await process_all_pages(chunks)