In [56]:
import fitz  # PyMuPDF
import time
import os
from collections import defaultdict

In [57]:
# Input: the PDF that every extraction mode is benchmarked against.
pdf_name = "long_pdf.pdf"

# Output: folder that receives one dump file per extraction mode.
# exist_ok=True keeps this cell safe to re-run.
output_dir = "markdown_output"
os.makedirs(output_dir, exist_ok=True)

In [58]:
# Extraction modes to benchmark; each is passed unchanged to page.get_text().
modes = ["text", "dict", "rawdict"]

def extract_pdf_text(mode):
    """Extract every page of ``pdf_name`` using ``page.get_text(mode)``.

    Args:
        mode: Extraction mode string handed straight to ``page.get_text``.

    Returns:
        A single string: the per-page results joined with newlines. Page
        results that come back as a ``list`` or ``dict`` are converted with
        ``str()`` first so they can be joined and written to a file.
    """
    # Open via context manager so the document is closed on every call, even
    # if extraction raises. Previously the document was never closed, which
    # leaked one open handle per call (this function is called in a loop).
    with fitz.open(pdf_name) as doc:
        all_text = []
        for page in doc:
            text = page.get_text(mode)
            if isinstance(text, (list, dict)):
                # Convert to string for saving
                text = str(text)
            all_text.append(text)
    return "\n".join(all_text)


In [59]:
# Number of timed repetitions per mode (single source of truth for the loop
# bound; previously hard-coded as 20 with a matching `i == 19` check).
N_RUNS = 20

results = defaultdict(list)   # mode -> list of per-run durations in seconds
last_outputs = {}             # mode -> extracted text from the final run

# Get page count for per-page calculations; the context manager closes the
# document even if len() raises.
with fitz.open(pdf_name) as doc:
    page_count = len(doc)

for mode in modes:
    print(f"Running mode: {mode}")
    for run_idx in range(N_RUNS):
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted and may have coarse resolution.
        start = time.perf_counter()
        output = extract_pdf_text(mode)
        duration = time.perf_counter() - start
        results[mode].append(duration)
    # `output` still holds the result of the final run at this point.
    last_outputs[mode] = output

    # Save last output to markdown
    with open(os.path.join(output_dir, f"{mode}.md"), "w", encoding="utf-8") as f:
        f.write(last_outputs[mode])

# Calculate baseline for percentage comparison
# NOTE: requires "text" to be present in `modes`; otherwise the list is empty
# and the division below raises ZeroDivisionError.
text_times = results["text"]
text_avg_time = sum(text_times) / len(text_times)

Running mode: text
Running mode: dict
Running mode: rawdict
PDF has 104 pages
Text mode baseline: 2.27 ms/page
------------------------------------------------------------


In [64]:
# Header: document size and the "text" baseline all modes are compared against.
print(f"PDF has {page_count} pages")
print(f"Text mode baseline: {(text_avg_time * 1000 / page_count):.2f} ms/page")
print("-" * 60)


# One summary row per mode: mean seconds per run, mean milliseconds per page,
# and mean run time as a percentage of the "text" mode mean (text == 100%).
for mode in modes:
    run_times = results[mode]
    avg_time = sum(run_times) / len(run_times)
    ms_per_page = avg_time * 1000 / page_count
    percentage = avg_time / text_avg_time * 100
    print(f"Mode: {mode:<7} | Avg Time per Run: {avg_time:.4f} seconds | {ms_per_page:>6.2f} ms/page | {percentage:>6.1f}%")


PDF has 104 pages
Text mode baseline: 2.27 ms/page
------------------------------------------------------------
Mode: text    | Avg Time per Run: 0.2357 seconds |   2.27 ms/page |  100.0%
Mode: dict    | Avg Time per Run: 0.7178 seconds |   6.90 ms/page |  304.6%
Mode: rawdict | Avg Time per Run: 1.8684 seconds |  17.97 ms/page |  792.8%
