In [None]:
from fastapi import FastAPI, UploadFile, File, HTTPException
import uvicorn
from pathlib import Path
import time
import os
import asyncio
from concurrent.futures import ThreadPoolExecutor
from PyPDF2 import PdfReader, PdfWriter
import pdfplumber
from typing import List, Dict
from functools import partial
import aiofiles
from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption, ImageFormatOption, XMLJatsFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# FastAPI application object for the parser service.
app = FastAPI(title="Docling Parser - OCR")

# Dedicated 4-worker pool meant for CPU-intensive operations.
# NOTE(review): nothing in the code visible here submits work to this pool;
# the asyncio.to_thread(...) calls in this file use the event loop's default
# executor instead.
thread_pool = ThreadPoolExecutor(max_workers=4)

async def delete_temp_files():
    """Best-effort removal of leftover ``temp_*`` files in the working directory.

    Every directory entry whose name starts with ``temp_`` is removed, except
    real directories, which ``os.remove`` cannot delete and which are left in
    place. The directory scan and each removal run in a worker thread so the
    event loop is not blocked by filesystem calls.

    Entries that cannot be removed (already gone, permission denied, ...) are
    skipped; a failed removal never aborts the cleanup.

    Returns:
        None
    """
    def _list_temp_entries():
        # scandir exposes the entry type, letting us skip directories up front
        # instead of relying on os.remove failing for them.
        with os.scandir('.') as entries:
            return [
                entry.name
                for entry in entries
                if entry.name.startswith('temp_')
                and not entry.is_dir(follow_symlinks=False)
            ]

    for temp_file in await asyncio.to_thread(_list_temp_entries):
        try:
            await asyncio.to_thread(os.remove, temp_file)
        except OSError:
            # Deliberately best-effort: a vanished or locked file is not fatal.
            continue

def has_images(pdf_path: str, page_number: int) -> bool:
    """Report whether a page of a PDF holds at least one image object.

    Args:
        pdf_path: Path of the PDF file, opened with pdfplumber.
        page_number: Index into the document's page list (0 is the first page).

    Returns:
        True when pdfplumber lists one or more images on that page, else False.
    """
    with pdfplumber.open(pdf_path) as document:
        image_objects = document.pages[page_number].images
        found_any = len(image_objects) != 0
    return found_any

async def process_single_page(page_num: int, reader: PdfReader, file_name: str) -> str:
    """Convert one page of a PDF into a Markdown section.

    The page is written to a temporary single-page PDF in the working
    directory and converted with docling: the VLM pipeline is used when the
    page contains images, the default PDF pipeline otherwise. The temporary
    file is removed afterwards, whether or not the conversion succeeded.

    Args:
        page_num: Index of the page in ``reader.pages`` (0 is the first page).
        reader: PyPDF2 reader for the source document.
        file_name: Name of the source document. Only its final path component
            is used when building the temporary file name.

    Returns:
        A Markdown fragment that starts with a ``### Page N`` heading (``N`` is
        ``page_num + 1``) followed by the page's Markdown. If any step fails,
        including writing the temporary file, the heading is followed by an
        ``Error processing page: ...`` line instead of raising.
    """
    # Keep only the last path component so a name such as "data/report.pdf"
    # (or a hostile "../../x.pdf") cannot redirect the temp file into another,
    # possibly non-existent, directory.
    safe_name = os.path.basename(file_name.replace("\\", "/"))
    single_page_path = Path(f"./temp_page_{page_num}_{safe_name}")

    try:
        # Create a new PDF with a single page. Written synchronously since
        # PdfWriter doesn't support async operations; staying on the event
        # loop also avoids touching `reader` from several threads at once
        # when the caller runs many page tasks concurrently.
        writer = PdfWriter()
        writer.add_page(reader.pages[page_num])
        with open(single_page_path, "wb") as out_pdf:
            writer.write(out_pdf)

        # Check if the page contains images (blocking call, run in a thread).
        has_imgs = await asyncio.to_thread(has_images, str(single_page_path), 0)

        if has_imgs:
            vlm = VlmPipelineOptions(vlm_options=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS)
            converter = DocumentConverter(format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=vlm),
            })
        else:
            converter = DocumentConverter(format_options={
                InputFormat.PDF: PdfFormatOption(),
            })

        # Run the (blocking) conversion in a worker thread.
        result = await asyncio.to_thread(converter.convert, str(single_page_path))
        page_markdown = result.document.export_to_markdown()
        return f"\n\n### Page {page_num + 1}\n\n{page_markdown}"

    except Exception as e:
        # Per-page failures are reported inline so one bad page does not
        # abort the whole document.
        return f"\n\n### Page {page_num + 1}\n\nError processing page: {str(e)}"
    finally:
        # Clean up the single-page file; best-effort, but only filesystem
        # errors are ignored (no bare except).
        try:
            single_page_path.unlink(missing_ok=True)
        except OSError:
            pass
file = """data/RAG paper IMP read.pdf"""
    
if file:
    start = time.time()
    temp_file_path = Path(f"./temp_{file}")
    
    # Save uploaded file
    async with aiofiles.open(temp_file_path, "wb") as f:
        content = await file.read()
        await f.write(content)

    markdown_output = ""
    if file.filename.lower().endswith(".pdf"):
        reader = await asyncio.to_thread(PdfReader, str(temp_file_path))
        num_pages = len(reader.pages)
        
        # Process pages concurrently
        tasks = [
            process_single_page(page_num, reader, file.filename)
            for page_num in range(num_pages)
        ]
        results = await asyncio.gather(*tasks)
        markdown_output = "".join(results)
        
    else:
        # Non-PDF files process as before
        vlm = VlmPipelineOptions(vlm_options=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS)
        converter = DocumentConverter(format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=vlm),
            InputFormat.IMAGE: ImageFormatOption(pipeline_cls=VlmPipeline, pipeline_options=vlm),
            InputFormat.XML_JATS: XMLJatsFormatOption(),
        })
        result = await asyncio.to_thread(converter.convert, str(temp_file_path))
        markdown_output = result.document.export_to_markdown()

    end = time.time()
    await asyncio.to_thread(temp_file_path.unlink, missing_ok=True)
    total = end - start
    
    result = {
        "markdown": markdown_output.strip(),
        "time_taken": total,
        "pages_processed": num_pages if file.filename.lower().endswith(".pdf") else 1 }

In [3]:
# Display the summary dict (markdown, time_taken, pages_processed) built by the parsing cell above.
result

NameError: name 'result' is not defined