In [10]:
# Install necessary libraries
!pip install docling
!pip install paddleocr
!pip install pymupdf  # for fitz dependency

from pathlib import Path

import fitz  # PyMuPDF
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from paddleocr import PaddleOCR

# Input document for the whole pipeline — point this at your own PDF.
pdf_path = "/home/Redacted.pdf"

# One shared PaddleOCR engine (lang="en", use_angle_cls=True), built once and reused for every page.
ocr = PaddleOCR(lang="en", use_angle_cls=True)

def convert_pdf_to_images(pdf_path, output_dir="/tmp", dpi=None):
    """
    Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        output_dir: Directory that receives the ``page_<n>.png`` files. It is
            created if missing. Defaults to ``/tmp`` (the original behavior).
        dpi: Optional render resolution forwarded to ``Page.get_pixmap``.
            ``None`` keeps PyMuPDF's default resolution.

    Returns:
        List of image paths (strings), one per page, in page order.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Only pass dpi when the caller asked for it, so the default call is
    # exactly the plain get_pixmap() used before.
    render_kwargs = {} if dpi is None else {"dpi": dpi}

    images = []
    # The context manager closes the document even if rendering fails;
    # previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(**render_kwargs)
            # File names are 1-based to match human page numbering.
            img_path = str(target_dir / f"page_{page_num + 1}.png")
            pix.save(img_path)
            images.append(img_path)
    return images

def extract_text_with_paddleocr_from_pdf(pdf_path):
    """
    Use PaddleOCR to extract text from a PDF by first converting it to images.

    Args:
        pdf_path: Path of the PDF; it is rendered to one image per page via
            ``convert_pdf_to_images``.

    Returns:
        The recognized text lines of all pages joined with newlines. Pages on
        which nothing was recognized contribute no lines; the result is an
        empty string when no page yields text.
    """
    image_paths = convert_pdf_to_images(pdf_path)
    extracted_text = []
    for image_path in image_paths:
        results = ocr.ocr(image_path)
        # The per-image entry is None (or empty) when no text was detected;
        # iterating it unguarded raised TypeError on blank pages.
        page_lines = results[0] if results else None
        if not page_lines:
            continue
        # Each line is indexed as line[1][0] to obtain its recognized text.
        extracted_text.extend(line[1][0] for line in page_lines)
    return "\n".join(extracted_text)

def process_text_with_docling(extracted_text):
    """
    Use Docling to process extracted text for further processing (markdown export).

    The text is written to a temporary file, converted with Docling's
    ``DocumentConverter``, and the markdown export of the result is printed.

    Args:
        extracted_text: Text produced by the OCR step.

    Returns:
        The markdown string that was printed, or ``None`` when the converter
        raised ``StopIteration``.
    """
    # Docling selects its backend from the input file type. Its documented
    # input formats include Markdown but not plain ".txt", so the OCR text is
    # stored with an ".md" suffix to be accepted by the converter.
    temp_text_file = Path("/tmp/extracted_text.md")
    # Explicit UTF-8 so non-ASCII OCR output does not depend on the locale.
    temp_text_file.write_text(extracted_text, encoding="utf-8")

    # Skip OCR in Docling's PDF pipeline; these options only apply to PDF inputs.
    pipeline_options = PdfPipelineOptions(do_ocr=False)

    # format_options maps each input format to a format-option object, so the
    # pipeline options are wrapped in PdfFormatOption rather than passed raw.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    try:
        doc = converter.convert(temp_text_file).document
        md = doc.export_to_markdown()
    except StopIteration:
        # NOTE(review): presumably a leftover from an API where convert()
        # yielded results lazily — confirm whether this can still trigger.
        print("Conversion failed: No document returned from the converter.")
        return None

    print(md)
    return md

def main():
    """Run the OCR-then-Docling pipeline on the module-level ``pdf_path``."""
    # Stage 1: render the PDF and OCR it, then echo the raw text.
    ocr_text = extract_text_with_paddleocr_from_pdf(pdf_path)
    print("Extracted Text:\n", ocr_text)

    # Stage 2: pass the OCR text to Docling.
    process_text_with_docling(ocr_text)

# Run the pipeline: OCR the PDF at pdf_path, then pass the text to Docling.
main()


[0m[2024/11/18 10:27:25] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, r

In [6]:
!pip install pymupdf


Collecting pymupdf
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 19.8/19.8 MB 14.9 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.24.13
[0m