In [1]:
import os
import cv2
import torch
from pathlib import Path
from typing import Optional
from pdf2image import convert_from_path
from huggingface_hub import hf_hub_download
from doclayout_yolo import YOLOv10

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DocumentLayoutAnalyzer:
    """
    Analyzes PDF layout using DocLayout-YOLO and saves cropped elements by type.
    Props to: https://github.com/opendatalab/DocLayout-YOLO/tree/main?tab=readme-ov-file

    Each PDF page is rendered to ``<output_dir>/page_<i>.jpg``. Every region the
    model detects on that page is cropped and written to
    ``<output_dir>/<label>/page<page>_det<k>.jpg``.
    """

    def __init__(
        self,
        model_repo: str = "juliozhao/DocLayout-YOLO-DocStructBench",
        model_filename: str = "doclayout_yolo_docstructbench_imgsz1024.pt",
        output_dir: str = "layout_outputs",
        conf_threshold: float = 0.25,
        image_size: int = 1024,
    ):
        """
        Fetch the model checkpoint, load it, and prepare the output directory.

        Args:
            model_repo: Hugging Face Hub repository id holding the checkpoint.
            model_filename: Checkpoint file name inside ``model_repo``.
            output_dir: Directory that receives page images and crops; created
                (with parents) if it does not exist.
            conf_threshold: Confidence threshold forwarded to ``predict``.
            image_size: Inference image size forwarded to ``predict``.
        """
        self.device = self._select_device()
        model_path = hf_hub_download(repo_id=model_repo, filename=model_filename)
        self.model = YOLOv10(model_path)
        self.output_dir = Path(output_dir)
        self.conf = conf_threshold
        self.imgsz = image_size
        self.output_dir.mkdir(parents=True, exist_ok=True)

        print(f"[INFO] Model loaded on {self.device}")

    @staticmethod
    def _select_device() -> str:
        """Pick the inference device: CUDA first, then Apple MPS, then CPU."""
        if torch.cuda.is_available():
            return "cuda"
        # Not every torch build exposes torch.backends.mps; a missing backend
        # simply means MPS is unavailable rather than being an error.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return "mps"
        return "cpu"

    @staticmethod
    def _clip_box(xyxy, width: int, height: int) -> tuple:
        """
        Truncate box coordinates to ints and clamp them to the image bounds.

        Args:
            xyxy: Four numbers ``(x1, y1, x2, y2)``.
            width: Image width in pixels.
            height: Image height in pixels.

        Returns:
            ``(x1, y1, x2, y2)`` as ints with x in ``[0, width]`` and y in
            ``[0, height]``, safe to use as slice bounds.
        """
        x1, y1, x2, y2 = (int(value) for value in xyxy)
        return (
            min(max(x1, 0), width),
            min(max(y1, 0), height),
            min(max(x2, 0), width),
            min(max(y2, 0), height),
        )

    def analyze_pdf(self, pdf_path: str) -> None:
        """
        Analyze a PDF file and extract layout elements as image crops.

        Args:
            pdf_path: Path to the PDF file.
        """
        print(f"[INFO] Analyzing PDF: {pdf_path}")
        pages = convert_from_path(pdf_path)
        for i, page in enumerate(pages):
            # Persist the rendered page so the model and OpenCV both read the
            # same file from disk.
            image_path = self.output_dir / f"page_{i}.jpg"
            page.save(image_path)
            self._analyze_image(str(image_path), i)

    def _analyze_image(self, image_path: str, page_number: int) -> None:
        """
        Analyze a single image using the YOLOv10 layout model.

        Args:
            image_path: Path to the image file.
            page_number: Page number for output naming.

        Raises:
            RuntimeError: If the image cannot be read back for cropping.
        """
        print(f"[INFO] Processing page {page_number}...")
        results = self.model.predict(
            image_path,
            imgsz=self.imgsz,
            conf=self.conf,
            device=self.device,
        )

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise RuntimeError(f"Could not read image for cropping: {image_path}")
        height, width = image.shape[:2]

        for i, det in enumerate(results[0].boxes):
            # Clamp first: a negative coordinate would otherwise be treated as
            # a from-the-end index by NumPy slicing and yield a wrong crop.
            x1, y1, x2, y2 = self._clip_box(det.xyxy[0].tolist(), width, height)
            if x2 <= x1 or y2 <= y1:
                # An empty crop cannot be encoded; skip it so one degenerate
                # box does not abort the rest of the document.
                print(f"[WARN] Page {page_number}: skipping empty detection {i}.")
                continue

            cls_id = int(det.cls[0])
            label = self.model.model.names[cls_id]

            crop = image[y1:y2, x1:x2]

            label_dir = self.output_dir / label
            label_dir.mkdir(parents=True, exist_ok=True)

            # Keep the detection index from enumerate so file names match the
            # model's detection order even when some boxes are skipped.
            out_path = label_dir / f"page{page_number}_det{i}.jpg"
            cv2.imwrite(str(out_path), crop)

        print(f"[INFO] Page {page_number} layout elements saved.")


In [3]:
# Build the analyzer with its default checkpoint, output directory and thresholds.
# Construction fetches the checkpoint via hf_hub_download and loads the model.
analyzer = DocumentLayoutAnalyzer()


[INFO] Model loaded on mps


In [4]:
# Run layout analysis on every page of the sample PDF; page images and crops
# are written under the analyzer's output directory.
analyzer.analyze_pdf(pdf_path="../../pdfs/Test2.pdf")

[INFO] Analyzing PDF: ../../pdfs/Test2.pdf
[INFO] Processing page 0...

image 1/1 /Users/sangeeths/Develop/retrieval_augmented_generation/app/document_processing/layout_outputs/page_0.jpg: 1024x736 3 titles, 8 plain texts, 4 abandons, 488.1ms
Speed: 43.3ms preprocess, 488.1ms inference, 2864.5ms postprocess per image at shape (1, 3, 1024, 736)
[INFO] Page 0 layout elements saved.
[INFO] Processing page 1...

image 1/1 /Users/sangeeths/Develop/retrieval_augmented_generation/app/document_processing/layout_outputs/page_1.jpg: 1024x736 2 titles, 10 plain texts, 2 abandons, 44.4ms
Speed: 22.1ms preprocess, 44.4ms inference, 10.0ms postprocess per image at shape (1, 3, 1024, 736)
[INFO] Page 1 layout elements saved.
[INFO] Processing page 2...

image 1/1 /Users/sangeeths/Develop/retrieval_augmented_generation/app/document_processing/layout_outputs/page_2.jpg: 1024x736 1 title, 8 plain texts, 2 abandons, 1 figure, 1 figure_caption, 2 isolate_formulas, 33.9ms
Speed: 3.9ms preprocess, 33.9ms in