In [13]:
import os
import cv2
import torch
from pathlib import Path
from typing import Optional
from pdf2image import convert_from_path
from huggingface_hub import hf_hub_download
from doclayout_yolo import YOLOv10


In [14]:
class DocumentLayoutAnalyzer:
    """
    Analyzes PDF layout using DocLayout-YOLO and saves cropped elements by type.

    Every page of the PDF is rendered to ``page_<N>.jpg`` inside a per-document
    folder, passed through the layout model, and each detected region is written
    as ``<label>/page<N>_det<i>.jpg`` inside that same folder.

    Credit: https://github.com/opendatalab/DocLayout-YOLO/tree/main?tab=readme-ov-file
    """

    def __init__(
        self,
        model_repo: str = "juliozhao/DocLayout-YOLO-DocStructBench",
        model_filename: str = "doclayout_yolo_docstructbench_imgsz1024.pt",
        conf_threshold: float = 0.25,
        output_dir: Optional[Path] = None,
        image_size: int = 1024,
    ):
        """
        Download the model weights, load the model and prepare the output folder.

        Args:
            model_repo: Hugging Face repository id holding the weights.
            model_filename: Name of the weights file inside that repository.
            conf_threshold: Confidence threshold forwarded to ``predict``.
            output_dir: Base folder for all outputs. When omitted, it defaults to
                ``<project root>/storage/layout_outputs`` (see below).
            image_size: Inference image size forwarded to ``predict``.
        """
        # Prefer CUDA, then Apple MPS, then CPU. `torch.backends.mps` is looked up
        # defensively because it is not present in every torch build.
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            mps_backend = getattr(torch.backends, "mps", None)
            if mps_backend is not None and mps_backend.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"

        model_path = hf_hub_download(repo_id=model_repo, filename=model_filename)
        self.model = YOLOv10(model_path)
        self.conf = conf_threshold
        self.imgsz = image_size

        # Only needed for testing in a Jupyter notebook: `__file__` is undefined
        # there, so fall back to the current working directory.
        if output_dir is None:
            try:
                project_root = Path(__file__).resolve().parent.parent
            except NameError:
                project_root = Path.cwd().resolve().parent.parent
            output_dir = project_root / "storage" / "layout_outputs"

        # Accept plain strings as well as Path objects.
        self.base_output_dir = Path(output_dir)
        self.base_output_dir.mkdir(parents=True, exist_ok=True)

        # In the project-style structure, use the configured storage folder instead:
        #
        #     from app.core.config import STORAGE_DIR
        #
        #     self.base_output_dir = Path(STORAGE_DIR) / "layout_outputs"
        #     self.base_output_dir.mkdir(parents=True, exist_ok=True)

        print(f"[INFO] Model loaded on {self.device}")

    def analyze_pdf(self, pdf_path: str):
        """
        Analyze a PDF file and extract layout elements as image crops.

        Args:
            pdf_path: Path to the PDF file.
        """
        pdf_path = Path(pdf_path)
        doc_name = pdf_path.stem
        doc_output_dir = self.base_output_dir / doc_name
        doc_output_dir.mkdir(parents=True, exist_ok=True)

        print(f"[INFO] Analyzing PDF: {pdf_path.name}")
        pages = convert_from_path(pdf_path)
        for i, page in enumerate(pages):
            # The rendered page is written to disk first because the model and the
            # cropping step both read it back from this path.
            image_path = doc_output_dir / f"page_{i}.jpg"
            page.save(image_path)
            self._analyze_image(str(image_path), i, doc_output_dir)

    @staticmethod
    def _clip_box(xyxy, width: int, height: int) -> Optional[tuple]:
        """
        Convert a raw ``[x1, y1, x2, y2]`` box to integer pixel bounds inside the image.

        Coordinates are truncated with ``int`` and then clamped to
        ``[0, width]`` / ``[0, height]``. Clamping matters because a negative
        index would make the numpy slice wrap around and crop the wrong region.

        Returns:
            ``(x1, y1, x2, y2)``, or ``None`` when the clamped box has no area.
        """
        x1, y1, x2, y2 = (int(value) for value in xyxy)
        x1 = min(max(x1, 0), width)
        x2 = min(max(x2, 0), width)
        y1 = min(max(y1, 0), height)
        y2 = min(max(y2, 0), height)
        if x2 <= x1 or y2 <= y1:
            return None
        return x1, y1, x2, y2

    def _analyze_image(self, image_path: str, page_number: int, doc_output_dir: Path):
        """
        Analyze a single image using the YOLOv10 layout model.

        Args:
            image_path: Path to the image file.
            page_number: Page number for output naming.
            doc_output_dir: Folder to store outputs for this document.

        Raises:
            RuntimeError: If the image cannot be read back for cropping.
        """
        print(f"[INFO] Processing page {page_number}...")
        results = self.model.predict(
            image_path,
            imgsz=self.imgsz,
            conf=self.conf,
            device=self.device,
        )

        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(image_path)
        if image is None:
            raise RuntimeError(f"Could not read image for cropping: {image_path}")
        height, width = image.shape[:2]

        for i, det in enumerate(results[0].boxes):
            cls_id = int(det.cls[0])
            label = self.model.model.names[cls_id]

            box = self._clip_box(det.xyxy[0].tolist(), width, height)
            if box is None:
                # An empty crop cannot be encoded as an image, so skip it.
                print(f"[WARN] Page {page_number}: skipped empty detection {i} ({label}).")
                continue
            x1, y1, x2, y2 = box
            crop = image[y1:y2, x1:x2]

            label_dir = doc_output_dir / label
            label_dir.mkdir(parents=True, exist_ok=True)

            # The detection index is kept in the name even when earlier boxes were
            # skipped, so file names stay aligned with the model's output order.
            out_path = label_dir / f"page{page_number}_det{i}.jpg"
            if not cv2.imwrite(str(out_path), crop):
                print(f"[WARN] Page {page_number}: could not write {out_path}.")

        print(f"[INFO] Page {page_number} layout elements saved.")


In [8]:
# Build the analyzer with its default settings. The constructor fetches the model
# weights via hf_hub_download, creates the output folder and prints the chosen device.
analyzer = DocumentLayoutAnalyzer()


[INFO] Model loaded on cpu


In [10]:
# Run the layout analysis on a sample PDF. The path is relative, so it is resolved
# against the kernel's current working directory.
analyzer.analyze_pdf("../../pdfs/6TypesOfRAG.pdf")

[INFO] Analyzing PDF: 6TypesOfRAG.pdf
[INFO] Processing page 0...

image 1/1 C:\Repos\Private\SharedProjects\LLM_(cds123)\retrieval_augmented_generation\storage\layout_outputs\6TypesOfRAG\page_0.jpg: 1024x736 1 title, 1 plain text, 5 abandons, 2 figures, 1214.6ms
Speed: 9.9ms preprocess, 1214.6ms inference, 3.5ms postprocess per image at shape (1, 3, 1024, 736)
[INFO] Page 0 layout elements saved.
[INFO] Processing page 1...

image 1/1 C:\Repos\Private\SharedProjects\LLM_(cds123)\retrieval_augmented_generation\storage\layout_outputs\6TypesOfRAG\page_1.jpg: 1024x736 2 titles, 3 plain texts, 4 abandons, 1150.1ms
Speed: 8.0ms preprocess, 1150.1ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 736)
[INFO] Page 1 layout elements saved.
[INFO] Processing page 2...

image 1/1 C:\Repos\Private\SharedProjects\LLM_(cds123)\retrieval_augmented_generation\storage\layout_outputs\6TypesOfRAG\page_2.jpg: 1024x736 2 titles, 6 plain texts, 4 abandons, 1 figure, 1144.1ms
Speed: 7.5ms prepr