# Dolphin PDF Parsing — Tier 1 (Layout Detection)
## Using Dolphin-v2 (3B) on Colab T4 GPU.

In [None]:
# 1. Clone the Dolphin repository into the current working directory.
#    (Dependencies are installed by the next cell, not this one.)
# NOTE(review): re-running this cell after a successful clone will make git
# complain that the destination directory already exists.
!git clone https://github.com/bytedance/Dolphin.git

In [None]:
# Switch into the cloned repo: later cells use paths relative to it
# (./hf_model, ./results, and the repo's `utils` package).
# NOTE(review): the path is relative, so re-running this cell while already
# inside the repo will try to enter Dolphin/Dolphin.
%cd Dolphin
# Model/inference stack, pinned to exact versions.
!pip install -q torch==2.6.0 torchvision==0.21.0 transformers==4.51.0 accelerate==1.4.0
# Data/metric helpers plus PyMuPDF (imported as `fitz` in the cells below).
!pip install -q datasets==3.6.0 Levenshtein==0.27.1 albumentations==1.4.0 pymupdf==1.26
# Unpinned helpers; `qwen_vl_utils` provides `process_vision_info` used by the model wrapper.
!pip install -q qwen_vl_utils matplotlib jieba opencv-python bs4
# Video decoding backends, pinned.
!pip install -q torchcodec==0.2 decord==0.6.0

In [None]:
# 2. Download Dolphin-v2 model (~6GB)
# The weights are stored in ./hf_model, which is the path the layout-detection
# cell passes to the model wrapper.
!huggingface-cli download ByteDance/Dolphin-v2 --local-dir ./hf_model

In [12]:
# 3. Locate the PDF and extract first 9 pages
import fitz
import glob
import os

# Name of the trimmed PDF this cell produces; later cells read it via `pdf_name`.
pdf_name = "first_9_pages.pdf"
MAX_PAGES = 9

# Candidate locations, tried in order:
#   1. the specific upload paths mentioned by the user;
#   2. any PDF directly inside the current or parent directory
#      (these globs are NOT recursive).
possible_paths = [
    "../colab_ocr/2512.24601v2.pdf",  # Relative to /content/Dolphin
    "/content/colab_ocr/2512.24601v2.pdf",
    "./*.pdf",
    "../*.pdf",
]

original_pdf_name = None
for path_pattern in possible_paths:
    # Skip this cell's own output from a previous run, and sort so the choice
    # is deterministic (glob returns entries in arbitrary order).
    matches = sorted(f for f in glob.glob(path_pattern) if pdf_name not in f)
    if matches:
        original_pdf_name = matches[0]
        break

if not original_pdf_name or not os.path.exists(original_pdf_name):
    raise FileNotFoundError("Could not find your PDF. Please ensure it is uploaded and the path is correct.")

print(f"Using file: {original_pdf_name}")

# Extract the first pages. The context managers close both documents even if
# copying or saving fails part-way through.
with fitz.open(original_pdf_name) as doc, fitz.open() as new_doc:
    num_pages = min(MAX_PAGES, len(doc))
    if num_pages == 0:
        raise ValueError(f"{original_pdf_name} contains no pages to extract.")

    print(f"Original PDF has {len(doc)} pages, extracting first {num_pages} pages...")

    # Copy the whole page range in a single call instead of one call per page.
    new_doc.insert_pdf(doc, from_page=0, to_page=num_pages - 1)
    new_doc.save(f"./{pdf_name}")

print(f"Created: ./{pdf_name} with {num_pages} pages")

Please upload your PDF file...


KeyboardInterrupt: 

In [None]:
# 4. Run Tier 1 — Layout Detection (Optimized for Colab T4)
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info
from PIL import Image
import os
import sys

# Add Dolphin repo to path to import utils
sys.path.append("./Dolphin")
from utils.utils import *

class DOLPHIN_OPTIMIZED:
    """Wrapper around the Dolphin model, loaded in half precision for a Colab T4.

    The weights are loaded through ``Qwen2_5_VLForConditionalGeneration`` as
    ``float16`` with ``device_map="auto"``. No 4-bit/8-bit quantization is
    applied by this class.
    """

    def __init__(self, model_id_or_path):
        """Load the processor, model and tokenizer.

        Args:
            model_id_or_path: Hugging Face model id or local directory,
                forwarded unchanged to both ``from_pretrained`` calls.
        """
        self.processor = AutoProcessor.from_pretrained(model_id_or_path)

        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id_or_path,
            torch_dtype=torch.float16,  # half precision instead of float32
            device_map="auto",          # let the loader decide device placement
            low_cpu_mem_usage=True,     # reduce CPU RAM used while loading
        )
        self.model.eval()

        # Left padding so that, in a padded batch, every prompt ends right
        # where generation starts.
        self.tokenizer = self.processor.tokenizer
        self.tokenizer.padding_side = "left"

    def chat(self, prompt, image):
        """Run the model on one image, or on a batch of images.

        Args:
            prompt: Prompt string. When ``image`` is a list/tuple this may also
                be a list/tuple with one prompt per image; a single string is
                reused for every image.
            image: A single image, or a list/tuple of images, each accepted by
                ``resize_img``.

        Returns:
            The decoded answer as a string for a single image, or a list of
            strings (one per image, in order) for a batch.

        Raises:
            ValueError: If a list of prompts does not match the number of images.
        """
        is_batch = isinstance(image, (list, tuple))
        if is_batch:
            images = list(image)
            if isinstance(prompt, (list, tuple)):
                prompts = list(prompt)
            else:
                prompts = [prompt] * len(images)
            if len(prompts) != len(images):
                raise ValueError(
                    f"Got {len(prompts)} prompts for {len(images)} images."
                )
            if not images:
                return []
        else:
            images = [image]
            prompts = [prompt]

        processed_images = [resize_img(img) for img in images]

        # One single-turn conversation (image + question) per input.
        all_messages = [
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": img},
                        {"type": "text", "text": question},
                    ],
                }
            ]
            for img, question in zip(processed_images, prompts)
        ]

        texts = [
            self.processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
            for msgs in all_messages
        ]

        all_image_inputs = []
        for msgs in all_messages:
            image_inputs, _ = process_vision_info(msgs)
            all_image_inputs.extend(image_inputs)

        inputs = self.processor(
            text=texts,
            images=all_image_inputs,
            padding=True,
            return_tensors="pt",
        ).to(self.model.device)

        generated_ids = self.model.generate(
            **inputs,
            max_new_tokens=4096,
            do_sample=False,  # greedy decoding
        )

        # generate() returns prompt + continuation; keep only the continuation.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        results = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return results if is_batch else results[0]

# Run the processing
print("Loading Optimized Model...")
model = DOLPHIN_OPTIMIZED("./hf_model")
save_dir = "./results"

import sys
sys.path.append("./Dolphin")
from utils.utils import setup_output_dirs, save_outputs, parse_layout_string, process_coordinates

# Create all necessary subdirectories (output_json, layout_visualization, etc.)
setup_output_dirs(save_dir)

import fitz
from PIL import Image
print(f"Processing {pdf_name}...")

# The context manager closes the PDF even if a page fails mid-run.
with fitz.open(pdf_name) as doc:
    total_pages = len(doc)

    for page_number, page in enumerate(doc, start=1):
        print(f"Page {page_number}/{total_pages}")

        # Render the page with PyMuPDF's default settings and wrap the raw
        # RGB samples in a PIL image for the model.
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Same flow as the repo's single-page layout routine, but driven
        # manually so that our model wrapper and save directory are used.
        print("Parsing layout...")
        layout_results = model.chat("Parse the reading order of this document.", img)

        # Parse the raw model output (reusing utils from the repo).
        layout_list = parse_layout_string(layout_results)
        if not layout_list:
            # Nothing parseable: fall back to one full-page region.
            layout_list = [([0, 0, *img.size], 'distorted', [])]

        res = []
        for idx, (bbox, label, tags) in enumerate(layout_list):
            x1, y1, x2, y2 = process_coordinates(bbox, img)
            res.append({
                "label": label,
                "bbox": [x1, y1, x2, y2],
                "reading_order": idx,
                "tags": tags,
            })

        save_outputs(res, img, f"page_{page_number:03d}", save_dir)

print("Done!")

In [None]:
# 5. Display layout visualization
# IPython's Image is imported under an alias: notebook cells share one
# namespace, and a plain `Image` here would replace the PIL `Image` that the
# layout-detection cell relies on.
from IPython.display import Image as IPyImage, display
import glob

# Sorted so pages are shown in page order (file names are zero-padded).
for img_path in sorted(glob.glob("./results/layout_visualization/*.png")):
    print(img_path)
    display(IPyImage(filename=img_path, width=800))

In [None]:
# 6. Inspect JSON output
import json
import glob

json_paths = sorted(glob.glob("./results/output_json/*.json"))
if not json_paths:
    # Make an empty result directory visible instead of printing nothing.
    print("No JSON files found in ./results/output_json/ - run the layout detection cell first.")

for json_path in json_paths:
    print(f"\n--- {json_path} ---")
    # Decode explicitly as UTF-8 rather than relying on the locale default.
    with open(json_path, encoding="utf-8") as f:
        data = json.load(f)
    # One line per detected element: reading order, label, bounding box.
    for elem in data:
        print(f"  [{elem['reading_order']}] {elem['label']:>8s}  bbox={elem['bbox']}")