In [1]:
!pip install numpy
!pip install pymupdf



In [2]:
!pip install tqdm



In [3]:
from dataclasses import dataclass, asdict
from typing import List, Dict
import json, pathlib, statistics
import sys, fitz                               # PyMuPDF
import re
import numpy as np
from collections import Counter
from tqdm import tqdm



In [4]:
@dataclass
class Span:
    text: str
    size: float
    x0: float; y0: float; x1: float; y1: float

@dataclass
class Heading:
    level: str      # "Title" | "H1" | "H2" | "H3"
    text:  str
    page:  int      # 0-based


In [5]:
def save_outline(headings: List[Heading], out_path: pathlib.Path):
    if not headings:
        raise ValueError("No headings found")
    payload = {
        "title": headings[0].text,
        "outline": [asdict(h) for h in headings[1:]]   # skip first (title)
    }
    out_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False))
    print(f" Outline written to → {out_path.resolve()}")


In [6]:
import statistics, math, re, fitz
SPACE_THRESHOLD = 0.35          # fraction of font-size that still counts as “no space”

def collect_lines(doc, max_pages=100, y_tol=2.0):
    """
    Returns list[page] → list[dict(text,size,y0,y1,....)]
    1. Groups spans that share (almost) the same baseline.
    2. Joins neighbouring spans without inserting a space when the
       horizontal gap is smaller than SPACE_THRESHOLD × font-size.
    """
    pages_lines = []
    for p in range(min(len(doc), max_pages)):
        page = doc.load_page(p)
        h    = page.rect.height
        # -- raw spans --------------------------------------------------
        spans = []
        for blk in page.get_text("dict")["blocks"]:
            if blk["type"] != 0:
                continue
            for ln in blk["lines"]:
                for sp in ln["spans"]:
                    s = sp["text"]
                    if s.strip():
                        spans.append(dict(text=s,
                                          size=sp["size"],
                                          x0=sp["bbox"][0],
                                          x1=sp["bbox"][2],
                                          y0=sp["bbox"][1]))
        # -- sort by baseline, left→right --------------------------------
        spans.sort(key=lambda r: (round(r["y0"]/y_tol)*y_tol, r["x0"]))
        # -- group by baseline -------------------------------------------
        groups, cur, cur_y = [], [], None
        for r in spans:
            if cur_y is None or abs(r["y0"]-cur_y) <= y_tol:
                cur.append(r); cur_y = r["y0"]
            else:
                groups.append(cur); cur = [r]; cur_y = r["y0"]
        if cur:
            groups.append(cur)
        # -- build full line string --------------------------------------
        page_lines = []
        for g in groups:
            g.sort(key=lambda r: r["x0"])
            pieces = [g[0]["text"]]
            for prev, nxt in zip(g, g[1:]):
                gap = nxt["x0"] - prev["x1"]
                mean_sz = (prev["size"] + nxt["size"]) / 2
                if gap > SPACE_THRESHOLD * mean_sz:
                    pieces.append(" ")
                pieces.append(nxt["text"])
            txt = "".join(pieces).strip()
            if not txt:
                continue
            page_lines.append(dict(text=txt,
                                   size=statistics.mean(r["size"] for r in g),
                                   y0=min(r["y0"] for r in g)))
        pages_lines.append(page_lines)
    return pages_lines


In [7]:
_URL   = re.compile(r"(https?://|www\.)\S+", re.I)
_EMAIL = re.compile(r"\b\S+@\S+\.\S+\b")
_PHONE = re.compile(r"\b\d[\d\s\-]{7,}\d\b")
_DASH  = re.compile(r"[-_]{4,}")

def is_sentence_like(t: str) -> bool:
    if _URL.search(t) or _EMAIL.search(t) or _PHONE.search(t):
        return False
    if _DASH.fullmatch(t.strip("-_ ")):
        return False
    if len(t) < 3:
        return False
    alpha = sum(c.isalpha() for c in t) / len(t)
    return alpha >= 0.40


In [8]:
def decide_levels(pages_lines, keep_h4=True):
    # ­-- body font ----------------------------------------------------------------
    body_sz = statistics.median(l["size"] for pg in pages_lines for l in pg)

    def cat(sz):
        r = sz / body_sz
        if   r >= 1.8:  return "H1"
        elif r >= 1.45: return "H2"
        elif r >= 1.20: return "H3"
        elif r >= 1.05 and keep_h4: return "H4"
        return None

    headings = []

    # ­-- pick “Title” only if the largest line is in the top 33 % of page-0 -------
    top_lines = [l for l in pages_lines[0] if l["y0"] < 0.33 * 792]  # 792 = US-letter-pts
    if top_lines:
        title_line = max(top_lines, key=lambda l: l["size"])
        if is_sentence_like(title_line["text"]):
            headings.append(Heading("Title", title_line["text"], 0))

    # ­-- all headings -------------------------------------------------------------
    for pi, lines in enumerate(pages_lines):
        for l in lines:
            if not is_sentence_like(l["text"]):                  continue
            lvl = cat(l["size"])
            if not lvl or len(l["text"].split()) > 40:           continue
            if headings and headings[-1].text == l["text"]:      continue
            headings.append(Heading(lvl, l["text"], pi))
    return headings


In [9]:
from dataclasses import asdict

def extract_outline(pdf_path: str, max_pages: int = 50) -> Dict:
    doc         = fitz.open(pdf_path)
    pages_lines = collect_lines(doc, max_pages)
    heads       = decide_levels(pages_lines)

    # No headings at all?
    if not heads:
        return {"title": "", "outline": []}

    # Case A: exactly one Heading, and it's marked as Title
    if len(heads) == 1 and heads[0].level == "Title":
        single = heads[0]
        return {
            "title": "",
            "outline": [
                {"level": "H1", "text": single.text, "page": single.page}
            ]
        }

    # Case B: first element is a real Title, rest are outline
    if heads[0].level == "Title":
        title_text = heads[0].text
        items = [asdict(h) for h in heads[1:]]
    else:
        # No Title—everything is outline
        title_text = ""
        items = [asdict(h) for h in heads]

    return {"title": title_text, "outline": items}


In [11]:
pdf_file = "/Users/tanmayrath/Downloads/Learn Acrobat - Share_1.pdf"
outline  = extract_outline(pdf_file)
print(json.dumps(outline, indent=2, ensure_ascii=False))


{
  "title": "",
  "outline": [
    {
      "level": "H1",
      "text": "Share",
      "page": 1
    },
    {
      "level": "H3",
      "text": "PDF sharing options",
      "page": 1
    },
    {
      "level": "H3",
      "text": "Send an anonymous or public link in an email",
      "page": 3
    },
    {
      "level": "H3",
      "text": "Mark up text",
      "page": 5
    },
    {
      "level": "H1",
      "text": "Highlight, strikethrough, or underline text",
      "page": 6
    },
    {
      "level": "H1",
      "text": "comment",
      "page": 6
    },
    {
      "level": "H3",
      "text": "Using Select Text Tool",
      "page": 6
    },
    {
      "level": "H3",
      "text": "Using Markup Tool",
      "page": 6
    },
    {
      "level": "H3",
      "text": "Change the color of the markup",
      "page": 7
    },
    {
      "level": "H3",
      "text": "Delete The Markup",
      "page": 7
    },
    {
      "level": "H3",
      "text": "Add a comment to the markup",


**The section from down here is not important pls ignore.**


In [8]:
!pip install -q PyMuPDF==1.24.1 pdf2image==1.17.0 Pillow tqdm numpy

# (optional, later) pip install -q torch transformers datasets

import fitz  # PyMuPDF
import statistics, pathlib, json
import numpy as np
from dataclasses import dataclass, asdict
from typing import List, Dict
from tqdm import tqdm


In [9]:
@dataclass
class Line:
    text: str
    size: float
    y0: float

@dataclass
class Heading:
    level: str   # Title | H1 | H2 | H3 | H4...
    text: str
    page: int    # 1-based

def save_outline(outline: Dict, pdf_path: str):
    out_path = pathlib.Path(pdf_path).with_suffix("_outline.json")
    out_path.write_text(json.dumps(outline, indent=2, ensure_ascii=False))
    print("✅  Saved:", out_path)


In [10]:
def collect_lines(doc: fitz.Document, max_pages: int = 50, y_tol: float = 1.5):
    pages = []
    for p in range(min(len(doc), max_pages)):
        raw = []
        page = doc.load_page(p)
        for blk in page.get_text("dict")["blocks"]:
            if blk["type"] != 0: continue
            for ln in blk["lines"]:
                for sp in ln["spans"]:
                    t = sp["text"].strip()
                    if not t: continue
                    raw.append((sp["bbox"][1], t, sp["size"]))
        # sort by baseline y0 (top→down)
        raw.sort(key=lambda x: x[0])
        # group into lines
        lines = []
        curr_y = None
        curr_txt = []
        curr_szs = []
        for y0, txt, sz in raw:
            if curr_y is None or abs(y0 - curr_y) <= y_tol:
                curr_txt.append(txt)
                curr_szs.append(sz)
                curr_y = y0
            else:
                lines.append(Line(
                    text=" ".join(curr_txt),
                    size=statistics.mean(curr_szs),
                    y0=curr_y
                ))
                curr_txt = [txt]
                curr_szs = [sz]
                curr_y = y0
        if curr_txt:
            lines.append(Line(text=" ".join(curr_txt),
                              size=statistics.mean(curr_szs),
                              y0=curr_y))
        pages.append(lines)
    return pages


In [11]:
def extract_outline(pdf_path: str, max_pages: int = 50) -> Dict:
    doc = fitz.open(pdf_path)
    pages = collect_lines(doc, max_pages)

    # 1) Extract multi-line Title from page 1
    first = pages[0]
    if not first:
        raise ValueError("Page 1 empty")
    max_size = max(ln.size for ln in first)
    # take all lines ≥ 90% of max_size
    title_lines = [ln for ln in first if ln.size >= 0.9 * max_size]
    # sort by y0 (top→down) and join
    title_lines.sort(key=lambda ln: ln.y0)
    title_text = " ".join(ln.text for ln in title_lines).strip()

    # remove them from the page so they don't reappear as H1
    y_set = {ln.y0 for ln in title_lines}
    pages[0] = [ln for ln in first if ln.y0 not in y_set]

    # 2) Build size ratios relative to BODY
    all_sizes = [ln.size for pg in pages for ln in pg]
    body_sz   = statistics.median(all_sizes)

    def level_of(sz: float):
        r = sz / body_sz
        if r >= 1.8:  return "H1"
        if r >= 1.45: return "H2"
        if r >= 1.20: return "H3"
        if r >= 1.05: return "H4"
        return None

    # 3) Collect headings
    raw = []
    for pi, pg in enumerate(pages):
        for ln in pg:
            lvl = level_of(ln.size)
            if not lvl: continue
            # drop very long lines
            if len(ln.text.split()) > 30: continue
            raw.append(Heading(level=lvl, text=ln.text.strip(), page=pi+1))

    # 4) Merge consecutive same-level headings
    merged = []
    for h in raw:
        if merged and merged[-1].level == h.level and merged[-1].page == h.page:
            # append text
            merged[-1].text += " " + h.text
        else:
            merged.append(h)

    # 5) Deduplicate exact repeats
    final = []
    seen = set()
    for h in merged:
        key = (h.level, h.text, h.page)
        if key in seen: continue
        seen.add(key)
        final.append(h)

    # 6) Assemble output
    outline = {
      "title": title_text,
      "outline": [asdict(h) for h in final]
    }
    return outline


In [None]:
def save_outline(outline: Dict, pdf_path: str):
    pdf_path = pathlib.Path(pdf_path).expanduser()
    out_path = pdf_path.parent / f"{pdf_path.stem}_outline.json"   # ← fixed
    out_path.write_text(json.dumps(outline, indent=2, ensure_ascii=False))
    print("Saved:", out_path)
result = extract_outline("/Users/tanmayrath/Downloads/file03.pdf")
print(json.dumps(result, indent=2, ensure_ascii=False))
save_outline(result, "/Users/tanmayrath/Downloads/file03.pdf")


{
  "title": "RFP: R RFP: Re e equest f quest f quest fo o or Pr r Pr r Proposal oposal oposal RFP: R RFP: R e quest f o r Pr oposal",
  "outline": [
    {
      "level": "H3",
      "text": "Ontario’s Libraries Working T ogether",
      "page": 1
    },
    {
      "level": "H1",
      "text": "T o Present a Proposal for Developing the Business Plan for the Ontario Digital Library March 21, 2003",
      "page": 1
    },
    {
      "level": "H1",
      "text": "Ontario’s Digital Library",
      "page": 2
    },
    {
      "level": "H3",
      "text": "A Critical Component for Implementing Ontario’s Road Map to Prosperity Strategy",
      "page": 2
    },
    {
      "level": "H4",
      "text": "Summary",
      "page": 2
    },
    {
      "level": "H4",
      "text": "Background",
      "page": 3
    },
    {
      "level": "H4",
      "text": "The Business Plan to be Developed",
      "page": 6
    },
    {
      "level": "H4",
      "text": "Milestones Approach and Specific Propos

In [15]:
from IPython.display import JSON, display
display(JSON(result))


<IPython.core.display.JSON object>

In [16]:
with open("full_outline.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)


In [17]:
import os

data_dir = "/Volumes/Extreme SSD/docbank_testing_data_gpu"

files = os.listdir(data_dir)
print(files[:10])  # just print first 10 to inspect

['.DS_Store', '._.DS_Store', 'images', '._images', 'annotations', '._annotations']


In [18]:
annotation_path = os.path.join(data_dir, "annotations")
ann_files = os.listdir(annotation_path)
print(ann_files[:5])


['20.tar_1502.01891.gz_paper_16.json', '._20.tar_1502.01891.gz_paper_16.json', '2.tar_1701.00641.gz_StringMath16_arXiv_v2_24.json', '._2.tar_1701.00641.gz_StringMath16_arXiv_v2_24.json', '195.tar_1709.00813.gz_Satisfaction_Feature_Reduction_Paper_Presubmit_7.json']


In [19]:
import json
import os

json_file = os.path.join(annotation_path, '20.tar_1502.01891.gz_paper_16.json')

with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Print first few keys/entries
print(type(data))
print(data.keys() if isinstance(data, dict) else data[:2])

<class 'list'>
[{'text': 'Proof.', 'x0': 117, 'y0': 93, 'x1': 167, 'y1': 108, 'r': 0, 'g': 0, 'b': 0, 'font_name': 'JSMVHV+CMTI12', 'label': 'paragraph', 'box': [117, 93, 167, 108]}, {'text': 'Assume,', 'x0': 177, 'y0': 93, 'x1': 247, 'y1': 108, 'r': 0, 'g': 0, 'b': 0, 'font_name': 'BBNPKH+CMR12', 'label': 'paragraph', 'box': [177, 93, 247, 108]}]


In [20]:
import pandas as pd
import json
import os

file_path = os.path.join(annotation_path, '20.tar_1502.01891.gz_paper_16.json')

with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

# Add bbox as columns if needed
if "box" in df.columns:
    df[["x1", "y1", "x2", "y2"]] = pd.DataFrame(df["box"].tolist(), index=df.index)

# Preview
print(df[["text", "label", "x1", "y1", "x2", "y2"]].head())

             text      label   x1  y1   x2   y2
0          Proof.  paragraph  117  93  167  108
1         Assume,  paragraph  177  93  247  108
2              by  paragraph  252  93  272  108
3  contradiction,  paragraph  276  93  391  108
4            that  paragraph  396  93  431  108


In [21]:
image_path = os.path.join(data_dir, "images")
ann_files = os.listdir(image_path)
print(ann_files[:5])

['197.tar_1709.01863.gz_On_the_geometry_133_ori.jpg', '._197.tar_1709.01863.gz_On_the_geometry_133_ori.jpg', '2.tar_1601.00582.gz_extrema_logrv_jan16_13_ori.jpg', '._2.tar_1601.00582.gz_extrema_logrv_jan16_13_ori.jpg', '20.tar_1602.01139.gz_jdcgs-tcom15_V15_SJ_4_ori.jpg']


In [22]:
import os
from PIL import Image

image_dir = os.path.join(data_dir, "images")
image_files = [f for f in os.listdir(image_dir) if f.endswith(".jpg")]

# Load one image
img_path = os.path.join(image_dir, image_files[0])
img = Image.open(img_path)
img.show()  # Will preview the image in a window (Mac Preview)

In [2]:
# pdf_to_images_pymupdf.py
# pip install pymupdf pillow tqdm

import fitz                     # PyMuPDF
from pathlib import Path
from tqdm import tqdm
from PIL import Image           # just for optional re-save to PNG/JPEG

def pdf_to_images(pdf_path: str | Path,
                  out_dir: str | Path = "pages",
                  dpi: int = 200,
                  fmt: str = "png") -> None:
    pdf_path = Path(pdf_path).expanduser()
    out_dir  = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(pdf_path)
    zoom = dpi / 72               # 72 dpi is the native PDF resolution
    matrix = fitz.Matrix(zoom, zoom)

    for page_idx in tqdm(range(len(doc)), desc="Rendering pages"):
        pix = doc.load_page(page_idx).get_pixmap(matrix=matrix)
        img_path = out_dir / f"{pdf_path.stem}_page{page_idx+1}.{fmt}"
        pix.save(img_path)        # saves directly to PNG/JPEG/TIFF

        # Optional post-processing with Pillow
        # Image.open(img_path).convert("RGB").save(img_path, quality=90)

    print(f"✓ Saved {len(doc)} images to {out_dir.resolve()}")

if __name__ == "__main__":
    pdf_to_images("/Users/tanmayrath/Downloads/file03.pdf")   # ← replace with your PDF


Rendering pages: 100%|██████████████████████████| 14/14 [00:01<00:00, 11.61it/s]

✓ Saved 14 images to /Volumes/Extreme SSD/AdobeIndiaHackathon/pages



