In [49]:
import fitz  # PyMuPDF
from collections import Counter, defaultdict
import re
from statistics import median
import json

In [50]:
def is_decorative(text):
    """Return True when ``text`` looks decorative or non-informative.

    A string is treated as decorative when any of these heuristics holds:

    * it consists solely of five or more dots, dashes, underscores or
      whitespace characters (leader lines, rules);
    * after stripping whitespace it is one single character repeated;
    * after stripping whitespace it is shorter than three characters;
    * it contains fewer than three alphabetic characters.

    Args:
        text: The string to classify.

    Returns:
        bool: Always a real ``bool`` (never a ``re.Match`` object), so the
        result can be compared, serialised or stored safely.
    """
    stripped = text.strip()
    return bool(
        re.fullmatch(r"[.\-_\s]{5,}", text)        # just dots/dashes/underscores/spaces
        or len(set(stripped)) == 1                 # same character repeated
        or len(stripped) < 3                       # very short
        or sum(c.isalpha() for c in text) < 3      # not enough alphabetic content
    )

In [51]:
def parse_pdf_spans(doc):
    """Collect the bold text spans of a document as heading candidates.

    Each page is read through ``page.get_text("dict")``. A span is kept when:

    * its stripped text is non-empty and not flagged by ``is_decorative``;
    * the top of its bounding box lies within the middle 90% of the page
      height (the top and bottom 5% are skipped, presumably to drop running
      headers and footers);
    * it is bold, i.e. the font name contains "bold" (case-insensitive) or
      the span's ``flags`` field has the bold bit set.

    Args:
        doc: An iterable of pages (e.g. an opened PyMuPDF document). Each
            page must expose ``rect.height`` and ``get_text("dict")``.

    Returns:
        list[dict]: One dict per kept span, in reading order, with the keys
        ``text``, ``size`` (rounded to one decimal), ``font``, ``page``
        (1-based), ``is_bold``, ``y`` and ``x`` (top-left bbox corner).
    """
    # PyMuPDF documents bit 4 of a span's "flags" as the bold marker.
    bold_flag = 1 << 4

    all_spans = []
    for page_num, page in enumerate(doc, start=1):
        page_height = page.rect.height
        top_limit = 0.05 * page_height
        bottom_limit = 0.95 * page_height

        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry (presumably image blocks) have no text.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text or is_decorative(text):
                        continue

                    x, y = span["bbox"][0], span["bbox"][1]
                    if y < top_limit or y > bottom_limit:
                        continue

                    font = span["font"]
                    # Font names are not reliable on their own ("arialbd",
                    # "Helvetica-Black", lower-cased names), so the flag bit
                    # is consulted as well.
                    is_bold = "bold" in font.lower() or bool(span.get("flags", 0) & bold_flag)
                    if not is_bold:
                        # Only bold spans are considered heading candidates.
                        continue

                    all_spans.append({
                        "text": text,
                        "size": round(span["size"], 1),
                        "font": font,
                        "page": page_num,
                        "is_bold": is_bold,
                        "y": y,
                        "x": x
                    })

    return all_spans


In [52]:
def adjust_font_sizes(spans):
    """Annotate every span with an ``adjusted_size`` used for ranking.

    Bold spans get a fixed bonus of 4 added to their ``size``; other spans
    keep their size. The value is rounded to two decimals and written back
    into each span dict in place.

    Args:
        spans: List of span dicts carrying ``size`` and ``is_bold``.

    Returns:
        The same list object that was passed in, with each dict updated.
    """
    for entry in spans:
        bonus = 4 if entry["is_bold"] else 0
        entry["adjusted_size"] = round(entry["size"] + bonus, 2)
    return spans

In [53]:
def infer_dynamic_thresholds(spans):
    """Derive layout thresholds from the observed span positions.

    Args:
        spans: List of span dicts carrying ``x``, ``y``, ``page`` and
            ``adjusted_size``.

    Returns:
        tuple: ``(base_x, indent_delta, y_merge_threshold)`` where

        * ``base_x`` is the smallest ``x`` among the spans;
        * ``indent_delta`` is the median of the positive offsets from
          ``base_x``, or 20 when no span is indented;
        * ``y_merge_threshold`` is the median vertical distance between
          consecutive spans sharing page and adjusted size, or 15 when no
          such pair exists.

        For an empty input the fixed defaults ``(50, 20, 10)`` are returned.
    """
    if not spans:
        # Nothing to measure: fall back to default values.
        return 50, 20, 10

    x_vals = [s["x"] for s in spans]
    base_x = min(x_vals)

    # median() raises StatisticsError on empty data, which happens whenever
    # every span shares the same left edge (including the single-span case),
    # so the fallback has to be applied before calling it.
    offsets = [x - base_x for x in x_vals if x - base_x > 0]
    indent_delta = median(offsets) if offsets else 20

    y_deltas = [
        abs(b["y"] - a["y"])
        for a, b in zip(spans, spans[1:])
        if a["adjusted_size"] == b["adjusted_size"] and a["page"] == b["page"]
    ]
    y_merge_threshold = median(y_deltas) if y_deltas else 15

    return base_x, indent_delta, y_merge_threshold

In [54]:
def map_sizes_to_levels(spans):
    """Assign heading levels to the largest distinct adjusted sizes.

    The distinct ``adjusted_size`` values are ranked from largest to
    smallest; the top three are labelled "H1", "H2" and "H3" in that order.
    Sizes beyond the third get no entry.

    Args:
        spans: List of span dicts carrying ``adjusted_size``.

    Returns:
        dict: Mapping of adjusted size to level name (at most three entries).
    """
    ranked_sizes = sorted({s["adjusted_size"] for s in spans}, reverse=True)
    # zip() stops at the shorter input, so fewer than three sizes is fine.
    return dict(zip(ranked_sizes, ("H1", "H2", "H3")))


In [55]:
def build_outline(spans, size_to_level, base_x, indent_delta, y_merge_threshold):
    """Turn heading-candidate spans into a title and a flat outline.

    For each span (in order) the level is looked up by ``adjusted_size``.
    A bold span without a level is promoted to "H2" when it is indented by
    at least ``indent_delta`` relative to the left-most span of the same
    page and adjusted size. Spans that still have no level are dropped.

    Directly following spans are merged into the current heading while they
    share page, adjusted size and font, start within 5 units horizontally of
    the first span, and lie within 10 units vertically of the previously
    merged span.

    NOTE(review): ``y_merge_threshold`` is accepted but not used; the merge
    tolerances are the fixed values 10 and 5. ``base_x`` only serves as a
    fallback when no span matches the current page/size, which looks
    unreachable because the current span always matches itself.

    Args:
        spans: Span dicts with ``text``, ``adjusted_size``, ``page``, ``x``,
            ``y``, ``font`` and ``is_bold``.
        size_to_level: Mapping of adjusted size to level name.
        base_x: Fallback baseline x position.
        indent_delta: Minimum indentation for the "H2" promotion.
        y_merge_threshold: Currently unused.

    Returns:
        tuple: ``(title_parts, outline)``. ``title_parts`` holds the text of
        the first "H1" found on page 1 (at most one item). ``outline`` is a
        list of ``{"level", "text", "page"}`` dicts; the title heading is
        included in it as well.
    """
    outline = []
    title_parts = []
    merged_indices = set()
    # Lazily filled cache: (page, adjusted_size) -> left-most x of that group.
    left_edges = {}
    total = len(spans)

    for idx, current in enumerate(spans):
        if idx in merged_indices:
            continue

        size = current["adjusted_size"]
        page = current["page"]
        x = current["x"]
        y = current["y"]
        level = size_to_level.get(size)

        # Promote indented bold spans to H2 when the size has no level.
        if not level and current["is_bold"]:
            group = (page, size)
            if group not in left_edges:
                group_xs = [
                    s["x"] for s in spans
                    if s["page"] == page and s["adjusted_size"] == size
                ]
                left_edges[group] = min(group_xs) if group_xs else base_x
            if x - left_edges[group] >= indent_delta:
                level = "H2"

        if not level:
            continue

        # Absorb the continuation spans that directly follow this heading.
        pieces = [current["text"]]
        for nxt_idx in range(idx + 1, total):
            candidate = spans[nxt_idx]
            continues_heading = (
                candidate["page"] == page
                and candidate["adjusted_size"] == size
                and abs(candidate["y"] - y) < 10
                and abs(candidate["x"] - x) < 5
                and candidate["font"] == current["font"]
            )
            if not continues_heading:
                break
            pieces.append(candidate["text"])
            merged_indices.add(nxt_idx)
            # Track the last merged line so multi-line headings keep chaining.
            y = candidate["y"]
        combined_text = " ".join(pieces)

        # Only the first H1 on page 1 becomes the title.
        if page == 1 and level == "H1" and not title_parts:
            title_parts.append(combined_text)

        outline.append({
            "level": level,
            "text": combined_text.strip(),
            "page": page
        })

    return title_parts, outline


In [56]:

def extract_structured_headings(pdf_path):
    """Extract a title and heading outline from a PDF file.

    Runs the full pipeline: parse bold spans, compute adjusted sizes, infer
    layout thresholds, map sizes to heading levels and build the outline.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        dict: ``{"title": str, "outline": list}`` where ``outline`` is the
        list of ``{"level", "text", "page"}`` dicts from ``build_outline``.
    """
    # The spans are plain dicts, so the document can be closed as soon as
    # parsing is done; the context manager releases the file handle even
    # when parsing raises.
    with fitz.open(pdf_path) as doc:
        spans = parse_pdf_spans(doc)

    spans = adjust_font_sizes(spans)
    base_x, indent_delta, y_merge_threshold = infer_dynamic_thresholds(spans)
    size_to_level = map_sizes_to_levels(spans)
    title_parts, outline = build_outline(spans, size_to_level, base_x, indent_delta, y_merge_threshold)
    return {
        "title": " ".join(title_parts).strip(),
        "outline": outline
    }


In [58]:
# NOTE: machine-specific absolute path; point this at a local PDF before re-running.
pdf_path = "/Users/samyuktaarocketlane/Downloads/file01.pdf"
result = extract_structured_headings(pdf_path)
print(json.dumps(result, indent=4))

{
    "title": "Application form for grant of LTC advance",
    "outline": [
        {
            "level": "H1",
            "text": "Application form for grant of LTC advance",
            "page": 1
        },
        {
            "level": "H2",
            "text": "S.No",
            "page": 1
        },
        {
            "level": "H2",
            "text": "Name",
            "page": 1
        },
        {
            "level": "H2",
            "text": "Age",
            "page": 1
        },
        {
            "level": "H2",
            "text": "Relationship",
            "page": 1
        },
        {
            "level": "H2",
            "text": "Date",
            "page": 1
        },
        {
            "level": "H2",
            "text": "Signature of Government Servant.",
            "page": 1
        }
    ]
}
