In [4]:
!pip install pymupdf sentence-transformers pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.1-py3-non

In [17]:
from google.colab import files

# Ask for a file through Colab's upload helper; the result is keyed by filename.
uploaded = files.upload()

# An empty result means nothing was selected. Fail with a clear message here
# instead of an opaque IndexError/StopIteration on the lookup below.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose a PDF.")

# Only the first uploaded file is processed by the rest of the notebook.
pdf_path = next(iter(uploaded))
print(f"✅ Uploaded: {pdf_path}")


Saving file02.pdf to file02.pdf
✅ Uploaded: file02.pdf


In [23]:
import pdfplumber
from collections import defaultdict

def extract_lines_with_fonts(pdf_path, max_pages=50):
    """Extract text lines with font metadata from the leading pages of a PDF.

    Words are read with pdfplumber and regrouped into lines by their vertical
    position (``top`` rounded to one decimal place).

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.
        max_pages: Maximum number of leading pages to process.

    Returns:
        A list of dicts, one per reconstructed line, with the keys ``text``
        (words joined left-to-right by single spaces), ``fontsize`` (mean size
        of the line's words), ``fontname`` (most frequent font name on the
        line), ``y`` (the rounded ``top``) and ``page`` (1-based page number).
    """
    # Local import keeps the function usable regardless of notebook cell order.
    from collections import Counter

    lines = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages[:max_pages], start=1):
            # `extra_attrs` are *word-splitting* keys: pdfplumber starts a new
            # word whenever one of them changes between characters. "x0"/"top"
            # differ for practically every character, so listing them shreds
            # the text into single letters. Both are present on every word
            # dict anyway, so only the font attributes are requested here.
            words = page.extract_words(extra_attrs=["fontname", "size"])

            grouped = defaultdict(list)
            for word in words:
                grouped[round(word["top"], 1)].append(word)

            for y, word_group in grouped.items():
                ordered = sorted(word_group, key=lambda w: w["x0"])
                text = " ".join(w["text"] for w in ordered)
                font_size = sum(w["size"] for w in word_group) / len(word_group)
                # Counter breaks ties by first occurrence (left-most word here),
                # which is deterministic, unlike taking max() over a set.
                font_name = Counter(w["fontname"] for w in ordered).most_common(1)[0][0]
                lines.append({
                    "text": text.strip(),
                    "fontsize": font_size,
                    "fontname": font_name,
                    "y": y,
                    "page": page_number
                })
    return lines


In [42]:

# 🧠 Heading detection
import re
from typing import List, Dict
from collections import Counter
from sentence_transformers import SentenceTransformer, util
import torch

# Sentence-embedding model used by the semantic heading check further down.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def clean_repeating_chars(text: str) -> str:
    """Collapse every run of three or more identical characters into one.

    Runs of exactly two characters (the "ee" in "meet") are left alone.
    Be aware that legitimate runs are shortened too, e.g. "1000" -> "10".
    """
    run_of_three_plus = re.compile(r'(.)\1{2,}')
    return run_of_three_plus.sub(lambda run: run.group(1), text)

def fix_spacing(text: str) -> str:
    """Re-join letter-spaced words such as "T i t l e" into "Title".

    A letter-spaced run is five or more single word characters, each separated
    by exactly one whitespace character. Only the whitespace *inside* such runs
    is removed; ordinary spaces elsewhere in the line are preserved, so
    "T i t l e of the document" becomes "Title of the document" rather than
    having every word glued together.

    Args:
        text: The line to repair.

    Returns:
        ``text`` with each letter-spaced run collapsed; unchanged when the
        line contains no such run.
    """
    return re.sub(
        r'\b(?:\w\s){4,}\w\b',
        # Strip whitespace only within the matched run.
        lambda run: re.sub(r'\s', '', run.group(0)),
        text,
    )
def normalize_heading_numbering(text: str) -> str:
    """Remove whitespace around dots that separate digits ("2 . 4" -> "2.4").

    Lookarounds are used so the digits themselves are not consumed by a match.
    This lets chained numbering such as "3 . 2 . 1" be fully normalized to
    "3.2.1"; consuming the digits would leave the second separator untouched.

    Args:
        text: Heading text that may contain spaced-out section numbers.

    Returns:
        ``text`` with every digit-dot-digit separator tightened to a bare ".".
    """
    return re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', text)

def is_probable_heading(text: str) -> bool:
    """Cheap lexical filter deciding whether ``text`` could be a heading.

    The text is rejected when any of these holds: it is empty; it has fewer
    than 2 or more than 14 whitespace-separated words; it consists only of
    digits, dots and whitespace; it does not start with an ASCII uppercase
    letter or digit; it is entirely lowercase; it contains a character other
    than word characters, whitespace, ".", "-" or ":"; it has more than four
    dots or commas; any word is longer than 25 characters; or it ends with a
    period and has more than three words.

    Args:
        text: Candidate line (leading/trailing whitespace is ignored).

    Returns:
        True if no rejection rule fired, False otherwise.
    """
    if not text:
        return False

    text = text.strip()
    words = text.split()

    # 1. Too short or too long
    if not 2 <= len(words) <= 14:
        return False

    # 2. Reject bare numbering such as "1 . 2" or "2024 2025".
    #    Whitespace has to be part of the pattern: anything that survived
    #    rule 1 contains at least one space, so a digits-and-dots-only pattern
    #    could never match here.
    if re.fullmatch(r'[\d\.\s]+', text):
        return False

    # 3. Must start with an uppercase letter or a digit (numbered heading)
    if not re.match(r'^[A-Z0-9]', text):
        return False

    # 4. All lowercase (e.g. "3 apples were eaten") is not a heading
    if text.islower():
        return False

    # 5. Any symbol outside letters/digits/underscore, whitespace, ".", "-", ":"
    #    (e.g. ©, ®, @, but also commas and parentheses) disqualifies the line.
    if re.search(r'[^\w\s\.\-:]', text):
        return False

    # 6. Looks like a paragraph (many periods/commas).
    #    Note: rule 5 already rejects any comma, so the comma half only becomes
    #    effective if rule 5 is ever relaxed.
    if text.count('.') > 4 or text.count(',') > 4:
        return False

    # 7. Junk OCR: extremely long words
    if any(len(word) > 25 for word in words):
        return False

    # 8. Ends like a sentence (only if long)
    if text.endswith('.') and len(words) > 3:
        return False

    return True


def get_heading_level(text: str) -> str:
    """Map a heading's leading section number to an outline level string.

    The number of dot-separated components at the start of the text gives the
    level: "1" -> "H1", "1.2" or "2 . 4" -> "H2", "3 . 2 . 1" -> "H3".
    Whitespace around "." and "-" is collapsed first, and "-" is treated like
    ".". Text without a leading number yields "H2" when it starts with
    "Appendix", "Section" or "Chapter" (any case) followed by an alphanumeric
    token, and "H1" otherwise.
    """
    stripped = text.strip()

    # Tighten separators so that "4 . 1" is seen as "4.1"
    collapsed = re.sub(r'\s*[\.\-]\s*', '.', stripped)

    numbering = re.match(r'^(\d+(?:\.\d+)*)([\.\-\)])?\s*', collapsed)
    if numbering is not None:
        depth = len(numbering.group(1).split('.'))
        return f"H{depth}"

    named_section = re.match(r'^(Appendix|Section|Chapter)\s+[A-Z0-9]+', stripped, re.IGNORECASE)
    return "H2" if named_section else "H1"




# Reference section titles that candidate lines are compared against.
KNOWN_HEADING_KEYWORDS = [
    "Introduction",
    "Background",
    "Summary",
    "Objective",
    "Goals",
    "Timeline",
    "Scope",
    "Evaluation",
    "Methodology",
    "References",
    "Appendix",
    "Conclusion",
    "Acknowledgements",
    "Membership",
    "Terms of Reference",
]

# Embed the reference titles once, up front, so each later lookup only has to
# encode the candidate text.
heading_embeddings = model.encode(KNOWN_HEADING_KEYWORDS, convert_to_tensor=True)

def is_semantically_heading(text: str, threshold: float = 0.45) -> bool:
    """Check whether ``text`` is semantically close to a known heading keyword.

    The text is embedded with the module-level ``model`` and compared, by
    cosine similarity, against the precomputed ``heading_embeddings``.

    Args:
        text: Candidate heading text.
        threshold: Minimum best-match cosine similarity required to accept the
            text. Defaults to 0.45, the value previously hard-coded here.

    Returns:
        True if the highest similarity to any reference keyword is at least
        ``threshold``; False for blank text or a lower similarity.
    """
    # Blank text can never be a heading; skip the model call entirely.
    if not text or not text.strip():
        return False

    emb = model.encode(text, convert_to_tensor=True)
    best_similarity = util.cos_sim(emb, heading_embeddings).max().item()
    return best_similarity >= threshold

def detect_headings(lines: List[Dict]) -> Dict:
    """Derive a document title and a heading outline from extracted lines.

    Title: all page-1 lines whose font size is within 0.5 of the largest font
    size on page 1, cleaned and joined with spaces ("Untitled" when there are
    no page-1 lines).

    Outline: each distinct cleaned line that passes ``is_probable_heading``
    and either starts with a section number or is accepted by
    ``is_semantically_heading``.

    Args:
        lines: Dicts with at least the keys ``text``, ``fontsize`` and
            ``page`` (1-based).

    Returns:
        ``{"title": str, "outline": [{"level", "text", "page"}, ...]}`` where
        ``page`` is zero-based and entries follow the order of ``lines``.
    """
    # --- Title: combine all largest-font lines on page 1 ---
    page1_lines = [ln for ln in lines if ln["page"] == 1]
    if page1_lines:
        max_font = max(ln["fontsize"] for ln in page1_lines)
        title = " ".join(
            clean_repeating_chars(fix_spacing(ln["text"]))
            for ln in page1_lines
            if abs(ln["fontsize"] - max_font) < 0.5
        ).strip()
    else:
        title = "Untitled"

    # --- Outline ---
    # A leading section number followed by a separator, e.g. "1.", "2.3 ", "4)".
    numbered_prefix = re.compile(r'^(\d+(\.\d+)*)([\.\)\s:-])')
    headings = []
    seen = set()

    for line in lines:
        clean_text = clean_repeating_chars(fix_spacing(line["text"].strip()))

        if not clean_text or clean_text in seen:
            continue
        # Record every processed text, accepted or not. The checks below are
        # deterministic per text, so a repeat (running header/footer, body
        # line) would get the same verdict; skipping it avoids paying for the
        # embedding model again.
        seen.add(clean_text)

        # Filter out unlikely headings
        if not is_probable_heading(clean_text):
            continue

        # `or` short-circuits: the (expensive) semantic check only runs for
        # lines that are not already recognised by their numbering.
        if numbered_prefix.match(clean_text) or is_semantically_heading(clean_text):
            headings.append({
                "level": get_heading_level(clean_text),
                "text": clean_text,
                "page": line["page"] - 1  # zero-based page indexing
            })

    return {
        "title": title,
        "outline": headings
    }


# 📝 Extract using PyMuPDF (for alternative font-based extraction)
import fitz  # PyMuPDF
def extract_pdf_lines(pdf_path: str) -> List[Dict]:
    """Extract non-blank text lines and their mean font size using PyMuPDF.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        A list of dicts, one per non-blank line, with the keys ``text`` (span
        texts concatenated and stripped), ``fontsize`` (unweighted mean of the
        line's span sizes) and ``page`` (1-based page number).
    """
    lines = []
    # The context manager closes the document even if parsing raises, so the
    # underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no text to extract.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    spans = line["spans"]
                    line_text = "".join(span["text"] for span in spans).strip()
                    # Non-blank text implies at least one span, so the mean
                    # below never divides by zero.
                    if line_text:
                        lines.append({
                            "text": line_text,
                            "fontsize": sum(span["size"] for span in spans) / len(spans),
                            "page": page_num
                        })
    return lines

In [43]:
import json
def save_outline_json(title, headings, output_file="outline.json"):
    """Write the title and outline to a JSON file and return the file path.

    Args:
        title: Document title stored under the ``"title"`` key.
        headings: Outline entries stored under the ``"outline"`` key.
        output_file: Destination path; defaults to ``"outline.json"``.

    Returns:
        ``output_file``, unchanged, after the file has been written.
    """
    data = {
        "title": title,
        "outline": headings
    }
    # Explicit UTF-8 makes the file independent of the platform's default
    # encoding, and ensure_ascii=False keeps non-ASCII heading text readable
    # instead of emitting \uXXXX escapes. The parsed JSON is identical.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved as {output_file}")
    return output_file


In [44]:
# Run the pipeline on the uploaded PDF: extract lines with font metadata,
# detect the title and headings, then write them to a JSON outline file.
lines = extract_lines_with_fonts(pdf_path)
result = detect_headings(lines)
json_file = save_outline_json(result["title"], result["outline"])
# Hand the generated file to Colab's `files.download` helper.
files.download(json_file)

✅ Saved as outline.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>