# Extracting Question Snippets from PDF Exams

This notebook demonstrates how to extract question snippets from a PDF exam using PyMuPDF and Pillow. Each question is detected by its number (e.g., 1., 2., 3., etc.), and the corresponding snippet is saved as an image.

## 1. Install Required Libraries

Install PyMuPDF and Pillow if you haven't already.

In [12]:
!pip install pymupdf pillow




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## 2. Define Extraction Script

Below is the script to extract question snippets from a PDF file.

In [8]:
import fitz  # PyMuPDF
import re
import os
from PIL import Image
import numpy as np

# Question and subquestion detection
#QUESTION_REGEX = re.compile(r'^\s*(\*?\s*)(\d+)\.(?:\s|$)')
#SUBQUESTION_REGEX = re.compile(r'^\s*((\d+\.){2,}|[a-zA-Z]\)|\([a-zA-Z]\)|-\s+|\*\s+|\d+\))')
QUESTION_REGEX = re.compile(r"^[0-9]+\\.$")
SUBQUESTION_REGEX = re.compile(r"^[0-9]+\\.[0-9]+$")

def is_candidate_anchor(text, x0, block_width, font_size, page_width, min_font_size=5):
    if x0 > page_width * 0.10:  # First 10% of page width
        return False
    if block_width < 100:
        return False
    if font_size < min_font_size:
        return False

    match = QUESTION_REGEX.match(text)
    if not match:
        return False

    remaining = text[match.end():].strip()
    if len(remaining) == 0:
        return False

    return True

def is_in_table(block, all_blocks, x_tol=5, y_tol=3, min_neighbors=3):
    x0, y0, x1, y1, *_ = block

    # Find blocks that align vertically within y_tol (i.e. same row)
    row_neighbors = [b for b in all_blocks if abs(b[1] - y0) <= y_tol]

    # Find blocks that align horizontally within x_tol (i.e. same column)
    col_neighbors = [b for b in all_blocks if abs(b[0] - x0) <= x_tol]

    # If many neighbors in same row and same column => table cell likely
    if len(row_neighbors) >= min_neighbors and len(col_neighbors) >= min_neighbors:
        return True

    return False


def looks_like_question_or_subquestion(block, all_blocks, page_width):
    x0, y0, x1, y1, text, *_ = block
    text = text.strip()

    print(text)
    # Must start exactly with question/subquestion pattern
    if not (QUESTION_REGEX.match(text) or SUBQUESTION_REGEX.match(text)):
        return False

    # Exclude if block looks like part of a table cell
    if is_in_table(block, all_blocks):
        return False

    # Must be near left margin
    if x0 > page_width * 0.10:
        return False

    # Optionally filter out blocks too far down the page (footer)
    if y0 > 0.6 * 842:  # assuming 842 pts page height (A4)
        return False

    return True


def is_strictly_blank(pix, white_thresh=250):
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    gray = img.convert("L")
    arr = np.array(gray)
    return np.all(arr > white_thresh)

def clip_contains_question(page, clip_rect):
    blocks = page.get_text("blocks")
    for block in blocks:
        x0, y0, x1, y1, *_ = block
        if clip_rect.y0 <= y0 <= clip_rect.y1:
            if looks_like_question_or_subquestion(block, blocks, page.rect.width):
                return True
    return False

def extract_question_snippets(pdf_path, output_dir='output_questions'):
    os.makedirs(output_dir, exist_ok=True)
    doc = fitz.open(pdf_path)

    anchors = []
    question_counts = {}

    # Step 1: Collect anchors
    for page_num, page in enumerate(doc):
        blocks = page.get_text("blocks")
        page_dict = page.get_text("dict")
        block_font_sizes = {}
        for bidx, b in enumerate(page_dict["blocks"]):
            max_font_size = 0
            for line in b.get("lines", []):
                for span in line.get("spans", []):
                    fs = span.get("size", 0)
                    max_font_size = max(max_font_size, fs)
            block_font_sizes[bidx] = max_font_size

        for bidx, block in enumerate(blocks):
            x0, y0, x1, y1, text, *rest = block
            text = text.strip()
            block_width = x1 - x0
            font_size = block_font_sizes.get(bidx, 0)

            if is_candidate_anchor(text, x0, block_width, font_size, page.rect.width):
                match = QUESTION_REGEX.match(text)
                qnum = int(match.group(2))
                anchors.append((qnum, page_num, y0))

    anchors.sort(key=lambda x: (x[1], x[2]))

    print("\nAnchors found (qnum, page_num, y0):")
    for a in anchors:
        print(a)

    # Step 2: Crop per anchor
    for i in range(len(anchors)):
        qnum, start_page, start_y = anchors[i]
        next_anchor = anchors[i + 1] if i + 1 < len(anchors) else None

        end_page = next_anchor[1] if next_anchor else len(doc) - 1
        end_y = next_anchor[2] if next_anchor else doc[end_page].rect.height

        part = 1
        page_num = start_page
        while page_num <= end_page:
            page = doc[page_num]

            if page_num == start_page:
                top = start_y
                bottom = page.rect.height if page_num != end_page else end_y
            elif page_num == end_page:
                top = 0
                bottom = end_y
            else:
                top = 0
                bottom = page.rect.height

            clip = fitz.Rect(0, top, page.rect.width, bottom)
            pix = page.get_pixmap(dpi=150, clip=clip)

            if is_strictly_blank(pix):

                print(f"Skipped blank crop: Q{qnum} page {page_num}")
                page_num += 1
                continue

            if not clip_contains_question(page, clip):
                print(f"Skipped clip with no valid question: Q{qnum} page {page_num}")
                page_num += 1
                continue

            question_counts[qnum] = question_counts.get(qnum, 0) + 1
            suffix = f"part{question_counts[qnum]}"
            output_path = os.path.join(output_dir, f"question_{qnum}_{suffix}.png")

            pix.save(output_path)
            print(f"Saved: {output_path} (contains question/subquestion)")

            part += 1
            page_num += 1


## 3. Run Extraction

Specify your PDF file path and output directory, then run the extraction.

In [9]:
fq_pdf_path = "Fisica-Quimica-A (1).pdf"
pc_pdf_path = "PC_2021v_ee.pdf"
math_pdf_path = "Matematica.pdf"
output_dir = "output_questions"
os.makedirs(output_dir, exist_ok=True)
extract_question_snippets(fq_pdf_path)


Anchors found (qnum, page_num, y0):


## 4. Review Extracted Images

The extracted question snippets will be saved as PNG files in the specified output directory.