In [1]:
from typing import List, NamedTuple, Union, Literal, Union, Dict, Any
import random
import re
from pathlib import Path

import fitz # PyMuPDF is imported as fitz
from pydantic import BaseModel, Field

In [2]:
class Bbox(NamedTuple):
    """Rectangle given by four float coordinates.

    Used as the ``bbox`` field of spans, lines and text blocks.
    NOTE(review): presumably (x0, y0) and (x1, y1) are opposite corners in
    PyMuPDF page coordinates -- confirm against the PyMuPDF docs.
    """
    x0: float
    y0: float
    x1: float
    y1: float

class Span(BaseModel):
    """A single text span inside a ``Line``.

    Instances are created when a page dictionary is validated into ``Page``.
    """
    # Font size; reused as ``fontsize`` when the span is re-rendered.
    size: float
    # Bit field. This notebook reads bit 0 (superscript), bit 1 (italic),
    # bit 2 (serif) and bit 4 (bold).
    flags: int
    # Font name as reported for the span (e.g. 'ArialMT' in the sample output).
    font: str
    # Integer colour; converted with ``fitz.sRGB_to_pdf`` before being reused.
    color: int
    # NOTE(review): presumably font metrics relative to the font size -- confirm.
    ascender: float
    descender: float
    # The span's text content.
    text: str
    # NOTE(review): presumably the [x, y] start point of the text -- confirm.
    origin: List[float]
    # Rectangle covered by the span; used as the redaction area.
    bbox: Bbox

class Line(BaseModel):
    """A line of text made up of one or more ``Span`` objects."""
    # Spans belonging to this line.
    spans: List[Span]
    # Writing mode; ``line_filter`` only keeps lines where this is 0.
    wmode: int
    # Direction vector; ``line_filter`` only keeps lines where this is [1.0, 0.0].
    dir: List[float]
    # Rectangle covered by the line.
    bbox: Bbox

class ImageBlock(BaseModel):
    """A page block whose ``type`` is 1 (an image block).

    Only the fields listed here are declared; image blocks are discarded
    by ``block_filter`` and never inspected further in this notebook.
    """
    # Discriminator: must be exactly 1 for this model to validate.
    type: Literal[1]
    # Block number as given in the page dictionary.
    number: int
    # NOTE(review): presumably the horizontal image resolution -- confirm.
    xres: int

class TextBlock(BaseModel):
    """A page block whose ``type`` is 0 (a text block) and which holds lines."""
    # Discriminator: must be exactly 0 for this model to validate.
    type: Literal[0]
    # Block number as given in the page dictionary.
    number: int
    # Rectangle covered by the block.
    bbox: Bbox
    # Text lines contained in the block.
    lines: List[Line]

class Page(BaseModel):
    """One page: its dimensions plus its text and image blocks.

    ``width`` and ``height`` are required; ``blocks`` defaults to an empty list.
    """
    # Page dimensions (both required, no default).
    width: float
    height: float
    # Each entry validates as either a TextBlock (type 0) or an ImageBlock (type 1).
    blocks: List[Union[TextBlock, ImageBlock]] = Field(default_factory=list)

def model_validate(data: Dict[str, Any]) -> Page:
    """Build a ``Page`` from a raw page dictionary.

    The previous implementation only forwarded ``blocks`` and therefore
    always failed validation, because ``Page.width`` and ``Page.height`` are
    required fields. Delegating to pydantic's own validator passes every key
    of ``data`` through, while a missing ``blocks`` key still falls back to
    the model's default (an empty list).

    Args:
        data: Mapping with at least ``width`` and ``height``; ``blocks`` is
            optional.

    Returns:
        The validated ``Page``.

    Raises:
        pydantic.ValidationError: If ``data`` does not match the ``Page`` schema.
    """
    return Page.model_validate(data)



In [3]:
def subset(input_pdf, output_pdf, page_nums):
    """Write a new PDF that contains only the selected pages of ``input_pdf``.

    Pages are copied in the order given by ``page_nums``; a page number that
    appears more than once is copied more than once. Page numbers outside
    ``0 <= n < page count`` are silently skipped.

    Both documents are opened as context managers so their file handles are
    released even if copying or saving raises.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write.
        page_nums: Iterable of 0-based page numbers to copy.
    """
    with fitz.open(input_pdf) as input_doc, fitz.open() as doc:
        page_count = len(input_doc)

        for page_num in page_nums:
            # Ignore page numbers that do not exist in the input document.
            if not 0 <= page_num < page_count:
                continue

            doc.insert_pdf(input_doc, from_page=page_num, to_page=page_num)

        doc.save(output_pdf)
    
# Subsets:
# subset("./docs/ulykker.pdf", "./docs/ulykker-subset.pdf", [18,19])

In [9]:
def log_reason(reason: str, obj: Any, verbose):
    """Print why ``obj`` was filtered out, but only when ``verbose`` is truthy.

    The message has the form ``Filtered (<reason>) <repr(obj)>`` and stdout
    is flushed right after printing.
    """
    if not verbose:
        return
    print(f"Filtered ({reason})", repr(obj), flush=True)

def block_filter(block: Union[TextBlock, ImageBlock], verbose=True) -> bool:
    """Return True only for ``TextBlock`` instances.

    Any other block is reported through ``log_reason`` with the reason
    ``non-textblock`` and rejected.
    """
    is_text_block = isinstance(block, TextBlock)
    if not is_text_block:
        log_reason("non-textblock", block, verbose)
    return is_text_block
    
def line_filter(line: Line, verbose=True) -> bool:
    """Return True for lines with ``wmode == 0`` and ``dir == [1.0, 0.0]``.

    The writing mode is checked first; the first failing check is reported
    through ``log_reason`` (reason ``wmode`` or ``direction``) and the line
    is rejected.
    NOTE(review): presumably this keeps only ordinary horizontal text -- confirm.
    """
    rejection = None
    if line.wmode != 0:
        rejection = "wmode"
    elif line.dir != [1.0, 0.0]:
        rejection = "direction"

    if rejection is None:
        return True

    log_reason(rejection, line, verbose)
    return False

# Matches text that is just a number: digit groups optionally joined by
# ',' or '.', with optional surrounding whitespace.
pattern_is_number = re.compile(r"^\s*\d+(?:[\,\.]\d+)*\s*$")
def span_filter(span: Span, verbose=True) -> bool:
    """Return True for spans that carry meaningful body text.

    A span is rejected, and the reason reported through ``log_reason``, at
    the first rule that applies, checked in this order:

    * ``superscript`` -- bit 0 of ``span.flags`` is set
    * ``length``      -- the text is empty
    * ``whitespace``  -- the text consists only of whitespace
    * ``numeric``     -- the text matches ``pattern_is_number``
    """
    text = span.text

    if span.flags & 1:
        reason = "superscript"
    elif not text:
        reason = "length"
    elif text.isspace():
        reason = "whitespace"
    elif pattern_is_number.match(text):
        reason = "numeric"
    else:
        return True

    log_reason(reason, span, verbose)
    return False

def get_font(flags):
    """Map span ``flags`` to a Times or Helvetica font name.

    Only three bits are read: bit 1 (italic), bit 2 (serif) and bit 4 (bold).
    Serif spans map to a ``times-*`` name, all others to a ``helvetica*``
    name, with the bold/italic variant chosen accordingly.
    """
    is_italic = bool(flags & (1 << 1))
    is_serif = bool(flags & (1 << 2))
    is_bold = bool(flags & (1 << 4))

    # Keyed by (serif, bold, italic).
    font_names = {
        (True, True, True): "times-bolditalic",
        (True, True, False): "times-bold",
        (True, False, True): "times-italic",
        (True, False, False): "times-roman",
        (False, True, True): "helvetica-boldoblique",
        (False, True, False): "helvetica-bold",
        (False, False, True): "helvetica-oblique",
        (False, False, False): "helvetica",
    }
    return font_names[(is_serif, is_bold, is_italic)]



In [5]:
def reduce_page(page: Page) -> Page:
    """Strip everything from ``page`` that the filter rules reject.

    Blocks, lines and spans are run through ``block_filter``, ``line_filter``
    and ``span_filter``; lines left without spans and blocks left without
    lines are then dropped as well. The page and its surviving blocks and
    lines are modified in place, and the same ``page`` object is returned.
    """
    # Keep only the blocks accepted by the block rules.
    kept_blocks = [blk for blk in page.blocks if block_filter(blk)]

    for blk in kept_blocks:
        # Keep only the lines accepted by the line rules.
        kept_lines = [ln for ln in blk.lines if line_filter(ln)]

        # Within each surviving line, keep only the accepted spans.
        for ln in kept_lines:
            ln.spans = [sp for sp in ln.spans if span_filter(sp)]

        # Lines that lost all their spans are dropped.
        blk.lines = [ln for ln in kept_lines if ln.spans]

    # Blocks that lost all their lines are dropped.
    page.blocks = [blk for blk in kept_blocks if blk.lines]

    return page
        

In [12]:
# Open the source PDF; ``doc`` stays open because the annotation cell reuses it.
document_path = "./docs/Rapport.pdf"
doc = fitz.open(document_path)

# For every page: extract the sorted "dict" text structure, validate it into
# a Page model, then filter it down with reduce_page.
pages = [
    reduce_page(Page.model_validate(raw_page.get_text("dict", sort=True)))
    for raw_page in doc.pages()
]

Filtered (non-textblock) ImageBlock(type=1, number=9, xres=96)
Filtered (non-textblock) ImageBlock(type=1, number=10, xres=96)
Filtered (non-textblock) ImageBlock(type=1, number=11, xres=96)
Filtered (non-textblock) ImageBlock(type=1, number=24, xres=96)
Filtered (non-textblock) ImageBlock(type=1, number=25, xres=96)
Filtered (superscript) Span(size=6.600000381469727, flags=1, font='ArialMT', color=1135820, ascender=0.9052734375, descender=-0.2119140625, text='[6]', origin=[381.392578125, 537.4140014648438], bbox=Bbox(x0=381.392578125, y0=531.439208984375, x1=388.7286376953125, y1=538.8126220703125))
Filtered (superscript) Span(size=12.0, flags=1, font='ArialMT', color=2105377, ascender=0.9052734375, descender=-0.2119140625, text=' ', origin=[388.7286376953125, 537.4140014648438], bbox=Bbox(x0=388.7286376953125, y0=526.5507202148438, x1=392.06195068359375, y1=544.31298828125))
Filtered (superscript) Span(size=6.600000381469727, flags=1, font='ArialMT', color=1135820, ascender=0.9052734

In [13]:
# Annotate every surviving span with a redaction that re-renders its text on
# a coloured background; all spans of one block share one random colour.
for i, docpage in enumerate(doc):
    page = pages[i]
    print("PAGE")
    for block in page.blocks:
        print("\tBlock")
        # One random RGB fill per block, each channel in [0.2, 1.0].
        random_color = tuple(random.uniform(0.2, 1.0) for _ in range(3))
        for line in block.lines:
            for span in line.spans:
                print("\t\t", repr(span.text))
                docpage.add_redact_annot(
                    quad=span.bbox,
                    text=span.text,
                    fontname=get_font(span.flags),
                    fontsize=span.size,
                    text_color=fitz.sRGB_to_pdf(span.color),
                    fill=random_color,
                )
    # Apply all redactions added to this page.
    docpage.apply_redactions()

# Save the modified document under <grandparent of the source>/annotated/<name>.
doc_path = Path(document_path)
annotated_dir = doc_path.parent.parent / "annotated"
# Saving fails if the target directory is missing, so create it first.
annotated_dir.mkdir(parents=True, exist_ok=True)
annotated_path = str(annotated_dir / doc_path.name)
doc.save(annotated_path)

PAGE
	Block
		 'KURSUSNAVN'
	Block
		 'RAPPORTTITEL'
	Block
		 'DETTE ER EN UNDERTITEL'
	Block
		 'Introduktion'
	Block
		 'US Open 2024 er en tennisturnering, der bliver spillet udendørs på hardcourt-baner i'
	Block
		 'perioden 26. august - 8. september 2024 i USTA Billie Jean King National Tennis Center i'
	Block
		 'New York City, USA. Det er den 144. udgave af mesterskabet og den fjerde og sidste grand'
	Block
		 'slam-turnering i 2024. Kvalifikationen i singlerækkerne spilles samme sted den 19. - 22.'
	Block
		 'august 2024.'
PAGE
	Block
		 'Siden Ruslands invasion af Ukraine i begyndelsen af 2022 havde tennissportens styrende'
	Block
		 'organer, WTA, ATP, ITF og de fire grand slam-turneringer, tilladt, at spillere fra Rusland og'
	Block
		 'Hviderusland fortsat kunne deltage i grand slam-turneringer samt turneringer på ATP Tour'
	Block
		 'og WTA Tour, men de kunne ikke stille op under landenes navne eller flag, og spillerne fra'
	Block
		 'de to lande deltog derfor i turnering