Skip to content

Commit

Permalink
pipeline: improve documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Jun 13, 2023
1 parent bf0224f commit 90619b3
Showing 1 changed file with 58 additions and 6 deletions.
64 changes: 58 additions & 6 deletions src/ocrmypdf/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from ocrmypdf.hocrtransform import HocrTransform
from ocrmypdf.pdfa import generate_pdfa_ps
from ocrmypdf.pdfinfo import Colorspace, Encoding, PageInfo, PdfInfo
from ocrmypdf.pluginspec import OrientationConfidence

# Remove this workaround when we require Pillow >= 10
try:
Expand All @@ -51,6 +52,20 @@


def triage_image_file(input_file: Path, output_file: Path, options) -> None:
"""Triage the input image file.
If the input file is an image, check its resolution and convert it to PDF.
Args:
input_file: The path to the input file.
output_file: The path to the output file.
options: An object containing the options passed to the OCRmyPDF command.
Raises:
UnsupportedImageFormatError: If the input file is not a supported image format.
DpiError: If the input image has no resolution (DPI) in its metadata or if the
resolution is not credible.
"""
log.info("Input file is not a PDF, checking if it is an image...")
try:
im = Image.open(input_file)
Expand Down Expand Up @@ -208,6 +223,7 @@ def validate_pdfinfo_options(context: PdfContext) -> None:


def _vector_page_dpi(pageinfo: PageInfo) -> int:
    """Return the DPI to use for a page that carries vector or text content.

    Args:
        pageinfo: Information about the page being examined.

    Returns:
        ``VECTOR_PAGE_DPI`` when the page has vector content or text,
        otherwise 0.
    """
    if pageinfo.has_vector or pageinfo.has_text:
        return VECTOR_PAGE_DPI
    return 0


Expand Down Expand Up @@ -264,6 +280,7 @@ def get_canvas_square_dpi(pageinfo: PageInfo, options) -> Resolution:


def is_ocr_required(page_context: PageContext) -> bool:
"""Check if the page needs to be OCR'd."""
pageinfo = page_context.pageinfo
options = page_context.options

Expand Down Expand Up @@ -339,6 +356,7 @@ def is_ocr_required(page_context: PageContext) -> bool:


def rasterize_preview(input_file: Path, page_context: PageContext) -> Path:
"""Generate a lower quality preview image."""
output_file = page_context.get_path('rasterize_preview.jpg')
canvas_dpi = get_canvas_square_dpi(page_context.pageinfo, page_context.options)
page_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
Expand All @@ -356,8 +374,10 @@ def rasterize_preview(input_file: Path, page_context: PageContext) -> Path:
return output_file


def describe_rotation(page_context: PageContext, orient_conf, correction: int) -> str:
"""Describe the page rotation we are going to perform."""
def describe_rotation(
page_context: PageContext, orient_conf: OrientationConfidence, correction: int
) -> str:
"""Describe the page rotation we are going to perform (or not perform)."""
direction = {0: '⇧', 90: '⇨', 180: '⇩', 270: '⇦'}
turns = {0: ' ', 90: '⬏', 180: '↻', 270: '⬑'}

Expand All @@ -384,7 +404,7 @@ def describe_rotation(page_context: PageContext, orient_conf, correction: int) -


def get_orientation_correction(preview: Path, page_context: PageContext) -> int:
"""Work out orientation correct for each page.
"""Work out orientation correction for each page.
We ask Ghostscript to draw a preview page, which will rasterize with the
current /Rotate applied, and then ask OCR which way the page is
Expand Down Expand Up @@ -418,8 +438,22 @@ def rasterize(
page_context: PageContext,
correction: int = 0,
output_tag: str = '',
remove_vectors=None,
):
remove_vectors: bool | None = None,
) -> Path:
"""Rasterize a PDF page to a PNG image.
Args:
input_file: The input PDF file path.
page_context: The page context object.
correction: The orientation correction angle. Defaults to 0.
output_tag: The output tag. Defaults to ''.
remove_vectors: Whether to remove vectors. Defaults to None, which means
the value from the page context options will be used. If the value
is True or False, it will override the page context options.
Returns:
Path: The output PNG file path.
"""
colorspaces = ['pngmono', 'pnggray', 'png256', 'png16m']
device_idx = 0

Expand Down Expand Up @@ -480,6 +514,15 @@ def preprocess_remove_background(input_file: Path, page_context: PageContext) ->


def preprocess_deskew(input_file: Path, page_context: PageContext) -> Path:
"""Deskews the input image using the OCR engine and saves the output to a file.
Args:
input_file: The input image file to deskew.
page_context: The context of the page being processed.
Returns:
Path: The path to the deskewed image file.
"""
output_file = page_context.get_path('pp_deskew.png')
dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)

Expand Down Expand Up @@ -575,7 +618,16 @@ def ocr_engine_hocr(input_file: Path, page_context: PageContext) -> tuple[Path,


def should_visible_page_image_use_jpg(pageinfo: PageInfo) -> bool:
    """Decide whether the visible page image may be saved as a JPEG.

    A JPEG is permitted only when the page contains at least one image and
    every image on the page was JPEG-encoded originally.

    Args:
        pageinfo: The PageInfo object containing information about the page.

    Returns:
        True if the page has images and all of them are JPEG-encoded,
        otherwise False.
    """
    images = pageinfo.images
    # A page with no images never qualifies for JPEG output.
    if not images:
        return False
    return all(im.enc == Encoding.jpeg for im in images)
Expand Down

0 comments on commit 90619b3

Please sign in to comment.