Skip to content

Commit

Permalink
Make hocr API experimental for now
Browse files Browse the repository at this point in the history
This commit can be reverted when we are ready to release a new version.
  • Loading branch information
jbarlow83 committed Oct 30, 2023
1 parent 580252a commit 71166f7
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 14 deletions.
8 changes: 5 additions & 3 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ tagged yet.
v15.4.0
=======

- Added new APIs to support offline editing of the final text. Specifically,
one can now generate hOCR files with OCRmyPDF, edit them with some other tool,
and then finalize the PDF.
- Added new experimental APIs to support offline editing of the final text.
Specifically, one can now generate hOCR files with OCRmyPDF, edit them with
some other tool, and then finalize the PDF. These APIs are experimental and
subject to change, including details of how the working folder is used.
There is no command line interface for them.
- Code reorganization: executors, progress bars, initialization and setup.
- Fixed test coverage in cases where the coverage tool did not properly trace
into threads or subprocesses. This code was still being tested but appeared
Expand Down
8 changes: 5 additions & 3 deletions src/ocrmypdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@
configure_debug_logging,
)
from ocrmypdf._version import PROGRAM_NAME, __version__
from ocrmypdf.api import Verbosity, configure_logging, hocr_to_ocr_pdf, ocr, pdf_to_hocr
from ocrmypdf.api import (
Verbosity,
configure_logging,
ocr,
)
from ocrmypdf.exceptions import (
BadArgsError,
DpiError,
Expand Down Expand Up @@ -45,7 +49,6 @@
'ExitCode',
'ExitCodeException',
'helpers',
'hocr_to_ocr_pdf',
'hocrtransform',
'hookimpl',
'InputFileError',
Expand All @@ -55,7 +58,6 @@
'OrientationConfidence',
'OutputFileAccessError',
'PageContext',
'pdf_to_hocr',
'pdfa',
'PdfContext',
'pdfinfo',
Expand Down
10 changes: 6 additions & 4 deletions src/ocrmypdf/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ def ocr( # noqa: D417
return run_pipeline(options=options, plugin_manager=plugin_manager)


def pdf_to_hocr( # noqa: D417
def _pdf_to_hocr( # noqa: D417
input_pdf: Path,
output_folder: Path,
*,
Expand Down Expand Up @@ -432,6 +432,8 @@ def pdf_to_hocr( # noqa: D417
For arguments not explicitly documented here, see documentation for the
equivalent command line parameter.
This API is **experimental** and subject to change.
Args:
input_pdf: Input PDF file path.
output_folder: Output folder path.
Expand Down Expand Up @@ -468,7 +470,7 @@ def pdf_to_hocr( # noqa: D417
return run_hocr_pipeline(options=options, plugin_manager=plugin_manager)


def hocr_to_ocr_pdf( # noqa: D417
def _hocr_to_ocr_pdf( # noqa: D417
work_folder: Path,
output_file: Path,
*,
Expand Down Expand Up @@ -496,6 +498,8 @@ def hocr_to_ocr_pdf( # noqa: D417
For arguments not explicitly documented here, see documentation for the
equivalent command line parameter.
This API is **experimental** and subject to change.
Args:
work_folder: Work folder path, as generated by :func:`_pdf_to_hocr`.
output_file: Output PDF file path.
Expand Down Expand Up @@ -543,8 +547,6 @@ def hocr_to_ocr_pdf( # noqa: D417
'get_parser',
'get_plugin_manager',
'ocr',
'pdf_to_hocr',
'run_pipeline',
'run_pipeline_cli',
'hocr_to_ocr_pdf',
]
9 changes: 5 additions & 4 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pdfminer.high_level import extract_text

import ocrmypdf
import ocrmypdf.api


def test_language_list():
Expand All @@ -29,7 +30,7 @@ def test_stream_api(resources: Path):


def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path):
ocrmypdf.pdf_to_hocr(
ocrmypdf.api._pdf_to_hocr(
resources / 'multipage.pdf',
outdir,
language='eng',
Expand All @@ -40,12 +41,12 @@ def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path):
assert (outdir / '000006_ocr_hocr.hocr').exists()
assert not (outdir / '000004_ocr_hocr.hocr').exists()

ocrmypdf.hocr_to_ocr_pdf(outdir, outpdf)
ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf)
assert outpdf.exists()


def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):
ocrmypdf.pdf_to_hocr(
ocrmypdf.api._pdf_to_hocr(
resources / 'ccitt.pdf',
outdir,
language='eng',
Expand All @@ -57,7 +58,7 @@ def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):
mangled = hocr.replace('the', 'hocr')
(outdir / '000001_ocr_hocr.hocr').write_text(mangled, encoding='utf-8')

ocrmypdf.hocr_to_ocr_pdf(outdir, outpdf, optimize=0)
ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf, optimize=0)

text = extract_text(outpdf)
assert 'hocr' in text and 'the' not in text

0 comments on commit 71166f7

Please sign in to comment.