Skip to content

Commit

Permalink
Fix issue #147: unpaper loses DPI information, affects --pdf-renderer …
Browse files Browse the repository at this point in the history
…tess4
  • Loading branch information
James R. Barlow committed Mar 24, 2017
1 parent a3e26e0 commit 8ddbe81
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 7 deletions.
8 changes: 8 additions & 0 deletions RELEASE_NOTES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ RELEASE NOTES
OCRmyPDF uses `semantic versioning <http://semver.org/>`_.


v4.5.2
======

- Fix issue #147, unpaper loses DPI information, which affects PDF rendering with ``--pdf-renderer tess4``
- Make "using Tesseract 4.0" warning less ominous
- Set up machinery for homebrew OCRmyPDF tap


v4.5.1
======

Expand Down
3 changes: 2 additions & 1 deletion ocrmypdf/exec/unpaper.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ def run(input_file, output_file, dpi, log, mode_args):
raise e from e
else:
log.debug(stdout)
Image.open(output_pnm.name).save(output_file)
# unpaper sets dpi to 72
Image.open(output_pnm.name).save(output_file, dpi=(dpi, dpi))


def deskew(input_file, output_file, dpi, log):
Expand Down
10 changes: 9 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def check_ocrmypdf(input_file, output_file, *args, env=None):

p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
#print(err) # ensure py.test collects the output, use -s to view
assert p.returncode == 0
assert p.returncode == 0, "<stderr>\n" + err + "\n</stderr>"
assert os.path.exists(str(output_file)), "Output file not created"
assert os.stat(str(output_file)).st_size > 100, "PDF too small or empty"
assert out == "", \
Expand All @@ -125,3 +125,11 @@ def run_ocrmypdf(input_file, output_file, *args, env=None):
#print(err)

return p, out, err


@pytest.helpers.register
def first_page_dimensions(pdf):
    """Return the first page's ``width_inches`` and ``height_inches`` as a tuple.

    ``pdf`` is passed through ``str()`` before being handed to
    ``pageinfo.pdf_get_all_pageinfo``; only the first entry of the result
    is consulted.
    """
    # Imported inside the helper, exactly as the original does.
    from ocrmypdf import pageinfo

    first_page = pageinfo.pdf_get_all_pageinfo(str(pdf))[0]
    return first_page['width_inches'], first_page['height_inches']
25 changes: 25 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -812,3 +812,28 @@ def test_form_xobject(spoof_tesseract_noop, resources, outpdf):
check_ocrmypdf(resources / 'formxobject.pdf', outpdf,
'--force-ocr',
env=spoof_tesseract_noop)


@pytest.mark.skipif(sys.version_info < (3, 5), reason="needs math.isclose")
@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_pagesize_consistency(renderer, resources, outpdf):
    """Check that the first page keeps its dimensions after processing.

    Runs ``check_ocrmypdf`` on ``linn.pdf`` with the parametrized
    ``--pdf-renderer`` plus ``--clean``, ``--deskew``,
    ``--remove-background`` and ``--clean-final``, then compares the first
    page's width and height before and after with ``math.isclose`` (default
    tolerances).
    """
    from math import isclose

    page_size = pytest.helpers.first_page_dimensions
    source_pdf = resources / 'linn.pdf'

    # Measure the input before ocrmypdf produces the output file.
    width_before, height_before = page_size(source_pdf)

    image_processing_flags = (
        '--clean', '--deskew', '--remove-background', '--clean-final')
    check_ocrmypdf(
        source_pdf, outpdf,
        '--pdf-renderer', renderer,
        *image_processing_flags)

    width_after, height_after = page_size(outpdf)

    assert isclose(width_before, width_after)
    assert isclose(height_before, height_after)
31 changes: 26 additions & 5 deletions tests/test_tess4.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,40 @@
import pytest
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.exec import tesseract
from ocrmypdf import pageinfo
import sys


# Skip all tests in this file if not tesseract 4
pytestmark = pytest.mark.skipif(not tesseract.v4(),
reason="tesseract 4.0 required")
pytestmark = pytest.mark.skipif(
not (tesseract.v4() and tesseract.has_textonly_pdf()),
reason="tesseract 4.0 with textonly_pdf feature required")

check_ocrmypdf = pytest.helpers.check_ocrmypdf
run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof


@pytest.mark.skipif(not tesseract.has_textonly_pdf(),
reason="requires textonly_pdf feature")
def test_textonly_pdf(resources, outdir):
pytest.helpers.check_ocrmypdf(
check_ocrmypdf(
resources / 'linn.pdf',
outdir / 'linn_textonly.pdf', '--pdf-renderer', 'tess4')


@pytest.mark.skipif(sys.version_info < (3, 5), reason="needs math.isclose")
def test_pagesize_consistency_tess4(resources, outpdf):
    """Check that the ``tess4`` renderer keeps the first page's dimensions.

    Runs ``check_ocrmypdf`` on ``linn.pdf`` with ``--pdf-renderer tess4`` and
    ``--clean``, ``--deskew``, ``--remove-background``, ``--clean-final``,
    then compares first-page width and height before and after using
    ``math.isclose`` (default tolerances).
    """
    from math import isclose

    measure = pytest.helpers.first_page_dimensions
    source_pdf = resources / 'linn.pdf'

    # Dimensions of the untouched input.
    expected_width, expected_height = measure(source_pdf)

    ocrmypdf_args = [
        '--pdf-renderer', 'tess4',
        '--clean', '--deskew', '--remove-background', '--clean-final',
    ]
    check_ocrmypdf(source_pdf, outpdf, *ocrmypdf_args)

    actual_width, actual_height = measure(outpdf)

    assert isclose(expected_width, actual_width)
    assert isclose(expected_height, actual_height)

0 comments on commit 8ddbe81

Please sign in to comment.