Skip to content

Commit

Permalink
Fix issue #137 - proportions of non-square resolution distorted
Browse files Browse the repository at this point in the history
Distortion mainly affected --force-ocr
  • Loading branch information
James R. Barlow committed Feb 27, 2017
1 parent 7b94129 commit 7cd2770
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 5 deletions.
6 changes: 6 additions & 0 deletions RELEASE_NOTES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ RELEASE NOTES
OCRmyPDF uses `semantic versioning <http://semver.org/>`_.


v4.5.1
======

- Fix issue #137: proportions of images with a non-square pixel aspect ratio were distorted in output for ``--force-ocr`` and some other combinations of flags


v4.5
====

Expand Down
15 changes: 10 additions & 5 deletions ocrmypdf/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,16 +555,21 @@ def select_image_layer(
re_symlink(page_pdf, output_file, log)
else:
pageinfo = get_pageinfo(image, context)
dpi = get_page_dpi(pageinfo, options)
dpi = float(dpi[0]), float(dpi[1])
layout_fun = img2pdf.get_fixed_dpi_layout_fun(dpi)

# We rasterize a square DPI version of each page because most image
# processing tools don't support rectangular DPI. Use the square DPI
# as it accurately describes the image. It would be possible to
# resample the image at this stage back to non-square DPI to more
# closely resemble the input, except that the hocr renderer does not
# understand non-square DPI. The tess4 renderer would be fine.
dpi = get_page_square_dpi(pageinfo, options)
layout_fun = img2pdf.get_fixed_dpi_layout_fun((dpi, dpi))

with open(image, 'rb') as imfile, \
open(output_file, 'wb') as pdf:
rawdata = imfile.read()
log.debug('{:4d}: convert'.format(page_number(page_pdf)))
img2pdf.convert(
rawdata, with_pdfrw=False,
imfile, with_pdfrw=False,
layout_fun=layout_fun, outputstream=pdf)
log.debug('{:4d}: convert done'.format(page_number(page_pdf)))

Expand Down
40 changes: 40 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,46 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache,
assert in_pageinfo[0]['yres'] == out_pageinfo[0]['yres']


@pytest.mark.skipif(sys.version_info < (3, 5), reason="needs math.isclose")
@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
                                      resources, outpdf):
    """Check that --force-ocr turns a non-square resolution page square.

    Runs once per PDF renderer. The output page must report equal x and y
    resolution, keep the physical page size of the input, and contain an
    image whose physical size equals that of the page.
    """
    # Imported locally: math.isclose does not exist before Python 3.5, and
    # the skipif marker above keeps this body from running there.
    from math import isclose

    source_pdf = resources / 'aspect.pdf'

    # Confirm the input image really has a non-square resolution
    in_p0 = pdf_get_all_pageinfo(str(source_pdf))[0]
    assert in_p0['xres'] != in_p0['yres']

    # --force-ocr means forced conversion to square resolution
    check_ocrmypdf(
        source_pdf, outpdf,
        '--force-ocr',
        '--pdf-renderer', renderer, env=spoof_tesseract_cache)

    out_p0 = pdf_get_all_pageinfo(str(outpdf))[0]

    # Resolution should now be equal on both axes
    assert out_p0['xres'] == out_p0['yres']

    # Page size should match input page size
    for dimension in ('width_inches', 'height_inches'):
        assert isclose(in_p0[dimension], out_p0[dimension])

    # Because we rasterized the page to produce a new image, that image
    # should occupy the entire page: pixels / dpi gives its size in inches.
    first_image = out_p0['images'][0]
    image_width_inches = first_image['width'] / first_image['dpi_w']
    image_height_inches = first_image['height'] / first_image['dpi_h']
    assert isclose(out_p0['width_inches'], image_width_inches)
    assert isclose(out_p0['height_inches'], image_height_inches)


def test_image_to_pdf(spoof_tesseract_noop, resources, outpdf):
check_ocrmypdf(
resources / 'LinnSequencer.jpg', outpdf, '--image-dpi', '200',
Expand Down

0 comments on commit 7cd2770

Please sign in to comment.