Fix issue #137 - proportions of non-square resolution distorted

Distortion mainly affected --force-ocr
ocrmypdf · Feb 27, 2017 · 7cd2770 · 7cd2770
1 parent 7b94129
commit 7cd2770
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 5 deletions.
diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst
@@ -4,6 +4,12 @@ RELEASE NOTES
 OCRmyPDF uses `semantic versioning <http://semver.org/>`_.
 
 
+v4.5.1
+======
+
+-  Fix issue #137: proportions of images with a non-square pixel aspect ratio were distorted in output for ``--force-ocr`` and some other combinations of flags
+
+
 v4.5
 ====
 

diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py
@@ -555,16 +555,21 @@ def select_image_layer(
         re_symlink(page_pdf, output_file, log)
     else:
         pageinfo = get_pageinfo(image, context)
-        dpi = get_page_dpi(pageinfo, options)
-        dpi = float(dpi[0]), float(dpi[1])
-        layout_fun = img2pdf.get_fixed_dpi_layout_fun(dpi)
+
+        # We rasterize a square DPI version of each page because most image
+        # processing tools don't support rectangular DPI. Use the square DPI
+        # as it accurately describes the image. It would be possible to
+        # resample the image at this stage back to non-square DPI to more
+        # closely resemble the input, except that the hocr renderer does not
+        # understand non-square DPI. The tess4 renderer would be fine.
+        dpi = get_page_square_dpi(pageinfo, options)
+        layout_fun = img2pdf.get_fixed_dpi_layout_fun((dpi, dpi))
 
         with open(image, 'rb') as imfile, \
                 open(output_file, 'wb') as pdf:
-            rawdata = imfile.read()
             log.debug('{:4d}: convert'.format(page_number(page_pdf)))
             img2pdf.convert(
-                rawdata, with_pdfrw=False,
+                imfile, with_pdfrw=False,
                 layout_fun=layout_fun, outputstream=pdf)
             log.debug('{:4d}: convert done'.format(page_number(page_pdf)))
 

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -564,6 +564,46 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache,
     assert in_pageinfo[0]['yres'] == out_pageinfo[0]['yres']
 
 
+@pytest.mark.skipif(sys.version_info < (3, 5), reason="needs math.isclose")
+@pytest.mark.parametrize('renderer', [
+    'hocr',
+    'tesseract'
+    ])
+def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
+                                      resources, outpdf):
+    from math import isclose
+
+    # Confirm input image is non-square resolution
+    in_pageinfo = pdf_get_all_pageinfo(str(resources / 'aspect.pdf'))
+    assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
+
+    # --force-ocr means forced conversion to square resolution
+    check_ocrmypdf(
+        resources / 'aspect.pdf', outpdf,
+        '--force-ocr',
+        '--pdf-renderer', renderer, env=spoof_tesseract_cache)
+
+    out_pageinfo = pdf_get_all_pageinfo(str(outpdf))
+
+    in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0]
+
+    # Resolution should now be equal
+    assert out_p0['xres'] == out_p0['yres']
+
+    # Page size should match input page size
+    assert isclose(in_p0['width_inches'],
+                   out_p0['width_inches'])
+    assert isclose(in_p0['height_inches'],
+                   out_p0['height_inches'])
+
+    # Because we rasterized the page to produce a new image, it should occupy
+    # the entire page
+    out_im_w = out_p0['images'][0]['width'] / out_p0['images'][0]['dpi_w']
+    out_im_h = out_p0['images'][0]['height'] / out_p0['images'][0]['dpi_h']
+    assert isclose(out_p0['width_inches'], out_im_w)
+    assert isclose(out_p0['height_inches'], out_im_h)
+
+
 def test_image_to_pdf(spoof_tesseract_noop, resources, outpdf):
     check_ocrmypdf(
         resources / 'LinnSequencer.jpg', outpdf, '--image-dpi', '200',