Reject high Unicode metadata at command line

Ghostscript 9.21 does not seem to accept Unicode characters above U+FFFF. Previous versions did, but 9.21 exits with a rangecheck error (-15). Reject such characters in metadata given on the command line for now. A complete fix would also need to check the input PDF’s metadata.
ocrmypdf · Mar 28, 2017 · 88ef271 · 88ef271
1 parent e71e8ca
commit 88ef271
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 1 deletion.
diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py
@@ -53,7 +53,6 @@ def complain(message):
             MINIMUM_TESS_VERSION, tesseract.version()))
     sys.exit(ExitCode.missing_dependency)
 
-
 # -------------
 # Parser
 
@@ -344,13 +343,31 @@ def check_options_advanced(options, log):
             "commit 3d9fb3b or later")
 
 
+def check_options_metadata(options, log):
+    """Raise ValueError for private-use or above-U+FFFF metadata chars."""
+    import unicodedata
+    fields = (options.title, options.author, options.keywords, options.subject)
+    # Empty and unset (None) fields are skipped rather than scanned
+    for ch in (ch for text in fields if text for ch in text):
+        if ord(ch) > 0xFFFF or unicodedata.category(ch) == 'Co':
+            codepoint = format(ord(ch), 'X')
+            raise ValueError(
+                "One of the metadata strings contains "
+                "an unsupported Unicode character: '{}' (U+{})".format(
+                    ch, codepoint))
+
+
 def check_options(options, log):
     try:
         check_options_languages(options, log)
+        check_options_metadata(options, log)
         check_options_output(options, log)
         check_options_preprocessing(options, log)
         check_options_ocr_behavior(options, log)
         check_options_advanced(options, log)
+    except ValueError as e:
+        log.error(e)
+        sys.exit(ExitCode.bad_args)
     except argparse.ArgumentError as e:
         log.error(e)
         sys.exit(ExitCode.bad_args)

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -195,6 +195,22 @@ def test_override_metadata(spoof_tesseract_noop, output_type, resources,
     assert pdfa_info['output'] == output_type
 
 
+def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
+    """A --subject containing U+1030C must exit with ExitCode.bad_args.
+
+    Ghostscript doesn't support high Unicode, so neither do we, to be safe.
+    """
+    source_pdf = resources / 'c02-22.pdf'
+    subject = 'U+1030C is: 𐌌'
+    cli_args = [
+        source_pdf, no_outpdf,
+        '--subject', subject,
+        '--output-type', 'pdfa',
+    ]
+    proc, _stdout, stderr = run_ocrmypdf(*cli_args, env=spoof_tesseract_noop)
+    assert proc.returncode == ExitCode.bad_args, stderr
+
+
 @pytest.mark.parametrize('renderer', [
     'hocr',
     'tesseract',