Skip to content

Commit

Permalink
Reject high Unicode metadata at command line
Browse files Browse the repository at this point in the history
Ghostscript 9.21 does not seem to accept Unicode above U+FFFF. Previous
versions did, but it now exits with a rangecheck error (-15).

For now, reject such characters when they are supplied on the command line.
A complete fix would also need to check the input PDF’s own metadata.
  • Loading branch information
James R. Barlow committed Mar 28, 2017
1 parent e71e8ca commit 88ef271
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 1 deletion.
19 changes: 18 additions & 1 deletion ocrmypdf/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def complain(message):
MINIMUM_TESS_VERSION, tesseract.version()))
sys.exit(ExitCode.missing_dependency)


# -------------
# Parser

Expand Down Expand Up @@ -344,13 +343,31 @@ def check_options_advanced(options, log):
"commit 3d9fb3b or later")


def check_options_metadata(options, log):
    """Validate the metadata strings supplied through the options.

    The title, author, keywords and subject options are scanned character
    by character. A character is rejected when its code point is U+10000
    or above, or when its Unicode general category is 'Co' (private use).
    Options that are unset or empty are skipped.

    :param options: object exposing ``title``, ``author``, ``keywords`` and
        ``subject`` attributes
    :param log: accepted for parity with the other option checkers; it is
        not used here
    :raises ValueError: on the first unsupported character found
    """
    import unicodedata

    def is_unsupported(ch):
        # Anything outside the Basic Multilingual Plane, or any private-use
        # character, is treated as unsupported.
        return ord(ch) >= 0x10000 or unicodedata.category(ch) == 'Co'

    candidates = (options.title, options.author, options.keywords,
                  options.subject)
    for text in candidates:
        if not text:
            continue
        for ch in text:
            if is_unsupported(ch):
                codepoint = '{:X}'.format(ord(ch))
                raise ValueError(
                    "One of the metadata strings contains "
                    "an unsupported Unicode character: '{}' (U+{})".format(
                        ch, codepoint))


def check_options(options, log):
    """Run every option checker in order; exit on the first failure.

    Each checker is called with ``(options, log)``. If one raises
    ``ValueError`` or ``argparse.ArgumentError``, the exception is logged
    as an error and the process exits with ``ExitCode.bad_args``.

    :param options: the options object handed to each checker
    :param log: logger handed to each checker and used to report the failure
    """
    validators = (
        check_options_languages,
        check_options_metadata,
        check_options_output,
        check_options_preprocessing,
        check_options_ocr_behavior,
        check_options_advanced,
    )
    try:
        for validate in validators:
            validate(options, log)
    except (ValueError, argparse.ArgumentError) as problem:
        # Both exception types get the same treatment: report and exit.
        log.error(problem)
        sys.exit(ExitCode.bad_args)
Expand Down
16 changes: 16 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,22 @@ def test_override_metadata(spoof_tesseract_noop, output_type, resources,
assert pdfa_info['output'] == output_type


def test_high_unicode(spoof_tesseract_noop, resources, no_outpdf):
    """A subject containing a character above U+FFFF must be rejected."""
    # Ghostscript doesn't support high Unicode, so to be safe the command
    # line is expected to refuse it with a bad-arguments exit code.
    subject_text = 'U+1030C is: 𐌌'
    source_pdf = resources / 'c02-22.pdf'

    proc, _stdout, stderr = run_ocrmypdf(
        source_pdf, no_outpdf,
        '--subject', subject_text,
        '--output-type', 'pdfa',
        env=spoof_tesseract_noop)

    assert proc.returncode == ExitCode.bad_args, stderr


@pytest.mark.parametrize('renderer', [
'hocr',
'tesseract',
Expand Down

0 comments on commit 88ef271

Please sign in to comment.