Skip to content

Commit

Permalink
Merge branch 'abwiersma-fix_random_ordering_languages'
Browse files — browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Jun 20, 2023
2 parents 90619b3 + 5124daa commit d94d267
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 14 deletions.
15 changes: 9 additions & 6 deletions src/ocrmypdf/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,17 @@ def check_platform() -> None:
)


- def check_options_languages(options: Namespace, ocr_engine_languages: set[str]) -> None:
+ def check_options_languages(
+ options: Namespace, ocr_engine_languages: list[str]
+ ) -> None:
if not options.languages:
- options.languages = {DEFAULT_LANGUAGE}
+ options.languages = [DEFAULT_LANGUAGE]
system_lang = locale.getlocale()[0]
if system_lang and not system_lang.startswith('en'):
log.debug("No language specified; assuming --language %s", DEFAULT_LANGUAGE)
if not ocr_engine_languages:
return
- missing_languages = options.languages - ocr_engine_languages
+ missing_languages = set(options.languages) - set(ocr_engine_languages)
if missing_languages:
lang_text = '\n'.join(lang for lang in missing_languages)
msg = (
Expand All @@ -71,15 +73,16 @@ def check_options_languages(options: Namespace, ocr_engine_languages: set[str])
"See the online documentation for instructions:\n"
" https://ocrmypdf.readthedocs.io/en/latest/languages.html\n"
"\n"
- "Note: most languages are identified by a 3-digit ISO 639-2 Code.\n"
- "For example, English is 'eng', German is 'deu', and Spanish is 'spa'."
+ "Note: most languages are identified by a 3-letter ISO 639-2 Code.\n"
+ "For example, English is 'eng', German is 'deu', and Spanish is 'spa'.\n"
+ "Simplified Chinese is 'chi_sim' and Traditional Chinese is 'chi_tra'."
"\n"
)
raise MissingDependencyError(msg)


def check_options_output(options: Namespace) -> None:
- is_latin = options.languages.issubset(HOCR_OK_LANGS)
+ is_latin = set(options.languages).issubset(HOCR_OK_LANGS)

if options.pdf_renderer.startswith('hocr') and not is_latin:
log.warning(
Expand Down
6 changes: 3 additions & 3 deletions src/ocrmypdf/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,16 +84,16 @@ class LanguageSetAction(argparse.Action):
def __init__(self, option_strings, dest, default=None, **kwargs):
"""Initialize the action."""
if default is None:
- default = set()
+ default = list()
super().__init__(option_strings, dest, default=default, **kwargs)

def __call__(self, parser, namespace, values, option_string=None):
"""Add a language to the set."""
dest = getattr(namespace, self.dest)
if '+' in values:
- dest.update(lang for lang in values.split('+'))
+ [dest.append(lang) for lang in values.split('+')]
else:
- dest.add(values)
+ dest.append(values)


def get_parser():
Expand Down
10 changes: 5 additions & 5 deletions tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,17 +166,17 @@ def test_language_warning(caplog):
with patch(
'ocrmypdf._validation.locale.getlocale', return_value=('en_US', 'UTF-8')
) as mock:
- vd.check_options_languages(opts, {'eng'})
- assert opts.languages == {'eng'}
+ vd.check_options_languages(opts, ['eng'])
+ assert opts.languages == ['eng']
assert '' in caplog.text
mock.assert_called_once()

opts = make_opts(language=None)
with patch(
'ocrmypdf._validation.locale.getlocale', return_value=('fr_FR', 'UTF-8')
) as mock:
- vd.check_options_languages(opts, {'eng'})
- assert opts.languages == {'eng'}
+ vd.check_options_languages(opts, ['eng'])
+ assert opts.languages == ['eng']
assert 'assuming --language' in caplog.text
mock.assert_called_once()

Expand Down Expand Up @@ -282,7 +282,7 @@ def test_two_languages():
parser=get_parser(),
language='fakelang1+fakelang2',
),
- {'fakelang1', 'fakelang2'},
+ ['fakelang1', 'fakelang2'],
)


Expand Down

0 comments on commit d94d267

Please sign in to comment.