diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 293ed228..a265aab7 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -19,11 +19,11 @@ jobs:
       fail-fast: true
       matrix:
         include:
-          - {name: '3.12', python: '3.12', os: ubuntu-20.04, tox: py312}
-          - {name: '3.11', python: '3.11', os: ubuntu-20.04, tox: py311}
-          - {name: '3.10', python: '3.10', os: ubuntu-20.04, tox: py310}
-          - {name: '3.9', python: '3.9', os: ubuntu-20.04, tox: py39}
-          - {name: '3.8', python: '3.8', os: ubuntu-20.04, tox: py38}
+          - {name: '3.12', python: '3.12', os: ubuntu-22.04, tox: py312}
+          - {name: '3.11', python: '3.11', os: ubuntu-22.04, tox: py311}
+          - {name: '3.10', python: '3.10', os: ubuntu-22.04, tox: py310}
+          - {name: '3.9', python: '3.9', os: ubuntu-22.04, tox: py39}
+          - {name: '3.8', python: '3.8', os: ubuntu-22.04, tox: py38}
 
     steps:
       - uses: actions/checkout@v3
diff --git a/tests/pytesseract_test.py b/tests/pytesseract_test.py
index 01d95e63..9d75654b 100644
--- a/tests/pytesseract_test.py
+++ b/tests/pytesseract_test.py
@@ -267,6 +267,33 @@ def test_run_and_get_multiple_output(test_file, function_mapping, extensions):
             assert result == function_mapping[extension](test_file)
 
 
+def test_run_and_get_multiple_output_with_extra_config(
+    test_file,
+    function_mapping,
+):
+    """Compare a multi-output run using ``extra_config`` with single runs.
+
+    The hOCR result is checked against ``function_mapping['hocr']`` called
+    with ``config='-c hocr_char_boxes=1'``; the text result is checked
+    against ``function_mapping['txt']`` called without any config.
+    """
+    outputs = run_and_get_multiple_output(
+        test_file,
+        extensions=['hocr', 'txt'],
+        extra_config='hocr_char_boxes=1',
+    )
+
+    # Only the first 1000 characters of the hOCR documents are compared.
+    actual_hocr_head = outputs[0][:1000]
+    expected_hocr = function_mapping['hocr'](
+        test_file,
+        config='-c hocr_char_boxes=1',
+    )
+    assert actual_hocr_head == expected_hocr[:1000]
+
+    assert outputs[1] == function_mapping['txt'](test_file)
+
+
 @pytest.mark.skipif(
     TESSERACT_VERSION[:2] < (4, 1),
     reason='requires tesseract >= 4.1',
diff --git a/unstructured_pytesseract/__init__.py b/unstructured_pytesseract/__init__.py
index 40164fa9..93a5a79e 100644
--- a/unstructured_pytesseract/__init__.py
+++ b/unstructured_pytesseract/__init__.py
@@ -16,4 +16,4 @@ from .pytesseract import TSVNotSupported
-__version__ = '0.3.13'
+__version__ = '0.3.14'
diff --git a/unstructured_pytesseract/pytesseract.py b/unstructured_pytesseract/pytesseract.py
index 37837f4c..4c7010a9 100644
--- a/unstructured_pytesseract/pytesseract.py
+++ b/unstructured_pytesseract/pytesseract.py
@@ -297,15 +297,19 @@ def run_and_get_multiple_output(
     lang: Optional[str] = None,
     nice: int = 0,
     timeout: int = 0,
+    extra_config: str = '',
     return_bytes: bool = False,
 ):
     config = ' '.join(
         EXTENTION_TO_CONFIG.get(extension, '') for extension in extensions
     ).strip()
+    # The accompanying test passes ``extra_config`` as a bare ``name=value``
+    # string and expects it to act like ``-c name=value``.  Merge it with the
+    # per-extension settings before adding the flag, so ``-c`` is emitted
+    # whenever either part is non-empty and no stray whitespace is left over.
+    config = f'{config} {extra_config}'.strip()
     if config:
         config = f'-c {config}'
-    else:
-        config = ''
 
     with save(image) as (temp_name, input_filename):
         kwargs = {