From 8ccb5cd60d0d5d222e5c96baf8cc7e55052846ed Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 4 Mar 2025 11:34:19 -0600 Subject: [PATCH 1/2] feat: allow extra config with multiple output command --- tests/pytesseract_test.py | 18 ++++++++++++++++++ unstructured_pytesseract/__init__.py | 2 +- unstructured_pytesseract/pytesseract.py | 5 +++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/tests/pytesseract_test.py b/tests/pytesseract_test.py index 01d95e63..9d75654b 100644 --- a/tests/pytesseract_test.py +++ b/tests/pytesseract_test.py @@ -267,6 +267,24 @@ def test_run_and_get_multiple_output(test_file, function_mapping, extensions): assert result == function_mapping[extension](test_file) +def test_run_and_get_multiple_output_with_extra_config( + test_file, + function_mapping, +): + compound_results = run_and_get_multiple_output( + test_file, + extensions=['hocr', 'txt'], + extra_config='hocr_char_boxes=1', + ) + assert ( + compound_results[0][:1000] + == function_mapping['hocr'](test_file, config='-c hocr_char_boxes=1')[ + :1000 + ] + ) + assert compound_results[1] == function_mapping['txt'](test_file) + + @pytest.mark.skipif( TESSERACT_VERSION[:2] < (4, 1), reason='requires tesseract >= 4.1', diff --git a/unstructured_pytesseract/__init__.py b/unstructured_pytesseract/__init__.py index 40164fa9..93a5a79e 100644 --- a/unstructured_pytesseract/__init__.py +++ b/unstructured_pytesseract/__init__.py @@ -16,4 +16,4 @@ from .pytesseract import TSVNotSupported -__version__ = '0.3.13' +__version__ = '0.3.14' diff --git a/unstructured_pytesseract/pytesseract.py b/unstructured_pytesseract/pytesseract.py index 37837f4c..4c7010a9 100644 --- a/unstructured_pytesseract/pytesseract.py +++ b/unstructured_pytesseract/pytesseract.py @@ -297,15 +297,16 @@ def run_and_get_multiple_output( lang: Optional[str] = None, nice: int = 0, timeout: int = 0, + extra_config: str = '', return_bytes: bool = False, ): config = ' '.join( EXTENTION_TO_CONFIG.get(extension, '') for extension in extensions ).strip() if config: - config = f'-c {config}' + config = f'-c {config} {extra_config}' else: - config = '' + config = extra_config with save(image) as (temp_name, input_filename): kwargs = { From 3c657aaf10dbe431459fbf9e52d2bf20f4226888 Mon Sep 17 00:00:00 2001 From: Yao You Date: Tue, 4 Mar 2025 12:45:09 -0600 Subject: [PATCH 2/2] upgrade runner to 22.04 --- .github/workflows/ci.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 293ed228..a265aab7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -19,11 +19,11 @@ jobs: fail-fast: true matrix: include: - - {name: '3.12', python: '3.12', os: ubuntu-20.04, tox: py312} - - {name: '3.11', python: '3.11', os: ubuntu-20.04, tox: py311} - - {name: '3.10', python: '3.10', os: ubuntu-20.04, tox: py310} - - {name: '3.9', python: '3.9', os: ubuntu-20.04, tox: py39} - - {name: '3.8', python: '3.8', os: ubuntu-20.04, tox: py38} + - {name: '3.12', python: '3.12', os: ubuntu-22.04, tox: py312} + - {name: '3.11', python: '3.11', os: ubuntu-22.04, tox: py311} + - {name: '3.10', python: '3.10', os: ubuntu-22.04, tox: py310} + - {name: '3.9', python: '3.9', os: ubuntu-22.04, tox: py39} + - {name: '3.8', python: '3.8', os: ubuntu-22.04, tox: py38} steps: - uses: actions/checkout@v3