diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bce9342..f43c7976 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.10 + +* Removed control characters from tesseract output + ## 0.2.9 * Removed multithreading from OCR (DocumentLayout.get_elements_from_layout) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 4aabafda..e1cd08d7 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -327,3 +327,10 @@ def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method) def test_invalid_ocr_strategy_raises(mock_image): with pytest.raises(ValueError): layout.PageLayout(0, mock_image, MockLayout(), ocr_strategy="fake_strategy") + + +@pytest.mark.parametrize( + ("text", "expected"), [("a\ts\x0cd\nfas\fd\rf\b", "asdfasdf"), ("\"'\\", "\"'\\")] +) +def test_remove_control_characters(text, expected):  # strips control chars, keeps quotes/backslash + assert layout.remove_control_characters(text) == expected diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 6385d31e..47aedffe 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.9" # pragma: no cover +__version__ = "0.2.10" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index ec4ce016..ba13eff9 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -5,7 +5,7 @@ import tempfile from tqdm import tqdm from typing import List, Optional, Tuple, Union, BinaryIO - +import unicodedata from layoutparser.io.pdf import load_pdf from layoutparser.elements.layout_elements import TextBlock from layoutparser.elements.layout import Layout @@ -318,6 +318,7 @@ def interpret_text_block( out_text = ocr(text_block, image) else: out_text = "" if text_block.text is None else 
text_block.text + out_text = remove_control_characters(out_text) return out_text @@ -329,3 +330,9 @@ def ocr(text_block: TextBlock, image: Image.Image) -> str: padded_block = text_block.pad(left=5, right=5, top=5, bottom=5) cropped_image = padded_block.crop_image(image_array) return tesseract.ocr_agent.detect(cropped_image) + + +def remove_control_characters(text: str) -> str: + """Removes all characters whose Unicode category starts with 'C' (e.g. tab, newline).""" + out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C") + return out_text