From d78e0f5b55b0b620dc27554fce374fc718950b06 Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Mon, 6 Mar 2023 16:48:19 -0600 Subject: [PATCH 1/4] fix: removing control characters Tesseract is putting some control characters in out_text; this commit just deletes all of them --- unstructured_inference/inference/layout.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index ec4ce016..26972688 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -5,7 +5,7 @@ import tempfile from tqdm import tqdm from typing import List, Optional, Tuple, Union, BinaryIO - +import unicodedata from layoutparser.io.pdf import load_pdf from layoutparser.elements.layout_elements import TextBlock from layoutparser.elements.layout import Layout @@ -316,6 +316,7 @@ def interpret_text_block( ocr_strategy == "auto" and ((text_block.text is None) or cid_ratio(text_block.text) > 0.5) ): out_text = ocr(text_block, image) + out_text = ''.join(c for c in out_text if unicodedata.category(c)[0] != 'C') else: out_text = "" if text_block.text is None else text_block.text return out_text From b0e9e450f01ab00e631683b7e7596ac3ebfdc28c Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Mon, 6 Mar 2023 16:52:37 -0600 Subject: [PATCH 2/4] Style correction --- unstructured_inference/inference/layout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 26972688..de615890 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -316,7 +316,7 @@ def interpret_text_block( ocr_strategy == "auto" and ((text_block.text is None) or cid_ratio(text_block.text) > 0.5) ): out_text = ocr(text_block, image) - out_text = ''.join(c for c in out_text if unicodedata.category(c)[0] != 'C') + out_text = 
"".join(c for c in out_text if unicodedata.category(c)[0] != "C") else: out_text = "" if text_block.text is None else text_block.text return out_text From 76cde53466f4803d57bf6de804b4f3441824d106 Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Mon, 6 Mar 2023 16:52:51 -0600 Subject: [PATCH 3/4] Version sync --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bce9342..f43c7976 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.10 + +* Removed control characters from tesseract output + ## 0.2.9 * Removed multithreading from OCR (DocumentLayout.get_elements_from_layout) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 6385d31e..47aedffe 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.9" # pragma: no cover +__version__ = "0.2.10" # pragma: no cover From a1297250b56b8cfbcabf67522b69387ea558bbb3 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Tue, 7 Mar 2023 12:55:42 -0600 Subject: [PATCH 4/4] Add tests --- test_unstructured_inference/inference/test_layout.py | 7 +++++++ unstructured_inference/inference/layout.py | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 4aabafda..e1cd08d7 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -327,3 +327,10 @@ def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method) def test_invalid_ocr_strategy_raises(mock_image): with pytest.raises(ValueError): layout.PageLayout(0, mock_image, MockLayout(), ocr_strategy="fake_strategy") + + +@pytest.mark.parametrize( + ("text", "expected"), [("a\ts\x0cd\nfas\fd\rf\b", "asdfasdf"), ("\"'\\", "\"'\\")] 
+) +def test_remove_control_characters(text, expected): + assert layout.remove_control_characters(text) == expected diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index de615890..ba13eff9 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -316,9 +316,9 @@ def interpret_text_block( ocr_strategy == "auto" and ((text_block.text is None) or cid_ratio(text_block.text) > 0.5) ): out_text = ocr(text_block, image) - out_text = "".join(c for c in out_text if unicodedata.category(c)[0] != "C") else: out_text = "" if text_block.text is None else text_block.text + out_text = remove_control_characters(out_text) return out_text @@ -330,3 +330,9 @@ def ocr(text_block: TextBlock, image: Image.Image) -> str: padded_block = text_block.pad(left=5, right=5, top=5, bottom=5) cropped_image = padded_block.crop_image(image_array) return tesseract.ocr_agent.detect(cropped_image) + + +def remove_control_characters(text: str) -> str: + """Removes control characters from text.""" + out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C") + return out_text