Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.2.10

* Removed control characters from tesseract output

## 0.2.9

* Removed multithreading from OCR (DocumentLayout.get_elements_from_layout)
Expand Down
7 changes: 7 additions & 0 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,10 @@ def test_from_file_fixed_layout(fixed_layouts, called_method, not_called_method)
def test_invalid_ocr_strategy_raises(mock_image):
    """Constructing a PageLayout with an unknown ``ocr_strategy`` raises ValueError."""
    # "fake_strategy" is deliberately not a supported strategy name.
    with pytest.raises(ValueError):
        layout.PageLayout(0, mock_image, MockLayout(), ocr_strategy="fake_strategy")


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Tab, form feed (written both as \x0c and \f), newline, carriage
        # return and backspace are all control characters and get stripped.
        ("a\ts\x0cd\nfas\fd\rf\b", "asdfasdf"),
        # Quotes and a backslash are ordinary punctuation and must survive.
        ("\"'\\", "\"'\\"),
    ],
)
def test_remove_control_characters(text, expected):
    """remove_control_characters drops control characters and keeps everything else."""
    assert layout.remove_control_characters(text) == expected
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.9" # pragma: no cover
__version__ = "0.2.10" # pragma: no cover
9 changes: 8 additions & 1 deletion unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import tempfile
from tqdm import tqdm
from typing import List, Optional, Tuple, Union, BinaryIO

import unicodedata
from layoutparser.io.pdf import load_pdf
from layoutparser.elements.layout_elements import TextBlock
from layoutparser.elements.layout import Layout
Expand Down Expand Up @@ -318,6 +318,7 @@ def interpret_text_block(
out_text = ocr(text_block, image)
else:
out_text = "" if text_block.text is None else text_block.text
out_text = remove_control_characters(out_text)
return out_text


Expand All @@ -329,3 +330,9 @@ def ocr(text_block: TextBlock, image: Image.Image) -> str:
padded_block = text_block.pad(left=5, right=5, top=5, bottom=5)
cropped_image = padded_block.crop_image(image_array)
return tesseract.ocr_agent.detect(cropped_image)


def remove_control_characters(text: str) -> str:
    """Return *text* with every Unicode "Other" (``C*``) character removed.

    A character is dropped when its ``unicodedata.category`` starts with
    ``"C"``; that covers control (``Cc``), format (``Cf``), surrogate
    (``Cs``), private-use (``Co``) and unassigned (``Cn``) code points.
    Note that tab, newline and carriage return are ``Cc`` and are therefore
    removed outright -- they are not replaced by a space.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the remaining characters in their original
        order. An empty input yields an empty string.
    """
    return "".join(
        char for char in text if not unicodedata.category(char).startswith("C")
    )