From 4540ed4f493be7205f34afc5c22185298251d2f2 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 6 Jan 2023 16:16:17 -0600 Subject: [PATCH 1/8] ocr when cid ratio is too high --- unstructured_inference/inference/layout.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index f3e37c85..965c0752 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -1,5 +1,6 @@ from __future__ import annotations from dataclasses import dataclass +import re import tempfile from typing import List, Optional, Tuple, Union, BinaryIO @@ -111,7 +112,7 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]: for text_block in text_blocks: # NOTE(robinson) - If the text attribute is None, that means the PDF isn't # already OCR'd and we have to send the snippet out for OCRing. - if text_block.text is None: + if (text_block.text is None) or cid_ratio(text_block.text) > 0.5: text_block.text = self.ocr(text_block) text = " ".join([x for x in text_blocks.get_texts() if x]) @@ -156,3 +157,11 @@ def process_file_with_model(filename: str, model_name: str) -> DocumentLayout: model = None if model_name is None else get_model(model_name) layout = DocumentLayout.from_file(filename, model=model) return layout + + +def cid_ratio(text: str) -> float: + """Gets ratio of unknown 'cid' characters extracted from text to all characters.""" + cid_pattern = r"\(cid\:(\d+)\)" + unmatched, n_cid = re.subn(cid_pattern, "", text) + total = n_cid + len(unmatched) + return n_cid / total if total > 0 else 1.0 From 608b7fb402abbef204b357f8ee7e59e203c0ef97 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Sat, 7 Jan 2023 23:22:29 -0600 Subject: [PATCH 2/8] Separate out interpretation of text blocks --- unstructured_inference/inference/layout.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index f3e37c85..89dd8385 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -109,10 +109,7 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]: text_blocks = self.layout.filter_by(item, center=True) text = str() for text_block in text_blocks: - # NOTE(robinson) - If the text attribute is None, that means the PDF isn't - # already OCR'd and we have to send the snippet out for OCRing. - if text_block.text is None: - text_block.text = self.ocr(text_block) + text_block.text = self.interpret_text_block(text_block) text = " ".join([x for x in text_blocks.get_texts() if x]) elements.append( @@ -124,6 +121,16 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]: return None return elements + def interpret_text_block(self, text_block: lp.TextBlock) -> str: + """Interprets the text in a TextBlock.""" + # NOTE(robinson) - If the text attribute is None, that means the PDF isn't + # already OCR'd and we have to send the snippet out for OCRing. + if text_block.text is None: + out_text = self.ocr(text_block) + else: + out_text = text_block.text + return out_text + def ocr(self, text_block: lp.TextBlock) -> str: """Runs a cropped text block image through and OCR agent.""" logger.debug("Running OCR on text block ...") From 7aa6aa9aa065d803d8c5e51503cfb9cff1892aa0 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Sat, 7 Jan 2023 23:24:55 -0600 Subject: [PATCH 3/8] Test TextBlock interpretation when unknown symbols are in text --- .../inference/test_layout.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 05c68736..7135c6aa 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -176,3 +176,23 @@ def test_process_file_with_model(monkeypatch, mock_page_layout, model_name): def test_process_file_with_model_raises_on_invalid_model_name(): with pytest.raises(models.UnknownModelException): layout.process_file_with_model("", model_name="fake") + + +class MockPageLayout(layout.PageLayout): + def __init__(self, ocr_text): + self.ocr_text = ocr_text + + def ocr(self, text_block): + return self.ocr_text + + +class MockTextBlock(lp.TextBlock): + def __init__(self, text): + self.text = text + + +def test_interpret_text_block_use_ocr_when_text_symbols_cid(): + fake_text = "(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)" + fake_ocr = "ocrme" + fake_text_block = MockTextBlock(fake_text) + assert MockPageLayout(fake_ocr).interpret_text_block(fake_text_block) == fake_ocr From b34f32abfd396a042e7031c8e125417363d56c64 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Sat, 7 Jan 2023 23:31:37 -0600 Subject: [PATCH 4/8] Update version and changelog --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 78599002..ef590252 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.2-dev0 + +* Add logic to use OCR when layout text is full of unknown characters + ## 0.2.1 * Refactor to facilitate local inference diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 9aa97038..d8249901 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.1" # pragma: no cover +__version__ = "0.2.2-dev0" # pragma: no cover From d88582fecf71c012bb15590cc154d3d565c1c67a Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 9 Jan 2023 09:39:51 -0600 Subject: [PATCH 5/8] Add prechecks that are cheaper computationally --- unstructured_inference/inference/layout.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 1277c7b2..1ea01a02 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -168,7 +168,15 @@ def process_file_with_model(filename: str, model_name: str) -> DocumentLayout: def cid_ratio(text: str) -> float: """Gets ratio of unknown 'cid' characters extracted from text to all characters.""" + if not is_cid_present(text): + return 0.0 cid_pattern = r"\(cid\:(\d+)\)" unmatched, n_cid = re.subn(cid_pattern, "", text) total = n_cid + len(unmatched) return n_cid / total if total > 0 else 1.0 + + +def is_cid_present(text: str) -> bool: + if len(text) < len("(cid:x)"): + return False + return text.find("(cid:") != -1 From 6bd1f1bced7190d5a71f31039c299bfd76560c28 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 9 Jan 2023 09:40:09 -0600 Subject: [PATCH 6/8] test_cid_ratio stub --- test_unstructured_inference/inference/test_layout.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 7135c6aa..50137d9d 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -196,3 +196,7 @@ def test_interpret_text_block_use_ocr_when_text_symbols_cid(): fake_ocr = "ocrme" fake_text_block = MockTextBlock(fake_text) assert MockPageLayout(fake_ocr).interpret_text_block(fake_text_block) == fake_ocr + + +def test_cid_ratio(): + pass From cb1318e0d7c6a43a50f9b43fe1fe6db27df260ad Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 9 Jan 2023 11:55:41 -0600 Subject: [PATCH 7/8] No more need for div0 case --- unstructured_inference/inference/layout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 1ea01a02..675eb48e 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -173,7 +173,7 @@ def cid_ratio(text: str) -> float: cid_pattern = r"\(cid\:(\d+)\)" unmatched, n_cid = re.subn(cid_pattern, "", text) total = n_cid + len(unmatched) - return n_cid / total if total > 0 else 1.0 + return n_cid / total def is_cid_present(text: str) -> bool: From a130b609a14a612ac9f67ae7f911ff65e571852c Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Mon, 9 Jan 2023 11:56:32 -0600 Subject: [PATCH 8/8] Add tests for cid_ratio and is_cid_present functions --- .../inference/test_layout.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 50137d9d..fb7d390e 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -198,5 +198,17 @@ def test_interpret_text_block_use_ocr_when_text_symbols_cid(): assert MockPageLayout(fake_ocr).interpret_text_block(fake_text_block) == fake_ocr -def test_cid_ratio(): - pass +@pytest.mark.parametrize( + "text, expected", + [("base", 0.0), ("", 0.0), ("(cid:2)", 1.0), ("(cid:1)a", 0.5), ("c(cid:1)ab", 0.25)], +) +def test_cid_ratio(text, expected): + assert layout.cid_ratio(text) == expected + + +@pytest.mark.parametrize( + "text, expected", + [("base", False), ("(cid:2)", True), ("(cid:1234567890)", True), ("jkl;(cid:12)asdf", True)], +) +def test_is_cid_present(text, expected): + assert layout.is_cid_present(text) == expected