From 4540ed4f493be7205f34afc5c22185298251d2f2 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Fri, 6 Jan 2023 16:16:17 -0600
Subject: [PATCH 1/8] ocr when cid ratio is too high

---
 unstructured_inference/inference/layout.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index f3e37c85..965c0752 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 from dataclasses import dataclass
+import re
 import tempfile
 from typing import List, Optional, Tuple, Union, BinaryIO
 
@@ -111,7 +112,7 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]:
             for text_block in text_blocks:
                 # NOTE(robinson) - If the text attribute is None, that means the PDF isn't
                 # already OCR'd and we have to send the snippet out for OCRing.
-                if text_block.text is None:
+                if (text_block.text is None) or cid_ratio(text_block.text) > 0.5:
                     text_block.text = self.ocr(text_block)
             text = " ".join([x for x in text_blocks.get_texts() if x])
 
@@ -156,3 +157,11 @@ def process_file_with_model(filename: str, model_name: str) -> DocumentLayout:
     model = None if model_name is None else get_model(model_name)
     layout = DocumentLayout.from_file(filename, model=model)
     return layout
+
+
+def cid_ratio(text: str) -> float:
+    """Gets ratio of unknown 'cid' characters extracted from text to all characters."""
+    cid_pattern = r"\(cid\:(\d+)\)"
+    unmatched, n_cid = re.subn(cid_pattern, "", text)
+    total = n_cid + len(unmatched)
+    return n_cid / total if total > 0 else 1.0

From 608b7fb402abbef204b357f8ee7e59e203c0ef97 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Sat, 7 Jan 2023 23:22:29 -0600
Subject: [PATCH 2/8] Separate out interpretation of text blocks

---
 unstructured_inference/inference/layout.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index f3e37c85..89dd8385 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -109,10 +109,7 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]:
             text_blocks = self.layout.filter_by(item, center=True)
             text = str()
             for text_block in text_blocks:
-                # NOTE(robinson) - If the text attribute is None, that means the PDF isn't
-                # already OCR'd and we have to send the snippet out for OCRing.
-                if text_block.text is None:
-                    text_block.text = self.ocr(text_block)
+                text_block.text = self.interpret_text_block(text_block)
             text = " ".join([x for x in text_blocks.get_texts() if x])
 
             elements.append(
@@ -124,6 +121,16 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]:
             return None
         return elements
 
+    def interpret_text_block(self, text_block: lp.TextBlock) -> str:
+        """Returns the text of a TextBlock, running OCR when the embedded text is unusable."""
+        # NOTE(robinson) - If the text attribute is None, that means the PDF isn't
+        # already OCR'd and we have to send the snippet out for OCRing.
+        # Text that is mostly "(cid:N)" codes (unknown characters, see cid_ratio) is
+        # just as unusable, so it goes to OCR as well.
+        if text_block.text is None or cid_ratio(text_block.text) > 0.5:
+            return self.ocr(text_block)
+        return text_block.text
+
     def ocr(self, text_block: lp.TextBlock) -> str:
         """Runs a cropped text block image through and OCR agent."""
         logger.debug("Running OCR on text block ...")

From 7aa6aa9aa065d803d8c5e51503cfb9cff1892aa0 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Sat, 7 Jan 2023 23:24:55 -0600
Subject: [PATCH 3/8] Test TextBlock interpretation when unknown symbols are in
 text

---
 .../inference/test_layout.py                  | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
index 05c68736..7135c6aa 100644
--- a/test_unstructured_inference/inference/test_layout.py
+++ b/test_unstructured_inference/inference/test_layout.py
@@ -176,3 +176,23 @@ def test_process_file_with_model(monkeypatch, mock_page_layout, model_name):
 def test_process_file_with_model_raises_on_invalid_model_name():
     with pytest.raises(models.UnknownModelException):
         layout.process_file_with_model("", model_name="fake")
+
+
+class MockPageLayout(layout.PageLayout):  # test double: skips parent init, ocr() is canned
+    def __init__(self, ocr_text):
+        self.ocr_text = ocr_text  # the text that ocr() will hand back
+
+    def ocr(self, text_block):
+        return self.ocr_text  # text_block is ignored; nothing is actually OCR'd
+
+
+class MockTextBlock(lp.TextBlock):  # test double that carries only a text attribute
+    def __init__(self, text):
+        self.text = text  # parent __init__ is not called, so only .text is set
+
+
+def test_interpret_text_block_use_ocr_when_text_symbols_cid():
+    fake_text = "(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)"  # nothing but cid codes: cid_ratio is 1.0
+    fake_ocr = "ocrme"
+    fake_text_block = MockTextBlock(fake_text)
+    assert MockPageLayout(fake_ocr).interpret_text_block(fake_text_block) == fake_ocr

From b34f32abfd396a042e7031c8e125417363d56c64 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Sat, 7 Jan 2023 23:31:37 -0600
Subject: [PATCH 4/8] Update version and changelog

---
 CHANGELOG.md                          | 4 ++++
 unstructured_inference/__version__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 78599002..ef590252 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.2.2-dev0
+
+* Add logic to use OCR when layout text is full of unknown characters
+
 ## 0.2.1
 
 * Refactor to facilitate local inference
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index 9aa97038..d8249901 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.1"  # pragma: no cover
+__version__ = "0.2.2-dev0"  # pragma: no cover

From d88582fecf71c012bb15590cc154d3d565c1c67a Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Mon, 9 Jan 2023 09:39:51 -0600
Subject: [PATCH 5/8] Add prechecks that are cheaper computationally

---
 unstructured_inference/inference/layout.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index 1277c7b2..1ea01a02 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -168,7 +168,15 @@ def process_file_with_model(filename: str, model_name: str) -> DocumentLayout:
 
 def cid_ratio(text: str) -> float:
     """Gets ratio of unknown 'cid' characters extracted from text to all characters."""
+    if not is_cid_present(text):
+        return 0.0
     cid_pattern = r"\(cid\:(\d+)\)"
     unmatched, n_cid = re.subn(cid_pattern, "", text)
     total = n_cid + len(unmatched)
     return n_cid / total if total > 0 else 1.0
+
+
+def is_cid_present(text: str) -> bool:
+    """Cheap, regex-free check for whether text contains the start of a "(cid:N)" code."""
+    # Anything shorter than the smallest possible code, "(cid:x)", cannot contain one.
+    return len(text) >= len("(cid:x)") and "(cid:" in text

From 6bd1f1bced7190d5a71f31039c299bfd76560c28 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Mon, 9 Jan 2023 09:40:09 -0600
Subject: [PATCH 6/8] test_cid_ratio stub

---
 test_unstructured_inference/inference/test_layout.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
index 7135c6aa..50137d9d 100644
--- a/test_unstructured_inference/inference/test_layout.py
+++ b/test_unstructured_inference/inference/test_layout.py
@@ -196,3 +196,7 @@ def test_interpret_text_block_use_ocr_when_text_symbols_cid():
     fake_ocr = "ocrme"
     fake_text_block = MockTextBlock(fake_text)
     assert MockPageLayout(fake_ocr).interpret_text_block(fake_text_block) == fake_ocr
+
+
+def test_cid_ratio():
+    pass

From cb1318e0d7c6a43a50f9b43fe1fe6db27df260ad Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Mon, 9 Jan 2023 11:55:41 -0600
Subject: [PATCH 7/8] No more need for div0 case

---
 unstructured_inference/inference/layout.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index 1ea01a02..675eb48e 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -173,7 +173,7 @@ def cid_ratio(text: str) -> float:
     cid_pattern = r"\(cid\:(\d+)\)"
     unmatched, n_cid = re.subn(cid_pattern, "", text)
     total = n_cid + len(unmatched)
-    return n_cid / total if total > 0 else 1.0
+    return n_cid / total
 
 
 def is_cid_present(text: str) -> bool:

From a130b609a14a612ac9f67ae7f911ff65e571852c Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Mon, 9 Jan 2023 11:56:32 -0600
Subject: [PATCH 8/8] Add tests for cid_ratio and is_cid_present functions

---
 .../inference/test_layout.py                     | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
index 50137d9d..fb7d390e 100644
--- a/test_unstructured_inference/inference/test_layout.py
+++ b/test_unstructured_inference/inference/test_layout.py
@@ -198,5 +198,17 @@ def test_interpret_text_block_use_ocr_when_text_symbols_cid():
     assert MockPageLayout(fake_ocr).interpret_text_block(fake_text_block) == fake_ocr
 
 
-def test_cid_ratio():
-    pass
+@pytest.mark.parametrize(
+    "text, expected",
+    [("base", 0.0), ("", 0.0), ("(cid:2)", 1.0), ("(cid:1)a", 0.5), ("c(cid:1)ab", 0.25)],
+)
+def test_cid_ratio(text, expected):
+    assert layout.cid_ratio(text) == expected  # every expected ratio is exact in binary floats
+
+
+@pytest.mark.parametrize(
+    "text, expected",
+    [("base", False), ("(cid:2)", True), ("(cid:1234567890)", True), ("jkl;(cid:12)asdf", True)],
+)
+def test_is_cid_present(text, expected):
+    assert layout.is_cid_present(text) == expected  # only the "(cid:" prefix is looked for