diff --git a/CHANGELOG.md b/CHANGELOG.md index ef590252..d19b2306 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.2.2-dev0 +## 0.2.2-dev1 +* Add capability to process image files * Add logic to use OCR when layout text is full of unknown characters ## 0.2.1 diff --git a/sample-docs/loremipsum.jpg b/sample-docs/loremipsum.jpg new file mode 100644 index 00000000..102a8bc1 Binary files /dev/null and b/sample-docs/loremipsum.jpg differ diff --git a/sample-docs/loremipsum.png b/sample-docs/loremipsum.png new file mode 100644 index 00000000..977349cc Binary files /dev/null and b/sample-docs/loremipsum.png differ diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index fb7d390e..aefbb1d0 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -178,24 +178,37 @@ def test_process_file_with_model_raises_on_invalid_model_name(): layout.process_file_with_model("", model_name="fake") -class MockPageLayout(layout.PageLayout): - def __init__(self, ocr_text): - self.ocr_text = ocr_text - - def ocr(self, text_block): - return self.ocr_text +class MockPoints: + def tolist(self): + return [1, 2, 3, 4] class MockTextBlock(lp.TextBlock): - def __init__(self, text): + def __init__(self, type=None, text=None, ocr_text=None): + self.type = type self.text = text + self.ocr_text = ocr_text + + @property + def points(self): + return MockPoints() + + +class MockPageLayout(layout.PageLayout): + def __init__(self, layout=None, model=None): + self.image = None + self.layout = layout + self.model = model + + def ocr(self, text_block: MockTextBlock): + return text_block.ocr_text def test_interpret_text_block_use_ocr_when_text_symbols_cid(): fake_text = "(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)" fake_ocr = "ocrme" - fake_text_block = MockTextBlock(fake_text) - assert MockPageLayout(fake_ocr).interpret_text_block(fake_text_block) == fake_ocr + fake_text_block = MockTextBlock(text=fake_text, ocr_text=fake_ocr) + assert MockPageLayout().interpret_text_block(fake_text_block) == fake_ocr @pytest.mark.parametrize( @@ -212,3 +225,45 @@ def test_cid_ratio(text, expected): ) def test_is_cid_present(text, expected): assert layout.is_cid_present(text) == expected + + +class MockLayout: + def __init__(self, *elements): + self.elements = elements + + def sort(self, key, inplace): + return self.elements + + def __iter__(self): + return iter(self.elements) + + def get_texts(self): + return [el.text for el in self.elements] + + +def test_pagelayout_without_layout(): + mock_layout = MockLayout( + MockTextBlock(text=None, ocr_text="textblock1"), + MockTextBlock(text=None, ocr_text="textblock2"), + ) + + model = MockLayoutModel(mock_layout) + pl = MockPageLayout(model=model, layout=None) + + assert [el.text for el in pl.get_elements(inplace=False)] == [ + el.ocr_text for el in model.detect(None) + ] + + +@pytest.mark.parametrize("filetype", ("png", "jpg")) +def test_from_image_file(monkeypatch, mock_page_layout, filetype): + def mock_get_elements(self, *args, **kwargs): + self.elements = [mock_page_layout] + + monkeypatch.setattr(layout.PageLayout, "get_elements", mock_get_elements) + elements = ( + layout.DocumentLayout.from_image_file(f"sample-docs/loremipsum.{filetype}") + .pages[0] + .elements + ) + assert elements[0] == mock_page_layout diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index d8249901..b3571ac5 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.2-dev0" # pragma: no cover +__version__ = "0.2.2-dev1" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 675eb48e..cf7394a6 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -72,6 +72,15 @@ def from_file(cls, filename: str, model: Optional[Detectron2LayoutModel] = None) pages.append(page) return cls.from_pages(pages) + @classmethod + def from_image_file(cls, filename: str, model: Optional[Detectron2LayoutModel] = None): + """Creates a DocumentLayout from an image file.""" + logger.info(f"Reading image file: {filename} ...") + image = Image.open(filename) + page = PageLayout(number=0, image=image, layout=None, model=model) + page.get_elements() + return cls.from_pages([page]) + class PageLayout: """Class for an individual PDF page.""" @@ -80,7 +89,7 @@ def __init__( self, number: int, image: Image, - layout: lp.Layout, + layout: Optional[lp.Layout], model: Optional[Detectron2LayoutModel] = None, ): self.image = image @@ -107,12 +116,14 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]: # sophisticated ordering logic for more complicated layouts. image_layout.sort(key=lambda element: element.coordinates[1], inplace=True) for item in image_layout: - text_blocks = self.layout.filter_by(item, center=True) text = str() - for text_block in text_blocks: - text_block.text = self.interpret_text_block(text_block) - text = " ".join([x for x in text_blocks.get_texts() if x]) - + if self.layout is None: + text = self.interpret_text_block(item) + else: + text_blocks = self.layout.filter_by(item, center=True) + for text_block in text_blocks: + text_block.text = self.interpret_text_block(text_block) + text = " ".join([x for x in text_blocks.get_texts() if x]) elements.append( LayoutElement(type=item.type, text=text, coordinates=item.points.tolist()) )