Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## 0.2.2-dev0
## 0.2.2-dev1

* Add capability to process image files
* Add logic to use OCR when layout text is full of unknown characters

## 0.2.1
Expand Down
Binary file added sample-docs/loremipsum.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added sample-docs/loremipsum.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
73 changes: 64 additions & 9 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,24 +178,37 @@ def test_process_file_with_model_raises_on_invalid_model_name():
layout.process_file_with_model("", model_name="fake")


class MockPageLayout(layout.PageLayout):
    """PageLayout stand-in whose ``ocr`` always returns one canned string."""

    def __init__(self, ocr_text):
        # Does not call PageLayout.__init__; only the canned OCR result is
        # stored on the instance.
        self.ocr_text = ocr_text

    def ocr(self, text_block):
        """Return the canned OCR text; ``text_block`` is ignored."""
        return self.ocr_text
class MockPoints:
    """Stand-in for a coordinate container that exposes only ``tolist``."""

    def tolist(self):
        """Return a fixed list of four numbers."""
        return [1, 2, 3, 4]


class MockTextBlock(lp.TextBlock):
    """TextBlock stand-in carrying a type, a text value and a canned OCR text."""

    def __init__(self, type=None, text=None, ocr_text=None):
        # Does not call lp.TextBlock.__init__; only these three attributes
        # are set. The parameter name `type` shadows the builtin inside this
        # method but is kept because callers pass it by keyword.
        self.type = type
        self.text = text
        self.ocr_text = ocr_text

    @property
    def points(self):
        """Return a MockPoints instance so ``points.tolist()`` can be called."""
        return MockPoints()


class MockPageLayout(layout.PageLayout):
    """PageLayout stand-in with no image; OCR output is read from the block."""

    def __init__(self, layout=None, model=None):
        # Does not call PageLayout.__init__. Inside this method the `layout`
        # parameter shadows the imported `layout` module.
        self.image = None
        self.layout = layout
        self.model = model

    def ocr(self, text_block: MockTextBlock):
        """Return the ``ocr_text`` attribute stored on ``text_block``."""
        return text_block.ocr_text


def test_interpret_text_block_use_ocr_when_text_symbols_cid():
    """interpret_text_block returns the OCR text when the text is only (cid:N) codes."""
    fake_text = "(cid:1)(cid:2)(cid:3)(cid:4)(cid:5)"
    fake_ocr = "ocrme"
    # Keyword arguments are required here: the first positional parameter of
    # MockTextBlock is `type`, not `text`.
    fake_text_block = MockTextBlock(text=fake_text, ocr_text=fake_ocr)
    assert MockPageLayout().interpret_text_block(fake_text_block) == fake_ocr


@pytest.mark.parametrize(
Expand All @@ -212,3 +225,45 @@ def test_cid_ratio(text, expected):
)
def test_is_cid_present(text, expected):
assert layout.is_cid_present(text) == expected


class MockLayout:
    """Minimal layout stand-in: iterable elements plus ``sort`` and ``get_texts``."""

    def __init__(self, *elements):
        # Stored as the tuple of positional arguments, in call order.
        self.elements = elements

    def sort(self, key, inplace):
        """Return the elements unchanged; ``key`` and ``inplace`` are ignored."""
        return self.elements

    def __iter__(self):
        """Iterate over the stored elements in order."""
        return iter(self.elements)

    def get_texts(self):
        """Return the ``text`` attribute of every element, in order."""
        return [el.text for el in self.elements]


def test_pagelayout_without_layout():
    """With ``layout=None``, element texts equal the detected blocks' OCR texts."""
    mock_layout = MockLayout(
        MockTextBlock(text=None, ocr_text="textblock1"),
        MockTextBlock(text=None, ocr_text="textblock2"),
    )

    # MockLayoutModel is defined outside this excerpt; presumably its
    # detect() hands back `mock_layout` -- confirm against its definition.
    model = MockLayoutModel(mock_layout)
    pl = MockPageLayout(model=model, layout=None)

    assert [el.text for el in pl.get_elements(inplace=False)] == [
        el.ocr_text for el in model.detect(None)
    ]


@pytest.mark.parametrize("filetype", ("png", "jpg"))
def test_from_image_file(monkeypatch, mock_page_layout, filetype):
    """from_image_file exposes the elements set by get_elements on its first page."""

    def mock_get_elements(self, *args, **kwargs):
        # Replaces PageLayout.get_elements so the test only checks that
        # from_image_file calls it and keeps the page it was called on.
        self.elements = [mock_page_layout]

    monkeypatch.setattr(layout.PageLayout, "get_elements", mock_get_elements)
    elements = (
        layout.DocumentLayout.from_image_file(f"sample-docs/loremipsum.{filetype}")
        .pages[0]
        .elements
    )
    assert elements[0] == mock_page_layout
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.2-dev0" # pragma: no cover
__version__ = "0.2.2-dev1" # pragma: no cover
23 changes: 17 additions & 6 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,15 @@ def from_file(cls, filename: str, model: Optional[Detectron2LayoutModel] = None)
pages.append(page)
return cls.from_pages(pages)

@classmethod
def from_image_file(cls, filename: str, model: Optional[Detectron2LayoutModel] = None):
    """Creates a DocumentLayout from an image file.

    The file is opened with ``Image.open`` and wrapped in a single
    ``PageLayout`` (page number 0, no pre-extracted layout). ``get_elements``
    is called on that page before the document is built with ``from_pages``.

    Args:
        filename: Path of the image file to read.
        model: Optional layout model passed through to ``PageLayout``.

    Returns:
        The result of ``cls.from_pages`` for the one-page list.
    """
    logger.info(f"Reading image file: {filename} ...")
    image = Image.open(filename)
    # layout=None: no text layout is supplied for an image page.
    page = PageLayout(number=0, image=image, layout=None, model=model)
    page.get_elements()
    return cls.from_pages([page])


class PageLayout:
"""Class for an individual PDF page."""
Expand All @@ -80,7 +89,7 @@ def __init__(
self,
number: int,
image: Image,
layout: lp.Layout,
layout: Optional[lp.Layout],
model: Optional[Detectron2LayoutModel] = None,
):
self.image = image
Expand All @@ -107,12 +116,14 @@ def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]:
# sophisticated ordering logic for more complicated layouts.
image_layout.sort(key=lambda element: element.coordinates[1], inplace=True)
for item in image_layout:
text_blocks = self.layout.filter_by(item, center=True)
text = str()
for text_block in text_blocks:
text_block.text = self.interpret_text_block(text_block)
text = " ".join([x for x in text_blocks.get_texts() if x])

if self.layout is None:
text = self.interpret_text_block(item)
else:
text_blocks = self.layout.filter_by(item, center=True)
for text_block in text_blocks:
text_block.text = self.interpret_text_block(text_block)
text = " ".join([x for x in text_blocks.get_texts() if x])
elements.append(
LayoutElement(type=item.type, text=text, coordinates=item.points.tolist())
)
Expand Down