Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.2.2-dev1
## 0.2.2

* Add capability to process image files
* Add logic to use OCR when layout text is full of unknown characters
Expand Down
34 changes: 18 additions & 16 deletions test_unstructured_inference/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,50 +9,52 @@
import unstructured_inference.models.detectron2 as detectron2


@pytest.fixture
def sample_pdf_content():
    """Return placeholder text that a test can write to disk as a fake pdf file."""
    # NOTE(review): the literal is reproduced exactly as it appears in this view; any
    # leading whitespace the original lines carried inside the string is not visible here.
    return """
this is the content of a sample pdf file.
Title: ...
Author: ...
"""


class MockModel:
    """Stand-in for a real model that only records its constructor arguments.

    Tests patch the model loader to build this class so that no model weights are
    loaded; the positional and keyword arguments are kept for later inspection.
    """

    def __init__(self, *args, **kwargs):
        self.args = args  # positional arguments, stored as a tuple
        self.kwargs = kwargs  # keyword arguments, stored as a dict


@pytest.mark.parametrize("filetype, ext", [("pdf", "pdf"), ("image", "png")])
def test_layout_parsing_api(monkeypatch, filetype, ext):
    """Check the status codes returned by POST /layout/{filetype} for pdf and image uploads.

    Model loading and layout extraction are monkeypatched, so only the routing and the
    model-name handling of the API are exercised:

    * no model given      -> 200
    * model "checkbox"    -> 200
    * model "fake_model"  -> 422
    """
    monkeypatch.setattr(models, "load_model", lambda *args, **kwargs: MockModel(*args, **kwargs))
    monkeypatch.setattr(models, "hf_hub_download", lambda *args, **kwargs: "fake-path")
    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
    # Both the pdf and the image entry points are stubbed to return an empty layout.
    monkeypatch.setattr(
        DocumentLayout, "from_file", lambda *args, **kwargs: DocumentLayout.from_pages([])
    )
    monkeypatch.setattr(
        DocumentLayout, "from_image_file", lambda *args, **kwargs: DocumentLayout.from_pages([])
    )

    filename = os.path.join("sample-docs", f"loremipsum.{ext}")
    route = f"/layout/{filetype}"

    client = TestClient(app)

    # Each upload opens the sample file in a context manager so the handle is closed
    # as soon as the request has been sent.
    with open(filename, "rb") as f:
        response = client.post(route, files={"file": (filename, f)})
    assert response.status_code == 200

    with open(filename, "rb") as f:
        response = client.post(
            route,
            files={"file": (filename, f)},
            data={"model": "checkbox"},
        )
    assert response.status_code == 200

    with open(filename, "rb") as f:
        response = client.post(
            route,
            files={"file": (filename, f)},
            data={"model": "fake_model"},
        )
    assert response.status_code == 422


def test_bad_route_404():
    """Check that posting a file to an unsupported /layout/<filetype> route returns 404."""
    client = TestClient(app)
    filename = os.path.join("sample-docs", "loremipsum.pdf")
    # Context manager closes the sample file once the request has been sent.
    with open(filename, "rb") as f:
        response = client.post("/layout/badroute", files={"file": (filename, f)})
    assert response.status_code == 404


def test_healthcheck(monkeypatch):
client = TestClient(app)
response = client.get("/healthcheck")
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# Package version string; released value (supersedes the earlier 0.2.2-dev1 pre-release).
__version__ = "0.2.2"  # pragma: no cover
11 changes: 8 additions & 3 deletions unstructured_inference/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,21 @@
# FastAPI application object; the route handlers in this module register on it.
app = FastAPI()

# Default value of the `include_elems` form field.
# NOTE(review): presumably a sentinel meaning "do not filter elements" - confirm in the handler.
ALL_ELEMS = "_ALL"
# Values accepted for the `filetype` path parameter; any other value is answered with a 404.
VALID_FILETYPES = ["pdf", "image"]


@app.post("/layout/pdf")
async def layout_parsing_pdf(
@app.post("/layout/{filetype:path}")
async def layout_parsing(
filetype: str,
file: UploadFile = File(),
include_elems: List[str] = Form(default=ALL_ELEMS),
model: str = Form(default=None),
):
if filetype not in VALID_FILETYPES:
raise HTTPException(status.HTTP_404_NOT_FOUND)
is_image = filetype == "image"
try:
layout = process_data_with_model(file.file, model)
layout = process_data_with_model(file.file, model, is_image)
except UnknownModelException as e:
raise HTTPException(status.HTTP_422_UNPROCESSABLE_ENTITY, str(e))
pages_layout = [
Expand Down
17 changes: 13 additions & 4 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,21 +159,29 @@ def _get_image_array(self) -> Union[np.ndarray, None]:
return self.image_array


def process_data_with_model(
    data: BinaryIO, model_name: Optional[str], is_image: bool = False
) -> DocumentLayout:
    """Processes a pdf or image file in the form of a file handler (supporting a read method)
    into a DocumentLayout by using a model identified by model_name.

    The contents of data are copied into a named temporary file, which is then handed to
    process_file_with_model by path. model_name (which may be None) and is_image are
    forwarded unchanged; is_image=True selects the image code path instead of the pdf one.
    """
    with tempfile.NamedTemporaryFile() as tmp_file:
        tmp_file.write(data.read())
        # The file is reopened by name below, so push the buffered bytes to disk first;
        # otherwise the reader may see an empty or truncated file.
        tmp_file.flush()
        layout = process_file_with_model(tmp_file.name, model_name, is_image=is_image)

    return layout


def process_file_with_model(
    filename: str, model_name: Optional[str], is_image: bool = False
) -> DocumentLayout:
    """Processes the pdf or image file with name filename into a DocumentLayout by using a
    model identified by model_name.

    When model_name is None no model is looked up and model=None is passed through to
    DocumentLayout. When is_image is True the file is read with
    DocumentLayout.from_image_file, otherwise with DocumentLayout.from_file.
    """
    # NOTE(review): get_model presumably raises for an unknown model name - confirm.
    model = None if model_name is None else get_model(model_name)
    if is_image:
        return DocumentLayout.from_image_file(filename, model=model)
    return DocumentLayout.from_file(filename, model=model)


Expand All @@ -188,6 +196,7 @@ def cid_ratio(text: str) -> float:


def is_cid_present(text: str) -> bool:
    """Checks if a cid code is present in a text selection.

    A text shorter than the smallest complete code, "(cid:x)", is reported as not
    containing one even if it holds the "(cid:" prefix.
    """
    shortest_code = "(cid:x)"
    return len(text) >= len(shortest_code) and "(cid:" in text