diff --git a/CHANGELOG.md b/CHANGELOG.md index d19b2306..a05c137f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.2-dev1 +## 0.2.2 * Add capability to process image files * Add logic to use OCR when layout text is full of unknown characters diff --git a/test_unstructured_inference/test_api.py b/test_unstructured_inference/test_api.py index 9680ca4f..fb33d5e7 100644 --- a/test_unstructured_inference/test_api.py +++ b/test_unstructured_inference/test_api.py @@ -9,50 +9,52 @@ import unstructured_inference.models.detectron2 as detectron2 -@pytest.fixture -def sample_pdf_content(): - return """ - this is the content of a sample pdf file. - Title: ... - Author: ... - """ - - class MockModel: def __init__(self, *args, **kwargs): self.args = args self.kwargs = kwargs -def test_layout_parsing_pdf_api(sample_pdf_content, tmpdir, monkeypatch): +@pytest.mark.parametrize("filetype, ext", [("pdf", "pdf"), ("image", "png")]) +def test_layout_parsing_api(monkeypatch, filetype, ext): monkeypatch.setattr(models, "load_model", lambda *args, **kwargs: MockModel(*args, **kwargs)) monkeypatch.setattr(models, "hf_hub_download", lambda *args, **kwargs: "fake-path") monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) monkeypatch.setattr( DocumentLayout, "from_file", lambda *args, **kwargs: DocumentLayout.from_pages([]) ) + monkeypatch.setattr( + DocumentLayout, "from_image_file", lambda *args, **kwargs: DocumentLayout.from_pages([]) + ) - filename = os.path.join(tmpdir.dirname, "sample.pdf") - with open(filename, "w") as f: - f.write(sample_pdf_content) + filename = os.path.join("sample-docs", f"loremipsum.{ext}") client = TestClient(app) - response = client.post("/layout/pdf", files={"file": (filename, open(filename, "rb"))}) + response = client.post(f"/layout/{filetype}", files={"file": (filename, open(filename, "rb"))}) assert response.status_code == 200 response = client.post( - "/layout/pdf", files={"file": (filename, open(filename, "rb"))}, data={"model": "checkbox"} + f"/layout/{filetype}", + files={"file": (filename, open(filename, "rb"))}, + data={"model": "checkbox"}, ) assert response.status_code == 200 response = client.post( - "/layout/pdf", + f"/layout/{filetype}", files={"file": (filename, open(filename, "rb"))}, data={"model": "fake_model"}, ) assert response.status_code == 422 +def test_bad_route_404(): + client = TestClient(app) + filename = os.path.join("sample-docs", "loremipsum.pdf") + response = client.post("/layout/badroute", files={"file": (filename, open(filename, "rb"))}) + assert response.status_code == 404 + + def test_healthcheck(monkeypatch): client = TestClient(app) response = client.get("/healthcheck") diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index b3571ac5..8848c493 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.2-dev1" # pragma: no cover +__version__ = "0.2.2" # pragma: no cover diff --git a/unstructured_inference/api.py b/unstructured_inference/api.py index fc237d84..628d2619 100644 --- a/unstructured_inference/api.py +++ b/unstructured_inference/api.py @@ -6,16 +6,21 @@ app = FastAPI() ALL_ELEMS = "_ALL" +VALID_FILETYPES = ["pdf", "image"] -@app.post("/layout/pdf") -async def layout_parsing_pdf( +@app.post("/layout/{filetype:path}") +async def layout_parsing( + filetype: str, file: UploadFile = File(), include_elems: List[str] = Form(default=ALL_ELEMS), model: str = Form(default=None), ): + if filetype not in VALID_FILETYPES: + raise HTTPException(status.HTTP_404_NOT_FOUND) + is_image = filetype == "image" try: - layout = process_data_with_model(file.file, model) + layout = process_data_with_model(file.file, model, is_image) except UnknownModelException as e: raise HTTPException(status.HTTP_422_UNPROCESSABLE_ENTITY, str(e)) pages_layout = [ diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index cf7394a6..9a8bff91 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -159,21 +159,29 @@ def _get_image_array(self) -> Union[np.ndarray, None]: return self.image_array -def process_data_with_model(data: BinaryIO, model_name: str) -> DocumentLayout: +def process_data_with_model( + data: BinaryIO, model_name: Optional[str], is_image: bool = False +) -> DocumentLayout: """Processes pdf file in the form of a file handler (supporting a read method) into a DocumentLayout by using a model identified by model_name.""" with tempfile.NamedTemporaryFile() as tmp_file: tmp_file.write(data.read()) - layout = process_file_with_model(tmp_file.name, model_name) + layout = process_file_with_model(tmp_file.name, model_name, is_image=is_image) return layout -def process_file_with_model(filename: str, model_name: str) -> DocumentLayout: +def process_file_with_model( + filename: str, model_name: Optional[str], is_image: bool = False +) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by model_name.""" model = None if model_name is None else get_model(model_name) - layout = DocumentLayout.from_file(filename, model=model) + layout = ( + DocumentLayout.from_image_file(filename, model=model) + if is_image + else DocumentLayout.from_file(filename, model=model) + ) return layout @@ -188,6 +196,7 @@ def cid_ratio(text: str) -> float: def is_cid_present(text: str) -> bool: + """Checks if a cid code is present in a text selection.""" if len(text) < len("(cid:x)"): return False return text.find("(cid:") != -1