From bcdd7c7229ae041baaf5e46a3cd355a15a7b1f57 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 09:07:03 -0600 Subject: [PATCH 01/13] Update version for release --- CHANGELOG.md | 2 +- unstructured_inference/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d19b2306..a05c137f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.2-dev1 +## 0.2.2 * Add capability to process image files * Add logic to use OCR when layout text is full of unknown characters diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index b3571ac5..8848c493 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.2-dev1" # pragma: no cover +__version__ = "0.2.2" # pragma: no cover From 649bc24d903fc49ca24190a6c60b13885d4c577c Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 09:44:42 -0600 Subject: [PATCH 02/13] Add processing steps for images --- unstructured_inference/inference/layout.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index cf7394a6..2e1b6a25 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -159,21 +159,29 @@ def _get_image_array(self) -> Union[np.ndarray, None]: return self.image_array -def process_data_with_model(data: BinaryIO, model_name: str) -> DocumentLayout: +def process_data_with_model( + data: BinaryIO, model_name: str, is_image: bool = False +) -> DocumentLayout: """Processes pdf file in the form of a file handler (supporting a read method) into a DocumentLayout by using a model identified by model_name.""" with tempfile.NamedTemporaryFile() as tmp_file: tmp_file.write(data.read()) - layout = process_file_with_model(tmp_file.name, model_name) + layout = process_file_with_model(tmp_file.name, model_name, is_image=is_image) return layout -def process_file_with_model(filename: str, model_name: str) -> DocumentLayout: +def process_file_with_model( + filename: str, model_name: str, is_image: bool = False +) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by model_name.""" model = None if model_name is None else get_model(model_name) - layout = DocumentLayout.from_file(filename, model=model) + layout = ( + DocumentLayout.from_image_file(filename, model=model) + if is_image + else DocumentLayout.from_file(filename, model=model) + ) return layout From 0004eb9a99dce1ea35a97654722f2a45e4897b88 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 09:44:58 -0600 Subject: [PATCH 03/13] Add api route for images --- unstructured_inference/api.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/unstructured_inference/api.py b/unstructured_inference/api.py index fc237d84..450895a7 100644 --- a/unstructured_inference/api.py +++ b/unstructured_inference/api.py @@ -1,7 +1,7 @@ from fastapi import FastAPI, File, status, Request, UploadFile, Form, HTTPException -from unstructured_inference.inference.layout import process_data_with_model +from unstructured_inference.inference.layout import process_data_with_model, DocumentLayout from unstructured_inference.models import UnknownModelException -from typing import List +from typing import List, BinaryIO, Optional, Union app = FastAPI() @@ -13,9 +13,27 @@ async def layout_parsing_pdf( file: UploadFile = File(), include_elems: List[str] = Form(default=ALL_ELEMS), model: str = Form(default=None), +): + return get_pages_layout(file.file, model, include_elems) + + +@app.post("/layout/image") +async def layout_parsing_image( + file: UploadFile = File(), + include_elems: List[str] = Form(default=ALL_ELEMS), + model: str = Form(default=None), +): + return get_pages_layout(file.file, model, include_elems, is_image=True) + + +def get_pages_layout( + file: BinaryIO, + model: Optional[str], + include_elems: Union[List[str], str] = ALL_ELEMS, + is_image=False, ): try: - layout = process_data_with_model(file.file, model) + layout = process_data_with_model(file, model, is_image) except UnknownModelException as e: raise HTTPException(status.HTTP_422_UNPROCESSABLE_ENTITY, str(e)) pages_layout = [ From 89a027f1ff98224f402665c743673a5ce9f7a01f Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 10:04:05 -0600 Subject: [PATCH 04/13] Add api test for images --- test_unstructured_inference/test_api.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/test_unstructured_inference/test_api.py b/test_unstructured_inference/test_api.py index 9680ca4f..c245693e 100644 --- a/test_unstructured_inference/test_api.py +++ b/test_unstructured_inference/test_api.py @@ -24,29 +24,33 @@ def __init__(self, *args, **kwargs): self.kwargs = kwargs -def test_layout_parsing_pdf_api(sample_pdf_content, tmpdir, monkeypatch): +@pytest.mark.parametrize("filetype, ext", [("pdf", "pdf"), ("image", "png")]) +def test_layout_parsing_api(monkeypatch, filetype, ext): monkeypatch.setattr(models, "load_model", lambda *args, **kwargs: MockModel(*args, **kwargs)) monkeypatch.setattr(models, "hf_hub_download", lambda *args, **kwargs: "fake-path") monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True) monkeypatch.setattr( DocumentLayout, "from_file", lambda *args, **kwargs: DocumentLayout.from_pages([]) ) + monkeypatch.setattr( + DocumentLayout, "from_image_file", lambda *args, **kwargs: DocumentLayout.from_pages([]) + ) - filename = os.path.join(tmpdir.dirname, "sample.pdf") - with open(filename, "w") as f: - f.write(sample_pdf_content) + filename = os.path.join("sample-docs", f"loremipsum.{ext}") client = TestClient(app) - response = client.post("/layout/pdf", files={"file": (filename, open(filename, "rb"))}) + response = client.post(f"/layout/{filetype}", files={"file": (filename, open(filename, "rb"))}) assert response.status_code == 200 response = client.post( - "/layout/pdf", files={"file": (filename, open(filename, "rb"))}, data={"model": "checkbox"} + f"/layout/{filetype}", + files={"file": (filename, open(filename, "rb"))}, + data={"model": "checkbox"}, ) assert response.status_code == 200 response = client.post( - "/layout/pdf", + f"/layout/{filetype}", files={"file": (filename, open(filename, "rb"))}, data={"model": "fake_model"}, ) From bf2fb4127970f0202ca188531c70f6b50565ac95 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 10:12:01 -0600 Subject: [PATCH 05/13] Remove unused fixture --- test_unstructured_inference/test_api.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test_unstructured_inference/test_api.py b/test_unstructured_inference/test_api.py index c245693e..01820d87 100644 --- a/test_unstructured_inference/test_api.py +++ b/test_unstructured_inference/test_api.py @@ -9,15 +9,6 @@ import unstructured_inference.models.detectron2 as detectron2 -@pytest.fixture -def sample_pdf_content(): - return """ - this is the content of a sample pdf file. - Title: ... - Author: ... - """ - - class MockModel: def __init__(self, *args, **kwargs): self.args = args From 31a19b76f2dfaca668556b2edbc1841613b5f306 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 10:12:26 -0600 Subject: [PATCH 06/13] Remove unused import --- unstructured_inference/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/api.py b/unstructured_inference/api.py index 450895a7..1581a28f 100644 --- a/unstructured_inference/api.py +++ b/unstructured_inference/api.py @@ -1,5 +1,5 @@ from fastapi import FastAPI, File, status, Request, UploadFile, Form, HTTPException -from unstructured_inference.inference.layout import process_data_with_model, DocumentLayout +from unstructured_inference.inference.layout import process_data_with_model from unstructured_inference.models import UnknownModelException from typing import List, BinaryIO, Optional, Union From c9e5502be8afab8da6f58772d32dc9cce8999d5f Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 10:12:41 -0600 Subject: [PATCH 07/13] Fix type --- unstructured_inference/inference/layout.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 2e1b6a25..c5b425f0 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -160,7 +160,7 @@ def _get_image_array(self) -> Union[np.ndarray, None]: def process_data_with_model( - data: BinaryIO, model_name: str, is_image: bool = False + data: BinaryIO, model_name: Optional[str], is_image: bool = False ) -> DocumentLayout: """Processes pdf file in the form of a file handler (supporting a read method) into a DocumentLayout by using a model identified by model_name.""" @@ -172,7 +172,7 @@ def process_data_with_model( def process_file_with_model( - filename: str, model_name: str, is_image: bool = False + filename: str, model_name: Optional[str], is_image: bool = False ) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by model_name.""" From b093ddeea1375be7ec2b72bffabe6a19c7e98758 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 10:44:48 -0600 Subject: [PATCH 08/13] Combine routing --- unstructured_inference/api.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/unstructured_inference/api.py b/unstructured_inference/api.py index 1581a28f..cf65f07d 100644 --- a/unstructured_inference/api.py +++ b/unstructured_inference/api.py @@ -6,24 +6,20 @@ app = FastAPI() ALL_ELEMS = "_ALL" +VALID_FILETYPES = ["pdf", "image"] -@app.post("/layout/pdf") -async def layout_parsing_pdf( +@app.post("/layout/{filetype:path}") +async def layout_parsing( + filetype: str, file: UploadFile = File(), include_elems: List[str] = Form(default=ALL_ELEMS), model: str = Form(default=None), ): - return get_pages_layout(file.file, model, include_elems) - - -@app.post("/layout/image") -async def layout_parsing_image( - file: UploadFile = File(), - include_elems: List[str] = Form(default=ALL_ELEMS), - model: str = Form(default=None), -): - return get_pages_layout(file.file, model, include_elems, is_image=True) + if filetype not in VALID_FILETYPES: + raise HTTPException(status.HTTP_404_NOT_FOUND) + is_image = filetype == "image" + return get_pages_layout(file.file, model, include_elems, is_image=is_image) def get_pages_layout( From 5b93c537dab488a5b701016a8220f1b0b740d562 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 10:45:04 -0600 Subject: [PATCH 09/13] Check response code for bad route --- test_unstructured_inference/test_api.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_unstructured_inference/test_api.py b/test_unstructured_inference/test_api.py index 01820d87..7762be21 100644 --- a/test_unstructured_inference/test_api.py +++ b/test_unstructured_inference/test_api.py @@ -2,6 +2,7 @@ import os from fastapi.testclient import TestClient +from fastapi import HTTPException from unstructured_inference.api import app from unstructured_inference import models @@ -48,6 +49,13 @@ def test_layout_parsing_api(monkeypatch, filetype, ext): assert response.status_code == 422 +def test_bad_route_404(): + client = TestClient(app) + filename = os.path.join("sample-docs", f"loremipsum.pdf") + response = client.post("/layout/badroute", files={"file": (filename, open(filename, "rb"))}) + assert response.status_code == 404 + + def test_healthcheck(monkeypatch): client = TestClient(app) response = client.get("/healthcheck") From 3f15e65e48e52d1327b99c8ef4839615b8ed5187 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 11:00:54 -0600 Subject: [PATCH 10/13] With dynamic routing no need for factor --- unstructured_inference/api.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/unstructured_inference/api.py b/unstructured_inference/api.py index cf65f07d..b9b3f5d5 100644 --- a/unstructured_inference/api.py +++ b/unstructured_inference/api.py @@ -19,17 +19,8 @@ async def layout_parsing( if filetype not in VALID_FILETYPES: raise HTTPException(status.HTTP_404_NOT_FOUND) is_image = filetype == "image" - return get_pages_layout(file.file, model, include_elems, is_image=is_image) - - -def get_pages_layout( - file: BinaryIO, - model: Optional[str], - include_elems: Union[List[str], str] = ALL_ELEMS, - is_image=False, -): try: - layout = process_data_with_model(file, model, is_image) + layout = process_data_with_model(file.file, model, is_image) except UnknownModelException as e: raise HTTPException(status.HTTP_422_UNPROCESSABLE_ENTITY, str(e)) pages_layout = [ From f7be30b5d6aafc957b65fb4b8a2876e7f9008e1b Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 12:48:34 -0600 Subject: [PATCH 11/13] Add missing docstring --- unstructured_inference/inference/layout.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index c5b425f0..9a8bff91 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -196,6 +196,7 @@ def cid_ratio(text: str) -> float: def is_cid_present(text: str) -> bool: + """Checks if a cid code is present in a text selection.""" if len(text) < len("(cid:x)"): return False return text.find("(cid:") != -1 From 646be823faebef37216a2cb41780801ad33b6d13 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 12:55:45 -0600 Subject: [PATCH 12/13] Remove unused types --- unstructured_inference/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/api.py b/unstructured_inference/api.py index b9b3f5d5..628d2619 100644 --- a/unstructured_inference/api.py +++ b/unstructured_inference/api.py @@ -1,7 +1,7 @@ from fastapi import FastAPI, File, status, Request, UploadFile, Form, HTTPException from unstructured_inference.inference.layout import process_data_with_model from unstructured_inference.models import UnknownModelException -from typing import List, BinaryIO, Optional, Union +from typing import List app = FastAPI() From a7983a629a8ab394f497c3a3a916c2a12520ad80 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 12 Jan 2023 12:56:44 -0600 Subject: [PATCH 13/13] fix linting --- test_unstructured_inference/test_api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_unstructured_inference/test_api.py b/test_unstructured_inference/test_api.py index 7762be21..fb33d5e7 100644 --- a/test_unstructured_inference/test_api.py +++ b/test_unstructured_inference/test_api.py @@ -2,7 +2,6 @@ import os from fastapi.testclient import TestClient -from fastapi import HTTPException from unstructured_inference.api import app from unstructured_inference import models @@ -51,7 +50,7 @@ def test_layout_parsing_api(monkeypatch, filetype, ext): def test_bad_route_404(): client = TestClient(app) - filename = os.path.join("sample-docs", f"loremipsum.pdf") + filename = os.path.join("sample-docs", "loremipsum.pdf") response = client.post("/layout/badroute", files={"file": (filename, open(filename, "rb"))}) assert response.status_code == 404