diff --git a/CHANGELOG.md b/CHANGELOG.md index 784e2ca1..1b308e7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.0.42-dev1 +## 0.0.42 * Update readme for parameter `hi_res_model_name` * Fix a bug using `hi_res_model_name` in parallel mode +* Bump unstructured library to 0.10.12 +* Bump unstructured-inference to 0.5.22 ## 0.0.41 diff --git a/requirements/base.txt b/requirements/base.txt index 5e1c3acf..165d312a 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -8,13 +8,14 @@ antlr4-python3-runtime==4.9.3 # via omegaconf anyio==3.7.1 # via + # fastapi # starlette # watchfiles attrs==23.1.0 # via # jsonschema # referencing -autoflake==2.2.0 +autoflake==2.2.1 # via unstructured-api-tools beautifulsoup4==4.12.2 # via @@ -46,6 +47,8 @@ cryptography==41.0.3 # via pdfminer-six cycler==0.11.0 # via matplotlib +dataclasses-json==0.6.0 + # via unstructured defusedxml==0.7.1 # via nbconvert ebooklib==0.18 @@ -58,7 +61,7 @@ et-xmlfile==1.1.0 # via openpyxl exceptiongroup==1.1.3 # via anyio -fastapi==0.103.0 +fastapi==0.103.1 # via unstructured-api-tools fastjsonschema==2.18.0 # via nbformat @@ -73,7 +76,7 @@ flatbuffers==23.5.26 # via onnxruntime fonttools==4.42.1 # via matplotlib -fsspec==2023.6.0 +fsspec==2023.9.0 # via huggingface-hub h11==0.14.0 # via uvicorn @@ -129,6 +132,8 @@ markupsafe==2.1.3 # via # jinja2 # nbconvert +marshmallow==3.20.1 + # via dataclasses-json matplotlib==3.7.2 # via pycocotools mistune==3.0.1 @@ -140,7 +145,9 @@ msg-parser==1.2.0 mypy==1.5.1 # via unstructured-api-tools mypy-extensions==1.0.0 - # via mypy + # via + # mypy + # typing-inspect nbclient==0.8.0 # via nbconvert nbconvert==7.8.0 @@ -158,6 +165,7 @@ numpy==1.25.2 # contourpy # layoutparser # matplotlib + # onnx # onnxruntime # opencv-python # pandas @@ -169,6 +177,8 @@ olefile==0.46 # via msg-parser omegaconf==2.3.0 # via effdet +onnx==1.14.1 + # via unstructured-inference onnxruntime==1.15.1 # via unstructured-inference opencv-python==4.8.0.76 @@ -180,12 +190,13 @@ openpyxl==3.1.2 packaging==23.1 # via # huggingface-hub + # marshmallow # matplotlib # nbconvert # onnxruntime # pytesseract # transformers -pandas==2.0.3 +pandas==2.1.0 # via # layoutparser # unstructured @@ -202,7 +213,7 @@ pdfminer-six==20221105 # unstructured pdfplumber==0.10.2 # via layoutparser -pillow==9.5.0 +pillow==10.0.0 # via # layoutparser # matplotlib @@ -211,13 +222,14 @@ pillow==9.5.0 # pytesseract # python-pptx # torchvision - # unstructured platformdirs==3.10.0 # via jupyter-core portalocker==2.7.0 # via iopath protobuf==4.24.2 - # via onnxruntime + # via + # onnx + # onnxruntime psutil==5.9.5 # via -r requirements/base.in pycocotools==2.0.7 @@ -238,7 +250,7 @@ pypandoc==1.11 # via unstructured pyparsing==3.0.9 # via matplotlib -pypdf==3.15.4 +pypdf==3.15.5 # via -r requirements/base.in pypdfium2==4.19.0 # via pdfplumber @@ -259,9 +271,9 @@ python-multipart==0.0.6 # via # unstructured-api-tools # unstructured-inference -python-pptx==0.6.22 +python-pptx==0.6.21 # via unstructured -pytz==2023.3 +pytz==2023.3.post1 # via pandas pyyaml==6.0.1 # via @@ -290,7 +302,7 @@ requests==2.31.0 # torchvision # transformers # unstructured -rpds-py==0.10.0 +rpds-py==0.10.2 # via # jsonschema # referencing @@ -308,7 +320,7 @@ six==1.16.0 # python-dateutil sniffio==1.3.0 # via anyio -soupsieve==2.4.1 +soupsieve==2.5 # via beautifulsoup4 starlette==0.27.0 # via fastapi @@ -318,7 +330,7 @@ sympy==1.12 # torch tabulate==0.9.0 # via unstructured -timm==0.9.5 +timm==0.9.7 # via effdet tinycss2==1.2.1 # via nbconvert @@ -354,7 +366,7 @@ traitlets==5.9.0 # nbclient # nbconvert # nbformat -transformers==4.32.1 +transformers==4.33.0 # via unstructured-inference types-requests==2.31.0.2 # via unstructured-api-tools @@ -369,17 +381,21 @@ typing-extensions==4.7.1 # huggingface-hub # iopath # mypy + # onnx # pydantic # pydantic-core # torch + # typing-inspect # uvicorn +typing-inspect==0.9.0 + # via dataclasses-json tzdata==2023.3 # via pandas -unstructured[local-inference]==0.10.8 +unstructured[local-inference]==0.10.12 # via -r requirements/base.in unstructured-api-tools==0.10.11 # via -r requirements/base.in -unstructured-inference==0.5.17 +unstructured-inference==0.5.22 # via unstructured urllib3==2.0.4 # via requests diff --git a/requirements/test.txt b/requirements/test.txt index 92423e2c..7dc2614a 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -11,6 +11,7 @@ antlr4-python3-runtime==4.9.3 anyio==3.7.1 # via # -r requirements/base.txt + # fastapi # httpcore # jupyter-server # starlette @@ -25,7 +26,7 @@ argon2-cffi-bindings==21.2.0 # via argon2-cffi arrow==1.2.3 # via isoduration -asttokens==2.2.1 +asttokens==2.4.0 # via # nbdev # stack-data @@ -38,7 +39,7 @@ attrs==23.1.0 # -r requirements/base.txt # jsonschema # referencing -autoflake==2.2.0 +autoflake==2.2.1 # via # -r requirements/base.txt # unstructured-api-tools @@ -107,6 +108,10 @@ cycler==0.11.0 # via # -r requirements/base.txt # matplotlib +dataclasses-json==0.6.0 + # via + # -r requirements/base.txt + # unstructured debugpy==1.6.7.post1 # via ipykernel decorator==5.1.1 @@ -135,12 +140,13 @@ exceptiongroup==1.1.3 # via # -r requirements/base.txt # anyio + # ipython # pytest execnb==0.1.5 # via nbdev executing==1.2.0 # via stack-data -fastapi==0.103.0 +fastapi==0.103.1 # via # -r requirements/base.txt # unstructured-api-tools @@ -175,7 +181,7 @@ fonttools==4.42.1 # matplotlib fqdn==1.5.1 # via jsonschema -fsspec==2023.6.0 +fsspec==2023.9.0 # via # -r requirements/base.txt # huggingface-hub @@ -217,13 +223,13 @@ iopath==0.1.10 # via # -r requirements/base.txt # layoutparser -ipykernel==6.25.1 +ipykernel==6.25.2 # via # jupyter # jupyter-console # jupyterlab # qtconsole -ipython==8.14.0 +ipython==8.15.0 # via # execnb # ipykernel @@ -292,7 +298,7 @@ jupyter-events==0.7.0 # via jupyter-server jupyter-lsp==2.2.0 # via jupyterlab -jupyter-server==2.7.2 +jupyter-server==2.7.3 # via # jupyter-lsp # jupyterlab @@ -337,6 +343,10 @@ markupsafe==2.1.3 # -r requirements/base.txt # jinja2 # nbconvert +marshmallow==3.20.1 + # via + # -r requirements/base.txt + # dataclasses-json matplotlib==3.7.2 # via # -r requirements/base.txt @@ -369,6 +379,7 @@ mypy-extensions==1.0.0 # -r requirements/base.txt # black # mypy + # typing-inspect nbclient==0.8.0 # via # -r requirements/base.txt @@ -397,7 +408,7 @@ nltk==3.8.1 # via # -r requirements/base.txt # unstructured -notebook==7.0.2 +notebook==7.0.3 # via jupyter notebook-shim==0.2.3 # via @@ -409,6 +420,7 @@ numpy==1.25.2 # contourpy # layoutparser # matplotlib + # onnx # onnxruntime # opencv-python # pandas @@ -424,6 +436,10 @@ omegaconf==2.3.0 # via # -r requirements/base.txt # effdet +onnx==1.14.1 + # via + # -r requirements/base.txt + # unstructured-inference onnxruntime==1.15.1 # via # -r requirements/base.txt @@ -450,6 +466,7 @@ packaging==23.1 # jupyter-server # jupyterlab # jupyterlab-server + # marshmallow # matplotlib # nbconvert # onnxruntime @@ -458,7 +475,7 @@ packaging==23.1 # qtconsole # qtpy # transformers -pandas==2.0.3 +pandas==2.1.0 # via # -r requirements/base.txt # layoutparser @@ -490,7 +507,7 @@ pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pillow==9.5.0 +pillow==10.0.0 # via # -r requirements/base.txt # layoutparser @@ -500,7 +517,6 @@ pillow==9.5.0 # pytesseract # python-pptx # torchvision - # unstructured platformdirs==3.10.0 # via # -r requirements/base.txt @@ -521,6 +537,7 @@ prompt-toolkit==3.0.39 protobuf==4.24.2 # via # -r requirements/base.txt + # onnx # onnxruntime psutil==5.9.5 # via @@ -568,7 +585,7 @@ pyparsing==3.0.9 # via # -r requirements/base.txt # matplotlib -pypdf==3.15.4 +pypdf==3.15.5 # via -r requirements/base.txt pypdfium2==4.19.0 # via @@ -578,7 +595,7 @@ pytesseract==0.3.10 # via # -r requirements/base.txt # layoutparser -pytest==7.4.0 +pytest==7.4.1 # via # pytest-cov # pytest-mock @@ -612,11 +629,11 @@ python-multipart==0.0.6 # -r requirements/base.txt # unstructured-api-tools # unstructured-inference -python-pptx==0.6.22 +python-pptx==0.6.21 # via # -r requirements/base.txt # unstructured -pytz==2023.3 +pytz==2023.3.post1 # via # -r requirements/base.txt # pandas @@ -639,7 +656,7 @@ pyzmq==25.1.1 # jupyter-console # jupyter-server # qtconsole -qtconsole==5.4.3 +qtconsole==5.4.4 # via jupyter qtpy==2.4.0 # via qtconsole @@ -672,7 +689,7 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events -rpds-py==0.10.0 +rpds-py==0.10.2 # via # -r requirements/base.txt # jsonschema @@ -704,7 +721,7 @@ sniffio==1.3.0 # anyio # httpcore # httpx -soupsieve==2.4.1 +soupsieve==2.5 # via # -r requirements/base.txt # beautifulsoup4 @@ -727,7 +744,7 @@ terminado==0.17.1 # via # jupyter-server # jupyter-server-terminals -timm==0.9.5 +timm==0.9.7 # via # -r requirements/base.txt # effdet @@ -795,7 +812,7 @@ traitlets==5.9.0 # nbconvert # nbformat # qtconsole -transformers==4.32.1 +transformers==4.33.0 # via # -r requirements/base.txt # unstructured-inference @@ -820,18 +837,24 @@ typing-extensions==4.7.1 # huggingface-hub # iopath # mypy + # onnx # pydantic # torch + # typing-inspect # uvicorn +typing-inspect==0.9.0 + # via + # -r requirements/base.txt + # dataclasses-json tzdata==2023.3 # via # -r requirements/base.txt # pandas -unstructured[local-inference]==0.10.8 +unstructured[local-inference]==0.10.12 # via -r requirements/base.txt unstructured-api-tools==0.10.11 # via -r requirements/base.txt -unstructured-inference==0.5.17 +unstructured-inference==0.5.22 # via # -r requirements/base.txt # unstructured diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index 14d0030b..871760bf 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -2,7 +2,6 @@ import io import pytest -import re import requests import pandas as pd from fastapi.testclient import TestClient @@ -277,31 +276,12 @@ def test_xml_keep_tags_param(): data={"xml_keep_tags": "true", "strategy": "hi_res"}, ) assert response.status_code == 200 - response_with_xml_tags = response.json()[3:] # skip the initial encoding tag(s) - # The responses should have the same content except for the xml tags - response_with_xml_tags_index, response_without_xml_tags_index = 0, 0 - while response_without_xml_tags_index < len(response_without_xml_tags): - xml_tagged_line = response_with_xml_tags[response_with_xml_tags_index]["text"] - assert xml_tagged_line.startswith("<") - assert xml_tagged_line.endswith(">") - - # if there is content on this line, ensure it matches the content on the non tagged line - xml_tagged_line_content = xml_tagged_line.split(">", 1)[1] # remove opening tag - if not xml_tagged_line_content: - response_with_xml_tags_index += 1 - - else: - xml_tagged_line_content = xml_tagged_line_content.split("<", 1)[0] # remove closing tag - - xml_untagged_line = response_without_xml_tags[response_without_xml_tags_index]["text"] - xml_tagged_line_content_parsed = re.sub( - "&", "&", xml_tagged_line_content - ) # xml_keep_tags does not currently parse the inner content - assert xml_tagged_line_content_parsed == xml_untagged_line - - response_with_xml_tags_index += 1 - response_without_xml_tags_index += 1 + # xml_keep_tags returns one element with the full xml + # Just assert the tags are still present + response_with_xml_tags = response.json()[0] + for element in response_without_xml_tags: + assert element["text"].replace("&", "&") in response_with_xml_tags["text"] def test_include_page_breaks_param():