From 7d8487721e2a3efe42b7975ab833df970992314d Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Fri, 8 Sep 2023 12:17:30 -0700 Subject: [PATCH 01/16] Dockerfile updated with a conditional to decide which paddlepaddle wheel to install --- Dockerfile | 9 +- docs/requirements.txt | 8 +- requirements/base.txt | 4 +- requirements/build.txt | 8 +- requirements/dev.txt | 36 +---- requirements/extra-csv.txt | 6 +- requirements/extra-docx.txt | 2 +- requirements/extra-epub.txt | 2 +- requirements/extra-markdown.txt | 6 +- requirements/extra-msg.txt | 2 +- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.in | 4 + requirements/extra-paddleocr.txt | 209 ++++++++++++++++++++++++++ requirements/extra-pandoc.txt | 2 +- requirements/extra-pdf-image.txt | 12 +- requirements/extra-pptx.txt | 4 +- requirements/extra-xlsx.txt | 6 +- requirements/huggingface.txt | 4 +- requirements/ingest-airtable.txt | 2 +- requirements/ingest-azure.txt | 4 +- requirements/ingest-biomed.txt | 2 +- requirements/ingest-box.txt | 4 +- requirements/ingest-confluence.txt | 4 +- requirements/ingest-delta-table.txt | 2 +- requirements/ingest-discord.txt | 2 +- requirements/ingest-dropbox.txt | 2 +- requirements/ingest-elasticsearch.txt | 2 +- requirements/ingest-gcs.txt | 4 +- requirements/ingest-github.txt | 2 +- requirements/ingest-gitlab.txt | 2 +- requirements/ingest-google-drive.txt | 4 +- requirements/ingest-notion.txt | 2 +- requirements/ingest-onedrive.txt | 4 +- requirements/ingest-outlook.txt | 4 +- requirements/ingest-reddit.txt | 2 +- requirements/ingest-s3.txt | 6 +- requirements/ingest-salesforce.txt | 15 +- requirements/ingest-sharepoint.txt | 4 +- requirements/ingest-slack.txt | 2 +- requirements/ingest-wikipedia.txt | 2 +- requirements/test.txt | 10 +- 41 files changed, 297 insertions(+), 116 deletions(-) create mode 100644 requirements/extra-paddleocr.in create mode 100644 requirements/extra-paddleocr.txt diff --git a/Dockerfile b/Dockerfile index ed9265b043..ef31c8182f 
100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,8 @@ FROM base as deps # Copy and install Unstructured COPY requirements requirements -RUN python3.10 -m pip install pip==${PIP_VERSION} && \ +RUN export ARCH=$(uname -m) && \ + python3.10 -m pip install pip==${PIP_VERSION} && \ dnf -y groupinstall "Development Tools" && \ pip install --no-cache -r requirements/base.txt && \ pip install --no-cache -r requirements/test.txt && \ @@ -50,10 +51,16 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} && \ pip install --no-cache -r requirements/extra-markdown.txt && \ pip install --no-cache -r requirements/extra-msg.txt && \ pip install --no-cache -r requirements/extra-odt.txt && \ + pip install --no-cache -r requirements/extra-paddleocr.txt && \ pip install --no-cache -r requirements/extra-pandoc.txt && \ pip install --no-cache -r requirements/extra-pdf-image.txt && \ pip install --no-cache -r requirements/extra-pptx.txt && \ pip install --no-cache -r requirements/extra-xlsx.txt && \ + # aarch64 requires a custom build of paddlepaddle + if [ "$ARCH" == "aarch64" ]; \ + then pip install --no-cache unstructured.paddlepaddle; \ + else pip install --no-cache paddlepaddle; \ + fi && \ dnf -y groupremove "Development Tools" && \ dnf clean all diff --git a/docs/requirements.txt b/docs/requirements.txt index 7202df621d..0628f19401 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/build.in @@ -35,8 +35,6 @@ idna==3.4 # requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.8.0 - # via sphinx jinja2==3.1.2 # via sphinx markupsafe==2.1.3 @@ -50,8 +48,6 @@ pygments==2.16.1 # furo # sphinx # sphinx-tabs -pytz==2023.3 - # via babel requests==2.31.0 # via # -c requirements/base.txt @@ -105,5 +101,3 @@ urllib3==1.26.16 # -c requirements/base.txt # -c 
requirements/constraints.in # requests -zipp==3.16.2 - # via importlib-metadata diff --git a/requirements/base.txt b/requirements/base.txt index c39a0b392d..aa285039cf 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/base.in @@ -16,7 +16,7 @@ charset-normalizer==3.2.0 # via requests click==8.1.7 # via nltk -dataclasses-json==0.5.14 +dataclasses-json==0.6.0 # via -r requirements/base.in emoji==2.8.0 # via -r requirements/base.in diff --git a/requirements/build.txt b/requirements/build.txt index 7202df621d..0628f19401 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/build.in @@ -35,8 +35,6 @@ idna==3.4 # requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.8.0 - # via sphinx jinja2==3.1.2 # via sphinx markupsafe==2.1.3 @@ -50,8 +48,6 @@ pygments==2.16.1 # furo # sphinx # sphinx-tabs -pytz==2023.3 - # via babel requests==2.31.0 # via # -c requirements/base.txt @@ -105,5 +101,3 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests -zipp==3.16.2 - # via importlib-metadata diff --git a/requirements/dev.txt b/requirements/dev.txt index aac0db5170..b0e27ff69b 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/dev.in @@ -16,7 +16,7 @@ argon2-cffi-bindings==21.2.0 # via argon2-cffi arrow==1.2.3 # via isoduration -asttokens==2.3.0 +asttokens==2.4.0 # via stack-data async-lru==2.0.4 # via jupyterlab @@ -34,7 
+34,7 @@ beautifulsoup4==4.12.2 # nbconvert bleach==6.0.0 # via nbconvert -build==1.0.0 +build==1.0.3 # via pip-tools certifi==2023.7.22 # via @@ -60,7 +60,7 @@ comm==0.1.4 # via # ipykernel # ipywidgets -debugpy==1.6.7.post1 +debugpy==1.7.0 # via ipykernel decorator==5.1.1 # via ipython @@ -89,20 +89,7 @@ idna==3.4 # anyio # jsonschema # requests -importlib-metadata==6.8.0 - # via - # build - # jupyter-client - # jupyter-lsp - # jupyterlab - # jupyterlab-server - # nbconvert -importlib-resources==6.0.1 - # via - # jsonschema - # jsonschema-specifications - # jupyterlab -ipykernel==6.25.1 +ipykernel==6.25.2 # via # jupyter # jupyter-console @@ -241,8 +228,6 @@ pickleshare==0.7.5 # via ipython pip-tools==7.3.0 # via -r requirements/dev.in -pkgutil-resolve-name==1.3.10 - # via jsonschema platformdirs==3.10.0 # via # -c requirements/test.txt @@ -281,8 +266,6 @@ python-dateutil==2.8.2 # jupyter-client python-json-logger==2.0.7 # via jupyter-events -pytz==2023.3 - # via babel pyyaml==6.0.1 # via # -c requirements/test.txt @@ -317,7 +300,7 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events -rpds-py==0.10.0 +rpds-py==0.10.2 # via # jsonschema # referencing @@ -382,7 +365,6 @@ typing-extensions==4.7.1 # -c requirements/test.txt # async-lru # filelock - # ipython uri-template==1.3.0 # via jsonschema urllib3==1.26.16 @@ -391,7 +373,7 @@ urllib3==1.26.16 # -c requirements/constraints.in # -c requirements/test.txt # requests -virtualenv==20.24.4 +virtualenv==20.24.5 # via pre-commit wcwidth==0.2.6 # via prompt-toolkit @@ -409,10 +391,6 @@ wheel==0.41.2 # pip-tools widgetsnbextension==4.0.8 # via ipywidgets -zipp==3.16.2 - # via - # importlib-metadata - # importlib-resources # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 49c05b1513..09fd95766b 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -1,5 +1,5 @@ # -# This file is 
autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-csv.in @@ -8,11 +8,11 @@ numpy==1.24.4 # via # -c requirements/constraints.in # pandas -pandas==2.0.3 +pandas==2.1.0 # via -r requirements/extra-csv.in python-dateutil==2.8.2 # via pandas -pytz==2023.3 +pytz==2023.3.post1 # via pandas six==1.16.0 # via python-dateutil diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 7e83c2bdac..94a125a309 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-docx.in diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 6b461bb847..4bed66d387 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 57e2210d43..a6bb94dcd6 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -1,12 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-markdown.in # -importlib-metadata==6.8.0 - # via markdown markdown==3.4.4 # via -r requirements/extra-markdown.in -zipp==3.16.2 - # via importlib-metadata diff --git a/requirements/extra-msg.txt b/requirements/extra-msg.txt index 2ab29747ea..257c1d17cb 100644 --- a/requirements/extra-msg.txt +++ b/requirements/extra-msg.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile 
with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-msg.in diff --git a/requirements/extra-odt.txt b/requirements/extra-odt.txt index ea84eaa7c6..2a2a20f2bf 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-odt.in diff --git a/requirements/extra-paddleocr.in b/requirements/extra-paddleocr.in new file mode 100644 index 0000000000..739a708ba5 --- /dev/null +++ b/requirements/extra-paddleocr.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt + +unstructured.paddleocr==2.6.1.2 \ No newline at end of file diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt new file mode 100644 index 0000000000..26e934f7b5 --- /dev/null +++ b/requirements/extra-paddleocr.txt @@ -0,0 +1,209 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile requirements/extra-paddleocr.in +# +attrdict==2.0.1 + # via unstructured-paddleocr +babel==2.12.1 + # via flask-babel +bce-python-sdk==0.8.90 + # via visualdl +blinker==1.6.2 + # via flask +cachetools==5.3.1 + # via premailer +certifi==2023.7.22 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests +charset-normalizer==3.2.0 + # via + # -c requirements/base.txt + # requests +click==8.1.7 + # via + # -c requirements/base.txt + # flask +contourpy==1.1.0 + # via matplotlib +cssselect==1.2.0 + # via premailer +cssutils==2.7.1 + # via premailer +cycler==0.11.0 + # via matplotlib +cython==3.0.2 + # via unstructured-paddleocr +et-xmlfile==1.1.0 + # via openpyxl +flask==2.3.3 + # via + # flask-babel + # visualdl +flask-babel==3.1.0 + # via visualdl +fonttools==4.42.1 + # via matplotlib +future==0.18.3 + # via bce-python-sdk 
+idna==3.4 + # via + # -c requirements/base.txt + # requests +imageio==2.31.3 + # via + # imgaug + # scikit-image +imgaug==0.4.0 + # via unstructured-paddleocr +itsdangerous==2.1.2 + # via flask +jinja2==3.1.2 + # via + # flask + # flask-babel +kiwisolver==1.4.5 + # via matplotlib +lanms-neo==1.0.2 + # via unstructured-paddleocr +lazy-loader==0.3 + # via scikit-image +lmdb==1.4.1 + # via unstructured-paddleocr +lxml==4.9.3 + # via + # -c requirements/base.txt + # premailer + # unstructured-paddleocr +markupsafe==2.1.3 + # via + # jinja2 + # werkzeug +matplotlib==3.7.2 + # via + # imgaug + # visualdl +networkx==3.1 + # via scikit-image +numpy==1.24.4 + # via + # -c requirements/constraints.in + # contourpy + # imageio + # imgaug + # matplotlib + # opencv-contrib-python + # opencv-python + # pandas + # pywavelets + # scikit-image + # scipy + # shapely + # tifffile + # unstructured-paddleocr + # visualdl +opencv-contrib-python==4.6.0.66 + # via unstructured-paddleocr +opencv-python==4.6.0.66 + # via + # imgaug + # unstructured-paddleocr +openpyxl==3.1.2 + # via unstructured-paddleocr +packaging==23.1 + # via + # -c requirements/base.txt + # matplotlib + # scikit-image + # visualdl +pandas==2.1.0 + # via visualdl +pdf2image==1.16.3 + # via unstructured-paddleocr +pillow==10.0.0 + # via + # imageio + # imgaug + # matplotlib + # pdf2image + # scikit-image + # visualdl +polygon3==3.0.9.1 + # via unstructured-paddleocr +premailer==3.10.0 + # via unstructured-paddleocr +protobuf==4.23.4 + # via + # -c requirements/constraints.in + # visualdl +psutil==5.9.5 + # via visualdl +pyclipper==1.3.0.post5 + # via unstructured-paddleocr +pycryptodome==3.18.0 + # via bce-python-sdk +pyparsing==3.0.9 + # via + # -c requirements/constraints.in + # matplotlib +python-dateutil==2.8.2 + # via + # matplotlib + # pandas +pytz==2023.3.post1 + # via + # flask-babel + # pandas +pywavelets==1.4.1 + # via scikit-image +rapidfuzz==3.2.0 + # via unstructured-paddleocr +rarfile==4.0 + # via visualdl 
+requests==2.31.0 + # via + # -c requirements/base.txt + # premailer + # visualdl +scikit-image==0.21.0 + # via + # imgaug + # unstructured-paddleocr +scipy==1.10.1 + # via + # -c requirements/constraints.in + # imgaug + # scikit-image +shapely==2.0.1 + # via + # imgaug + # unstructured-paddleocr +six==1.16.0 + # via + # attrdict + # bce-python-sdk + # imgaug + # python-dateutil + # visualdl +tifffile==2023.8.30 + # via scikit-image +tqdm==4.66.1 + # via + # -c requirements/base.txt + # unstructured-paddleocr +tzdata==2023.3 + # via pandas +unstructured-paddleocr==2.6.1.2 + # via -r requirements/extra-paddleocr.in +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests +visualdl==2.5.3 + # via unstructured-paddleocr +werkzeug==2.3.7 + # via flask diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index f96028331b..e9136dd091 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-pandoc.in diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 906523efd4..a13cdaa135 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-pdf-image.in @@ -50,8 +50,6 @@ idna==3.4 # via # -c requirements/base.txt # requests -importlib-resources==6.0.1 - # via matplotlib iopath==0.1.10 # via layoutparser jinja2==3.1.2 @@ -100,7 +98,7 @@ packaging==23.1 # onnxruntime # pytesseract # transformers -pandas==2.0.3 +pandas==2.1.0 # via layoutparser pdf2image==1.16.3 # via @@ -145,7 +143,7 @@ python-dateutil==2.8.2 # pandas 
python-multipart==0.0.6 # via unstructured-inference -pytz==2023.3 +pytz==2023.3.post1 # via pandas pyyaml==6.0.1 # via @@ -200,7 +198,7 @@ tqdm==4.66.1 # huggingface-hub # iopath # transformers -transformers==4.32.1 +transformers==4.33.1 # via unstructured-inference typing-extensions==4.7.1 # via @@ -221,5 +219,3 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests -zipp==3.16.2 - # via importlib-resources diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 3506bbea72..b2b78f3b18 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-pptx.in @@ -10,5 +10,5 @@ pillow==10.0.0 # via python-pptx python-pptx==0.6.21 # via -r requirements/extra-pptx.in -xlsxwriter==3.1.2 +xlsxwriter==3.1.3 # via python-pptx diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index f29e3b339e..0e3189cce3 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/extra-xlsx.in @@ -12,11 +12,11 @@ numpy==1.24.4 # pandas openpyxl==3.1.2 # via -r requirements/extra-xlsx.in -pandas==2.0.3 +pandas==2.1.0 # via -r requirements/extra-xlsx.in python-dateutil==2.8.2 # via pandas -pytz==2023.3 +pytz==2023.3.post1 # via pandas six==1.16.0 # via python-dateutil diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index a766131db9..d1c32a500c 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the 
following command: # # pip-compile requirements/huggingface.in @@ -91,7 +91,7 @@ tqdm==4.66.1 # huggingface-hub # sacremoses # transformers -transformers==4.32.1 +transformers==4.33.1 # via -r requirements/huggingface.in typing-extensions==4.7.1 # via diff --git a/requirements/ingest-airtable.txt b/requirements/ingest-airtable.txt index d5cf713014..85886c4bad 100644 --- a/requirements/ingest-airtable.txt +++ b/requirements/ingest-airtable.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-airtable.in diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 3e3a32e3b6..0b313cc2a8 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-azure.in @@ -14,7 +14,7 @@ async-timeout==4.0.3 # via aiohttp attrs==23.1.0 # via aiohttp -azure-core==1.29.3 +azure-core==1.29.4 # via # adlfs # azure-identity diff --git a/requirements/ingest-biomed.txt b/requirements/ingest-biomed.txt index 069b11a6ab..ccb2d5ca4a 100644 --- a/requirements/ingest-biomed.txt +++ b/requirements/ingest-biomed.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-biomed.in diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index 54ff34a70c..a0d2e5b7ce 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile 
requirements/ingest-box.in @@ -8,7 +8,7 @@ attrs==23.1.0 # via boxsdk boxfs==0.2.1 # via -r requirements/ingest-box.in -boxsdk[jwt]==3.8.1 +boxsdk[jwt]==3.9.0 # via boxfs certifi==2023.7.22 # via diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt index 5c30a166b4..ef59cf61d0 100644 --- a/requirements/ingest-confluence.txt +++ b/requirements/ingest-confluence.txt @@ -1,10 +1,10 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-confluence.in # -atlassian-python-api==3.41.1 +atlassian-python-api==3.41.2 # via -r requirements/ingest-confluence.in certifi==2023.7.22 # via diff --git a/requirements/ingest-delta-table.txt b/requirements/ingest-delta-table.txt index d654af0e0a..b1af318b3a 100644 --- a/requirements/ingest-delta-table.txt +++ b/requirements/ingest-delta-table.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-delta-table.in diff --git a/requirements/ingest-discord.txt b/requirements/ingest-discord.txt index 72d0250723..130557e46a 100644 --- a/requirements/ingest-discord.txt +++ b/requirements/ingest-discord.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-discord.in diff --git a/requirements/ingest-dropbox.txt b/requirements/ingest-dropbox.txt index d76cd2f172..41bbd69812 100644 --- a/requirements/ingest-dropbox.txt +++ b/requirements/ingest-dropbox.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-dropbox.in diff 
--git a/requirements/ingest-elasticsearch.txt b/requirements/ingest-elasticsearch.txt index 89f91ad54b..767f1c45c1 100644 --- a/requirements/ingest-elasticsearch.txt +++ b/requirements/ingest-elasticsearch.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-elasticsearch.in diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index c21313d55e..dcafacc255 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-gcs.in @@ -61,7 +61,7 @@ google-cloud-storage==2.10.0 # via gcsfs google-crc32c==1.5.0 # via google-resumable-media -google-resumable-media==2.5.0 +google-resumable-media==2.6.0 # via google-cloud-storage googleapis-common-protos==1.60.0 # via google-api-core diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index 575360470e..709822d08d 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-github.in diff --git a/requirements/ingest-gitlab.txt b/requirements/ingest-gitlab.txt index dbff64042c..9a938e3fc4 100644 --- a/requirements/ingest-gitlab.txt +++ b/requirements/ingest-gitlab.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-gitlab.in diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index 4512f466d1..a2132aa928 
100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-google-drive.in @@ -17,7 +17,7 @@ charset-normalizer==3.2.0 # requests google-api-core==2.11.1 # via google-api-python-client -google-api-python-client==2.97.0 +google-api-python-client==2.98.0 # via -r requirements/ingest-google-drive.in google-auth==2.22.0 # via diff --git a/requirements/ingest-notion.txt b/requirements/ingest-notion.txt index a5cb1c36a7..430e4f251f 100644 --- a/requirements/ingest-notion.txt +++ b/requirements/ingest-notion.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-notion.in diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index b98a2d9d0a..482178a0a5 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-onedrive.in @@ -41,7 +41,7 @@ pycparser==2.21 # via cffi pyjwt[crypto]==2.8.0 # via msal -pytz==2023.3 +pytz==2023.3.post1 # via office365-rest-python-client requests==2.31.0 # via diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt index 869b24d468..c83bb6d26d 100644 --- a/requirements/ingest-outlook.txt +++ b/requirements/ingest-outlook.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-outlook.in @@ -35,7 +35,7 @@ pycparser==2.21 # via cffi 
pyjwt[crypto]==2.8.0 # via msal -pytz==2023.3 +pytz==2023.3.post1 # via office365-rest-python-client requests==2.31.0 # via diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index bc42ef8ece..0d02afb8d5 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-reddit.in diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index 41b66ac491..6df6d42c6d 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-s3.in @@ -48,10 +48,6 @@ s3fs==2023.9.0 # via -r requirements/ingest-s3.in six==1.16.0 # via python-dateutil -typing-extensions==4.7.1 - # via - # -c requirements/base.txt - # aioitertools urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/requirements/ingest-salesforce.txt b/requirements/ingest-salesforce.txt index 3072f4eb57..de7049b8c1 100644 --- a/requirements/ingest-salesforce.txt +++ b/requirements/ingest-salesforce.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-salesforce.in @@ -29,14 +29,22 @@ lxml==4.9.3 # via # -c requirements/base.txt # zeep +more-itertools==10.1.0 + # via simple-salesforce +pendulum==2.1.2 + # via simple-salesforce platformdirs==3.10.0 # via zeep pycparser==2.21 # via cffi pyjwt==2.8.0 # via simple-salesforce -pytz==2023.3 +python-dateutil==2.8.2 + # via pendulum +pytz==2023.3.post1 # via zeep +pytzdata==2020.1 + # via pendulum requests==2.31.0 # via # -c requirements/base.txt @@ 
-48,11 +56,12 @@ requests-file==1.5.1 # via zeep requests-toolbelt==1.0.0 # via zeep -simple-salesforce==1.12.4 +simple-salesforce==1.12.5 # via -r requirements/ingest-salesforce.in six==1.16.0 # via # isodate + # python-dateutil # requests-file urllib3==1.26.16 # via diff --git a/requirements/ingest-sharepoint.txt b/requirements/ingest-sharepoint.txt index 075349801c..0a4e3da200 100644 --- a/requirements/ingest-sharepoint.txt +++ b/requirements/ingest-sharepoint.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-sharepoint.in @@ -35,7 +35,7 @@ pycparser==2.21 # via cffi pyjwt[crypto]==2.8.0 # via msal -pytz==2023.3 +pytz==2023.3.post1 # via office365-rest-python-client requests==2.31.0 # via diff --git a/requirements/ingest-slack.txt b/requirements/ingest-slack.txt index b8c94147ba..42dc8ad984 100644 --- a/requirements/ingest-slack.txt +++ b/requirements/ingest-slack.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-slack.in diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index e391f0156e..7d179556a4 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/ingest-wikipedia.in diff --git a/requirements/test.txt b/requirements/test.txt index a0ebcd83c3..9a0eb38b82 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.8 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # 
pip-compile requirements/test.in @@ -22,7 +22,7 @@ click==8.1.7 # -c requirements/base.txt # -r requirements/test.in # black -coverage[toml]==7.3.0 +coverage[toml]==7.3.1 # via # -r requirements/test.in # pytest-cov @@ -32,7 +32,7 @@ flake8==6.1.0 # via -r requirements/test.in freezegun==1.2.2 # via -r requirements/test.in -grpcio==1.57.0 +grpcio==1.58.0 # via -r requirements/test.in idna==3.4 # via @@ -81,7 +81,7 @@ pydantic==1.10.12 # label-studio-sdk pyflakes==3.1.0 # via flake8 -pytest==7.4.1 +pytest==7.4.2 # via # pytest-cov # pytest-mock @@ -120,7 +120,6 @@ types-urllib3==1.26.25.14 typing-extensions==4.7.1 # via # -c requirements/base.txt - # black # mypy # pydantic urllib3==1.26.16 @@ -128,7 +127,6 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests - # vcrpy vcrpy==5.1.0 # via -r requirements/test.in wrapt==1.15.0 From c3181c9382866a6a1d39a66c2e393e650a50fa3c Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Fri, 8 Sep 2023 13:10:26 -0700 Subject: [PATCH 02/16] Bump version, add makefile command for non docker installs --- CHANGELOG.md | 5 +++-- Makefile | 3 +++ scripts/install-paddleocr.sh | 8 ++++++++ unstructured/__version__.py | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) create mode 100755 scripts/install-paddleocr.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index fbf5a00915..3808f93ce6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ -## 0.10.13-dev1 +## 0.10.13-dev2 ### Enhancements * Updated documentation: Added back support doc types for partitioning, more Python codes in the API page, RAG definition, and use case. 
+* Installs PaddleOCR in Docker images including support for aarch64 architecture ### Fixes @@ -28,7 +29,7 @@ * Bump unstructured-inference * Avoid divide-by-zero errors swith `safe_division` (0.5.21) - + ## 0.10.11 ### Enhancements diff --git a/Makefile b/Makefile index 147fd77db8..81d5376bff 100644 --- a/Makefile +++ b/Makefile @@ -204,6 +204,9 @@ install-local-inference: install install-all-docs install-pandoc: ARCH=${ARCH} ./scripts/install-pandoc.sh +.PHONY: install-paddleocr +install-paddleocr: + ARCH=${ARCH} ./scripts/install-paddleocr.sh ## pip-compile: compiles all base/dev/test requirements .PHONY: pip-compile diff --git a/scripts/install-paddleocr.sh b/scripts/install-paddleocr.sh new file mode 100755 index 0000000000..f0257edba7 --- /dev/null +++ b/scripts/install-paddleocr.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +if [ "${ARCH}" = "aarch64" ]; then + python3 -m pip install unstructured.paddlepaddle; +else + python3 -m pip install paddlepaddle; +fi +python3 -m pip install paddleocr \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9497f9dce4..f219b96686 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.13-dev1" # pragma: no cover +__version__ = "0.10.13-dev2" # pragma: no cover From 550866e3e431c6b85a185f174878d41b3dab27ea Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Fri, 8 Sep 2023 14:23:12 -0700 Subject: [PATCH 03/16] Removing dockerfile command and adding script to install --- Dockerfile | 8 +------- scripts/install-paddleocr.sh | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index ef31c8182f..b34711b077 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,8 +22,7 @@ FROM base as deps # Copy and install Unstructured COPY requirements requirements -RUN export ARCH=$(uname -m) && \ - python3.10 -m pip install pip==${PIP_VERSION} && \ +RUN python3.10 -m pip install pip==${PIP_VERSION} && \ dnf -y 
groupinstall "Development Tools" && \ pip install --no-cache -r requirements/base.txt && \ pip install --no-cache -r requirements/test.txt && \ @@ -56,11 +55,6 @@ RUN export ARCH=$(uname -m) && \ pip install --no-cache -r requirements/extra-pdf-image.txt && \ pip install --no-cache -r requirements/extra-pptx.txt && \ pip install --no-cache -r requirements/extra-xlsx.txt && \ - # aarch64 requires a custom build of paddlepaddle - if [ "$ARCH" == "aarch64" ]; \ - then pip install --no-cache unstructured.paddlepaddle; \ - else pip install --no-cache paddlepaddle; \ - fi && \ dnf -y groupremove "Development Tools" && \ dnf clean all diff --git a/scripts/install-paddleocr.sh b/scripts/install-paddleocr.sh index f0257edba7..4ec1917e07 100755 --- a/scripts/install-paddleocr.sh +++ b/scripts/install-paddleocr.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash +# aarch64 requires a custom build of paddlepaddle if [ "${ARCH}" = "aarch64" ]; then python3 -m pip install unstructured.paddlepaddle; else From 5d175b1c016326dc2e1a708914cf39a56e4e8090 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Fri, 8 Sep 2023 14:39:38 -0700 Subject: [PATCH 04/16] Revert pip-compile to use python3.8 This reverts commit 7d8487721e2a3efe42b7975ab833df970992314d. 
--- Dockerfile | 1 - docs/requirements.txt | 8 +++++++- requirements/base.txt | 2 +- requirements/build.txt | 8 +++++++- requirements/dev.txt | 24 +++++++++++++++++++++++- requirements/extra-csv.txt | 4 ++-- requirements/extra-docx.txt | 2 +- requirements/extra-epub.txt | 2 +- requirements/extra-markdown.txt | 6 +++++- requirements/extra-msg.txt | 2 +- requirements/extra-odt.txt | 2 +- requirements/extra-paddleocr.txt | 15 ++++++++++++--- requirements/extra-pandoc.txt | 2 +- requirements/extra-pdf-image.txt | 8 ++++++-- requirements/extra-pptx.txt | 2 +- requirements/extra-xlsx.txt | 4 ++-- requirements/huggingface.txt | 2 +- requirements/ingest-airtable.txt | 2 +- requirements/ingest-azure.txt | 2 +- requirements/ingest-biomed.txt | 2 +- requirements/ingest-box.txt | 2 +- requirements/ingest-confluence.txt | 2 +- requirements/ingest-delta-table.txt | 2 +- requirements/ingest-discord.txt | 2 +- requirements/ingest-dropbox.txt | 2 +- requirements/ingest-elasticsearch.txt | 2 +- requirements/ingest-gcs.txt | 2 +- requirements/ingest-github.txt | 2 +- requirements/ingest-gitlab.txt | 2 +- requirements/ingest-google-drive.txt | 2 +- requirements/ingest-notion.txt | 2 +- requirements/ingest-onedrive.txt | 2 +- requirements/ingest-outlook.txt | 2 +- requirements/ingest-reddit.txt | 2 +- requirements/ingest-s3.txt | 6 +++++- requirements/ingest-salesforce.txt | 2 +- requirements/ingest-sharepoint.txt | 2 +- requirements/ingest-slack.txt | 2 +- requirements/ingest-wikipedia.txt | 2 +- requirements/test.txt | 4 +++- 40 files changed, 101 insertions(+), 45 deletions(-) diff --git a/Dockerfile b/Dockerfile index b34711b077..ed9265b043 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,7 +50,6 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} && \ pip install --no-cache -r requirements/extra-markdown.txt && \ pip install --no-cache -r requirements/extra-msg.txt && \ pip install --no-cache -r requirements/extra-odt.txt && \ - pip install --no-cache -r 
requirements/extra-paddleocr.txt && \ pip install --no-cache -r requirements/extra-pandoc.txt && \ pip install --no-cache -r requirements/extra-pdf-image.txt && \ pip install --no-cache -r requirements/extra-pptx.txt && \ diff --git a/docs/requirements.txt b/docs/requirements.txt index 0628f19401..a258b2e8ce 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/build.in @@ -35,6 +35,8 @@ idna==3.4 # requests imagesize==1.4.1 # via sphinx +importlib-metadata==6.8.0 + # via sphinx jinja2==3.1.2 # via sphinx markupsafe==2.1.3 @@ -48,6 +50,8 @@ pygments==2.16.1 # furo # sphinx # sphinx-tabs +pytz==2023.3.post1 + # via babel requests==2.31.0 # via # -c requirements/base.txt @@ -101,3 +105,5 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests +zipp==3.16.2 + # via importlib-metadata diff --git a/requirements/base.txt b/requirements/base.txt index aa285039cf..300e1df499 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/base.in diff --git a/requirements/build.txt b/requirements/build.txt index 0628f19401..a258b2e8ce 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/build.in @@ -35,6 +35,8 @@ idna==3.4 # requests imagesize==1.4.1 # via sphinx +importlib-metadata==6.8.0 + # via sphinx jinja2==3.1.2 # via sphinx markupsafe==2.1.3 @@ -48,6 +50,8 @@ pygments==2.16.1 # furo # sphinx # sphinx-tabs +pytz==2023.3.post1 + # 
via babel requests==2.31.0 # via # -c requirements/base.txt @@ -101,3 +105,5 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests +zipp==3.16.2 + # via importlib-metadata diff --git a/requirements/dev.txt b/requirements/dev.txt index b0e27ff69b..36591e794a 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/dev.in @@ -89,6 +89,19 @@ idna==3.4 # anyio # jsonschema # requests +importlib-metadata==6.8.0 + # via + # build + # jupyter-client + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # nbconvert +importlib-resources==6.0.1 + # via + # jsonschema + # jsonschema-specifications + # jupyterlab ipykernel==6.25.2 # via # jupyter @@ -228,6 +241,8 @@ pickleshare==0.7.5 # via ipython pip-tools==7.3.0 # via -r requirements/dev.in +pkgutil-resolve-name==1.3.10 + # via jsonschema platformdirs==3.10.0 # via # -c requirements/test.txt @@ -266,6 +281,8 @@ python-dateutil==2.8.2 # jupyter-client python-json-logger==2.0.7 # via jupyter-events +pytz==2023.3.post1 + # via babel pyyaml==6.0.1 # via # -c requirements/test.txt @@ -365,6 +382,7 @@ typing-extensions==4.7.1 # -c requirements/test.txt # async-lru # filelock + # ipython uri-template==1.3.0 # via jsonschema urllib3==1.26.16 @@ -391,6 +409,10 @@ wheel==0.41.2 # pip-tools widgetsnbextension==4.0.8 # via ipywidgets +zipp==3.16.2 + # via + # importlib-metadata + # importlib-resources # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 09fd95766b..f7328b8549 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the 
following command: # # pip-compile requirements/extra-csv.in @@ -8,7 +8,7 @@ numpy==1.24.4 # via # -c requirements/constraints.in # pandas -pandas==2.1.0 +pandas==2.0.3 # via -r requirements/extra-csv.in python-dateutil==2.8.2 # via pandas diff --git a/requirements/extra-docx.txt b/requirements/extra-docx.txt index 94a125a309..7e83c2bdac 100644 --- a/requirements/extra-docx.txt +++ b/requirements/extra-docx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-docx.in diff --git a/requirements/extra-epub.txt b/requirements/extra-epub.txt index 4bed66d387..6b461bb847 100644 --- a/requirements/extra-epub.txt +++ b/requirements/extra-epub.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-epub.in diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index a6bb94dcd6..57e2210d43 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -1,8 +1,12 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-markdown.in # +importlib-metadata==6.8.0 + # via markdown markdown==3.4.4 # via -r requirements/extra-markdown.in +zipp==3.16.2 + # via importlib-metadata diff --git a/requirements/extra-msg.txt b/requirements/extra-msg.txt index 257c1d17cb..2ab29747ea 100644 --- a/requirements/extra-msg.txt +++ b/requirements/extra-msg.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-msg.in diff --git a/requirements/extra-odt.txt 
b/requirements/extra-odt.txt index 2a2a20f2bf..ea84eaa7c6 100644 --- a/requirements/extra-odt.txt +++ b/requirements/extra-odt.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-odt.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 26e934f7b5..c9a2ae0a2b 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-paddleocr.in @@ -59,6 +59,10 @@ imageio==2.31.3 # scikit-image imgaug==0.4.0 # via unstructured-paddleocr +importlib-metadata==6.8.0 + # via flask +importlib-resources==6.0.1 + # via matplotlib itsdangerous==2.1.2 # via flask jinja2==3.1.2 @@ -119,7 +123,7 @@ packaging==23.1 # matplotlib # scikit-image # visualdl -pandas==2.1.0 +pandas==2.0.3 # via visualdl pdf2image==1.16.3 # via unstructured-paddleocr @@ -155,6 +159,7 @@ python-dateutil==2.8.2 # pandas pytz==2023.3.post1 # via + # babel # flask-babel # pandas pywavelets==1.4.1 @@ -188,7 +193,7 @@ six==1.16.0 # imgaug # python-dateutil # visualdl -tifffile==2023.8.30 +tifffile==2023.7.10 # via scikit-image tqdm==4.66.1 # via @@ -207,3 +212,7 @@ visualdl==2.5.3 # via unstructured-paddleocr werkzeug==2.3.7 # via flask +zipp==3.16.2 + # via + # importlib-metadata + # importlib-resources diff --git a/requirements/extra-pandoc.txt b/requirements/extra-pandoc.txt index e9136dd091..f96028331b 100644 --- a/requirements/extra-pandoc.txt +++ b/requirements/extra-pandoc.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-pandoc.in diff --git 
a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index a13cdaa135..903616fe31 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-pdf-image.in @@ -50,6 +50,8 @@ idna==3.4 # via # -c requirements/base.txt # requests +importlib-resources==6.0.1 + # via matplotlib iopath==0.1.10 # via layoutparser jinja2==3.1.2 @@ -98,7 +100,7 @@ packaging==23.1 # onnxruntime # pytesseract # transformers -pandas==2.1.0 +pandas==2.0.3 # via layoutparser pdf2image==1.16.3 # via @@ -219,3 +221,5 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests +zipp==3.16.2 + # via importlib-resources diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index b2b78f3b18..06450e9a16 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-pptx.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index 0e3189cce3..8e53261369 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/extra-xlsx.in @@ -12,7 +12,7 @@ numpy==1.24.4 # pandas openpyxl==3.1.2 # via -r requirements/extra-xlsx.in -pandas==2.1.0 +pandas==2.0.3 # via -r requirements/extra-xlsx.in python-dateutil==2.8.2 # via pandas diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index d1c32a500c..735e4200d6 100644 --- a/requirements/huggingface.txt +++ 
b/requirements/huggingface.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/huggingface.in diff --git a/requirements/ingest-airtable.txt b/requirements/ingest-airtable.txt index 85886c4bad..d5cf713014 100644 --- a/requirements/ingest-airtable.txt +++ b/requirements/ingest-airtable.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-airtable.in diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 0b313cc2a8..6dad298740 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-azure.in diff --git a/requirements/ingest-biomed.txt b/requirements/ingest-biomed.txt index ccb2d5ca4a..069b11a6ab 100644 --- a/requirements/ingest-biomed.txt +++ b/requirements/ingest-biomed.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-biomed.in diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index a0d2e5b7ce..10ee274964 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-box.in diff --git a/requirements/ingest-confluence.txt b/requirements/ingest-confluence.txt index ef59cf61d0..54fbde5f58 100644 --- 
a/requirements/ingest-confluence.txt +++ b/requirements/ingest-confluence.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-confluence.in diff --git a/requirements/ingest-delta-table.txt b/requirements/ingest-delta-table.txt index b1af318b3a..d654af0e0a 100644 --- a/requirements/ingest-delta-table.txt +++ b/requirements/ingest-delta-table.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-delta-table.in diff --git a/requirements/ingest-discord.txt b/requirements/ingest-discord.txt index 130557e46a..72d0250723 100644 --- a/requirements/ingest-discord.txt +++ b/requirements/ingest-discord.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-discord.in diff --git a/requirements/ingest-dropbox.txt b/requirements/ingest-dropbox.txt index 41bbd69812..d76cd2f172 100644 --- a/requirements/ingest-dropbox.txt +++ b/requirements/ingest-dropbox.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-dropbox.in diff --git a/requirements/ingest-elasticsearch.txt b/requirements/ingest-elasticsearch.txt index 767f1c45c1..89f91ad54b 100644 --- a/requirements/ingest-elasticsearch.txt +++ b/requirements/ingest-elasticsearch.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-elasticsearch.in diff --git 
a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index dcafacc255..f258108fa5 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-gcs.in diff --git a/requirements/ingest-github.txt b/requirements/ingest-github.txt index 709822d08d..575360470e 100644 --- a/requirements/ingest-github.txt +++ b/requirements/ingest-github.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-github.in diff --git a/requirements/ingest-gitlab.txt b/requirements/ingest-gitlab.txt index 9a938e3fc4..dbff64042c 100644 --- a/requirements/ingest-gitlab.txt +++ b/requirements/ingest-gitlab.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-gitlab.in diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index a2132aa928..0e39470be5 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-google-drive.in diff --git a/requirements/ingest-notion.txt b/requirements/ingest-notion.txt index 430e4f251f..a5cb1c36a7 100644 --- a/requirements/ingest-notion.txt +++ b/requirements/ingest-notion.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile 
requirements/ingest-notion.in diff --git a/requirements/ingest-onedrive.txt b/requirements/ingest-onedrive.txt index 482178a0a5..623f4306cd 100644 --- a/requirements/ingest-onedrive.txt +++ b/requirements/ingest-onedrive.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-onedrive.in diff --git a/requirements/ingest-outlook.txt b/requirements/ingest-outlook.txt index c83bb6d26d..3ee8246eb6 100644 --- a/requirements/ingest-outlook.txt +++ b/requirements/ingest-outlook.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-outlook.in diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index 0d02afb8d5..bc42ef8ece 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-reddit.in diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index 6df6d42c6d..41b66ac491 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-s3.in @@ -48,6 +48,10 @@ s3fs==2023.9.0 # via -r requirements/ingest-s3.in six==1.16.0 # via python-dateutil +typing-extensions==4.7.1 + # via + # -c requirements/base.txt + # aioitertools urllib3==1.26.16 # via # -c requirements/base.txt diff --git a/requirements/ingest-salesforce.txt b/requirements/ingest-salesforce.txt index de7049b8c1..dceb918bec 100644 --- 
a/requirements/ingest-salesforce.txt +++ b/requirements/ingest-salesforce.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-salesforce.in diff --git a/requirements/ingest-sharepoint.txt b/requirements/ingest-sharepoint.txt index 0a4e3da200..fe5a7bf00e 100644 --- a/requirements/ingest-sharepoint.txt +++ b/requirements/ingest-sharepoint.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-sharepoint.in diff --git a/requirements/ingest-slack.txt b/requirements/ingest-slack.txt index 42dc8ad984..b8c94147ba 100644 --- a/requirements/ingest-slack.txt +++ b/requirements/ingest-slack.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-slack.in diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index 7d179556a4..e391f0156e 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/ingest-wikipedia.in diff --git a/requirements/test.txt b/requirements/test.txt index 9a0eb38b82..6cec9620e8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.8 # by the following command: # # pip-compile requirements/test.in @@ -120,6 +120,7 @@ types-urllib3==1.26.25.14 typing-extensions==4.7.1 # via # -c requirements/base.txt + # 
black # mypy # pydantic urllib3==1.26.16 @@ -127,6 +128,7 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests + # vcrpy vcrpy==5.1.0 # via -r requirements/test.in wrapt==1.15.0 From d593e376152a2cc862df4c13e6fdaac081674a97 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Fri, 8 Sep 2023 14:41:38 -0700 Subject: [PATCH 05/16] add to setup.py extra --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 12980d3748..00978acbf7 100644 --- a/setup.py +++ b/setup.py @@ -152,6 +152,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List # Legacy extra requirements "huggingface": load_requirements("requirements/huggingface.in"), "local-inference": all_doc_reqs, + "paddleocr": load_requirements("requirements/paddleocr.in"), }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt"]}, From fad1e460a3a6eed88b93837f5d0a6d7ea9ee973a Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Fri, 8 Sep 2023 14:46:34 -0700 Subject: [PATCH 06/16] add nl --- scripts/install-paddleocr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install-paddleocr.sh b/scripts/install-paddleocr.sh index 4ec1917e07..0415cc8e48 100755 --- a/scripts/install-paddleocr.sh +++ b/scripts/install-paddleocr.sh @@ -6,4 +6,4 @@ if [ "${ARCH}" = "aarch64" ]; then else python3 -m pip install paddlepaddle; fi -python3 -m pip install paddleocr \ No newline at end of file +python3 -m pip install paddleocr From 2876387d8644bf06bea3db6527741ea952159729 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Mon, 11 Sep 2023 11:25:57 -0700 Subject: [PATCH 07/16] Resolve PR comments --- CHANGELOG.md | 2 +- scripts/install-paddleocr.sh | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93177420ea..2e0dcdd19c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ ### Enhancements * Updated 
documentation: Added back support doc types for partitioning, more Python codes in the API page, RAG definition, and use case. -* Installs PaddleOCR in Docker images including support for aarch64 architecture +* Add paddleocr extra and makefile command to install * Updated Hi-Res Metadata: PDFs and Images using Hi-Res strategy now have layout model class probabilities added ot metadata. * Updated the `_detect_filetype_from_octet_stream()` function to use libmagic to infer the content type of file when it is not a zip file. ### Features diff --git a/scripts/install-paddleocr.sh b/scripts/install-paddleocr.sh index 0415cc8e48..8db79acfb1 100755 --- a/scripts/install-paddleocr.sh +++ b/scripts/install-paddleocr.sh @@ -6,4 +6,4 @@ if [ "${ARCH}" = "aarch64" ]; then else python3 -m pip install paddlepaddle; fi -python3 -m pip install paddleocr +python3 -m pip install unstructured.paddleocr diff --git a/setup.py b/setup.py index 425d7a24e7..407d94fdc9 100644 --- a/setup.py +++ b/setup.py @@ -153,7 +153,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List # Legacy extra requirements "huggingface": load_requirements("requirements/huggingface.in"), "local-inference": all_doc_reqs, - "paddleocr": load_requirements("requirements/paddleocr.in"), + "paddleocr": load_requirements("requirements/extra-paddleocr.in"), }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt"]}, From 384e6f89f0e58aa981c9255234d6c0eda7ce03a2 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Mon, 11 Sep 2023 12:09:36 -0700 Subject: [PATCH 08/16] bump version --- CHANGELOG.md | 2 +- requirements/extra-paddleocr.in | 2 +- unstructured/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d0a829f16..f179406868 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.15-dev1 +## 0.10.15-dev2 ### Enhancements diff --git a/requirements/extra-paddleocr.in 
b/requirements/extra-paddleocr.in index 739a708ba5..b80062babf 100644 --- a/requirements/extra-paddleocr.in +++ b/requirements/extra-paddleocr.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt -unstructured.paddleocr==2.6.1.2 \ No newline at end of file +unstructured.paddleocr==2.6.1.2 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 45743f95ca..2d66edd34f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.15-dev1" # pragma: no cover +__version__ = "0.10.15-dev2" # pragma: no cover From 0fcc952ed31cbf2a61eac99bdd83aa86ab6aa703 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Mon, 11 Sep 2023 12:59:33 -0700 Subject: [PATCH 09/16] Update deps due to conflicts --- requirements/dev.txt | 2 +- requirements/extra-pdf-image.txt | 4 ++-- requirements/huggingface.txt | 2 +- requirements/ingest-delta-table.txt | 4 ++-- requirements/ingest-gcs.txt | 6 ++---- requirements/ingest-google-drive.txt | 8 ++------ requirements/ingest-notion.txt | 4 ++-- requirements/ingest-reddit.txt | 2 +- requirements/test.txt | 4 ++-- 9 files changed, 15 insertions(+), 21 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 36591e794a..3afe41bf47 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -401,7 +401,7 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.6.2 +websocket-client==1.6.3 # via jupyter-server wheel==0.41.2 # via diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 42a9964523..fc51ae05a4 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -39,7 +39,7 @@ fonttools==4.42.1 # via matplotlib fsspec==2023.9.0 # via huggingface-hub -huggingface-hub==0.16.4 +huggingface-hub==0.17.1 # via # timm # transformers @@ -135,7 +135,7 @@ pyparsing==3.0.9 # via # -c requirements/constraints.in # matplotlib -pypdfium2==4.19.0 +pypdfium2==4.20.0 # via pdfplumber pytesseract==0.3.10 # via 
layoutparser diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 735e4200d6..f032a0ef54 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -24,7 +24,7 @@ filelock==3.12.3 # transformers fsspec==2023.9.0 # via huggingface-hub -huggingface-hub==0.16.4 +huggingface-hub==0.17.1 # via transformers idna==3.4 # via diff --git a/requirements/ingest-delta-table.txt b/requirements/ingest-delta-table.txt index d654af0e0a..b9eb20d1af 100644 --- a/requirements/ingest-delta-table.txt +++ b/requirements/ingest-delta-table.txt @@ -4,7 +4,7 @@ # # pip-compile requirements/ingest-delta-table.in # -deltalake==0.10.1 +deltalake==0.10.2 # via -r requirements/ingest-delta-table.in fsspec==2023.9.0 # via -r requirements/ingest-delta-table.in @@ -12,5 +12,5 @@ numpy==1.24.4 # via # -c requirements/constraints.in # pyarrow -pyarrow==13.0.0 +pyarrow==12.0.0 # via deltalake diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index f258108fa5..6bd792a4ac 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -46,14 +46,14 @@ google-api-core==2.11.1 # via # google-cloud-core # google-cloud-storage -google-auth==2.22.0 +google-auth==2.23.0 # via # gcsfs # google-api-core # google-auth-oauthlib # google-cloud-core # google-cloud-storage -google-auth-oauthlib==1.0.0 +google-auth-oauthlib==1.1.0 # via gcsfs google-cloud-core==2.3.3 # via google-cloud-storage @@ -98,8 +98,6 @@ requests-oauthlib==1.3.1 # via google-auth-oauthlib rsa==4.9 # via google-auth -six==1.16.0 - # via google-auth soupsieve==2.5 # via # -c requirements/base.txt diff --git a/requirements/ingest-google-drive.txt b/requirements/ingest-google-drive.txt index 0e39470be5..4c81f7d698 100644 --- a/requirements/ingest-google-drive.txt +++ b/requirements/ingest-google-drive.txt @@ -19,12 +19,12 @@ google-api-core==2.11.1 # via google-api-python-client google-api-python-client==2.98.0 # via -r requirements/ingest-google-drive.in 
-google-auth==2.22.0 +google-auth==2.23.0 # via # google-api-core # google-api-python-client # google-auth-httplib2 -google-auth-httplib2==0.1.0 +google-auth-httplib2==0.1.1 # via google-api-python-client googleapis-common-protos==1.60.0 # via google-api-core @@ -57,10 +57,6 @@ requests==2.31.0 # google-api-core rsa==4.9 # via google-auth -six==1.16.0 - # via - # google-auth - # google-auth-httplib2 uritemplate==4.1.1 # via google-api-python-client urllib3==1.26.16 diff --git a/requirements/ingest-notion.txt b/requirements/ingest-notion.txt index a5cb1c36a7..850b1d0256 100644 --- a/requirements/ingest-notion.txt +++ b/requirements/ingest-notion.txt @@ -18,9 +18,9 @@ h11==0.14.0 # via httpcore htmlbuilder==1.0.0 # via -r requirements/ingest-notion.in -httpcore==0.17.3 +httpcore==0.18.0 # via httpx -httpx==0.24.1 +httpx==0.25.0 # via notion-client idna==3.4 # via diff --git a/requirements/ingest-reddit.txt b/requirements/ingest-reddit.txt index bc42ef8ece..8c0b53b240 100644 --- a/requirements/ingest-reddit.txt +++ b/requirements/ingest-reddit.txt @@ -33,5 +33,5 @@ urllib3==1.26.16 # -c requirements/base.txt # -c requirements/constraints.in # requests -websocket-client==1.6.2 +websocket-client==1.6.3 # via praw diff --git a/requirements/test.txt b/requirements/test.txt index 6cec9620e8..1ad6968860 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,7 +6,7 @@ # appdirs==1.4.4 # via label-studio-tools -black==23.7.0 +black==23.9.1 # via -r requirements/test.in certifi==2023.7.22 # via @@ -97,7 +97,7 @@ requests==2.31.0 # via # -c requirements/base.txt # label-studio-sdk -ruff==0.0.287 +ruff==0.0.288 # via -r requirements/test.in six==1.16.0 # via python-dateutil From a9cdfe0dfc0696159ef1da7a2e49fa541aa9688b Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Mon, 11 Sep 2023 15:27:40 -0700 Subject: [PATCH 10/16] Upgrade opencv-python version due to conflict --- requirements/extra-paddleocr.in | 2 +- requirements/extra-paddleocr.txt | 6 +++--- 2 files 
changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/extra-paddleocr.in b/requirements/extra-paddleocr.in index b80062babf..a42c551ead 100644 --- a/requirements/extra-paddleocr.in +++ b/requirements/extra-paddleocr.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt -unstructured.paddleocr==2.6.1.2 +unstructured.paddleocr==2.6.1.3 diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index c9a2ae0a2b..24e5357a7a 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -109,9 +109,9 @@ numpy==1.24.4 # tifffile # unstructured-paddleocr # visualdl -opencv-contrib-python==4.6.0.66 +opencv-contrib-python==4.8.0.76 # via unstructured-paddleocr -opencv-python==4.6.0.66 +opencv-python==4.8.0.76 # via # imgaug # unstructured-paddleocr @@ -201,7 +201,7 @@ tqdm==4.66.1 # unstructured-paddleocr tzdata==2023.3 # via pandas -unstructured-paddleocr==2.6.1.2 +unstructured-paddleocr==2.6.1.3 # via -r requirements/extra-paddleocr.in urllib3==1.26.16 # via From 0ca4da53964d712f994a217968a2424a894f7b2b Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Tue, 12 Sep 2023 11:06:35 -0700 Subject: [PATCH 11/16] bump version --- unstructured/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 2d66edd34f..93db63269a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.15-dev2" # pragma: no cover +__version__ = "0.10.15-dev3" # pragma: no cover From 954e1ae1bf8e1c6bc9e1e4c966d1c24b0f90133d Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Wed, 13 Sep 2023 16:24:16 -0700 Subject: [PATCH 12/16] linting fix --- unstructured/__version__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 455548e86f..fb932abdfd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1,2 +1 @@ __version__ = 
"0.10.15-dev9" # pragma: no cover - From 308ffcecf84f75a5295cbcb3fedc6835d6a2512b Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Fri, 15 Sep 2023 16:23:26 -0400 Subject: [PATCH 13/16] pin matplotlib==3.7.2 for paddle install --- requirements/constraints.in | 2 ++ requirements/dev.txt | 2 +- requirements/extra-paddleocr.txt | 5 +++-- requirements/extra-pdf-image.txt | 10 ++++++---- requirements/extra-pptx.txt | 2 +- requirements/huggingface.txt | 2 +- requirements/ingest-azure.txt | 2 +- requirements/ingest-box.txt | 2 +- requirements/ingest-delta-table.txt | 2 +- requirements/ingest-dropbox.txt | 2 +- 10 files changed, 18 insertions(+), 13 deletions(-) diff --git a/requirements/constraints.in b/requirements/constraints.in index 2d6fabb38d..27ea8baccb 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -32,3 +32,5 @@ safetensors<=0.3.2 # use the known compatible version of weaviate and unstructured.pytesseract unstructured.pytesseract>=0.3.12 weaviate-client==3.23.2 +# Note(yuming) - pining to avoid conflict with paddle install +matplotlib==3.7.2 diff --git a/requirements/dev.txt b/requirements/dev.txt index 6502a22ecd..691bb15686 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -176,7 +176,7 @@ jupyter-server==2.7.3 # notebook-shim jupyter-server-terminals==0.4.4 # via jupyter-server -jupyterlab==4.0.5 +jupyterlab==4.0.6 # via notebook jupyterlab-pygments==0.2.2 # via nbconvert diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 24e5357a7a..afd4927a47 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -88,6 +88,7 @@ markupsafe==2.1.3 # werkzeug matplotlib==3.7.2 # via + # -c requirements/constraints.in # imgaug # visualdl networkx==3.1 @@ -127,7 +128,7 @@ pandas==2.0.3 # via visualdl pdf2image==1.16.3 # via unstructured-paddleocr -pillow==10.0.0 +pillow==10.0.1 # via # imageio # imgaug @@ -164,7 +165,7 @@ pytz==2023.3.post1 # pandas 
pywavelets==1.4.1 # via scikit-image -rapidfuzz==3.2.0 +rapidfuzz==3.3.0 # via unstructured-paddleocr rarfile==4.0 # via visualdl diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 52ef843511..06ae89df15 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -37,7 +37,7 @@ flatbuffers==23.5.26 # via onnxruntime fonttools==4.42.1 # via matplotlib -fsspec==2023.9.0 +fsspec==2023.9.1 # via huggingface-hub huggingface-hub==0.17.1 # via @@ -62,8 +62,10 @@ layoutparser[layoutmodels,tesseract]==0.3.4 # via unstructured-inference markupsafe==2.1.3 # via jinja2 -matplotlib==3.7.3 - # via pycocotools +matplotlib==3.7.2 + # via + # -c requirements/constraints.in + # pycocotools mpmath==1.3.0 # via sympy networkx==3.1 @@ -113,7 +115,7 @@ pdfminer-six==20221105 # pdfplumber pdfplumber==0.10.2 # via layoutparser -pillow==10.0.0 +pillow==10.0.1 # via # layoutparser # matplotlib diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 06450e9a16..8a529bc940 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -6,7 +6,7 @@ # lxml==4.9.3 # via python-pptx -pillow==10.0.0 +pillow==10.0.1 # via python-pptx python-pptx==0.6.21 # via -r requirements/extra-pptx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 2f755d84b3..f3878298d2 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -22,7 +22,7 @@ filelock==3.12.4 # huggingface-hub # torch # transformers -fsspec==2023.9.0 +fsspec==2023.9.1 # via huggingface-hub huggingface-hub==0.17.1 # via transformers diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 4f423e965d..9fc26fff17 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -49,7 +49,7 @@ frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-azure.in # adlfs diff --git 
a/requirements/ingest-box.txt b/requirements/ingest-box.txt index bae5a522ab..f93c5a97cb 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -23,7 +23,7 @@ charset-normalizer==3.2.0 # requests cryptography==41.0.3 # via boxsdk -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-box.in # boxfs diff --git a/requirements/ingest-delta-table.txt b/requirements/ingest-delta-table.txt index b9eb20d1af..0da49842e5 100644 --- a/requirements/ingest-delta-table.txt +++ b/requirements/ingest-delta-table.txt @@ -6,7 +6,7 @@ # deltalake==0.10.2 # via -r requirements/ingest-delta-table.in -fsspec==2023.9.0 +fsspec==2023.9.1 # via -r requirements/ingest-delta-table.in numpy==1.24.4 # via diff --git a/requirements/ingest-dropbox.txt b/requirements/ingest-dropbox.txt index d76cd2f172..b565096026 100644 --- a/requirements/ingest-dropbox.txt +++ b/requirements/ingest-dropbox.txt @@ -17,7 +17,7 @@ dropbox==11.36.2 # via dropboxdrivefs dropboxdrivefs==1.3.1 # via -r requirements/ingest-dropbox.in -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-dropbox.in # dropboxdrivefs From 779d5d09d23fe6ee133c103453918d2a3e2b0878 Mon Sep 17 00:00:00 2001 From: Trevor Bossert Date: Fri, 15 Sep 2023 15:43:33 -0700 Subject: [PATCH 14/16] compile with pinned fsspec version --- requirements/dev.txt | 2 +- requirements/extra-pdf-image.txt | 2 +- requirements/huggingface.txt | 2 +- requirements/ingest-azure.in | 2 +- requirements/ingest-box.in | 2 +- requirements/ingest-delta-table.in | 2 +- requirements/ingest-dropbox.in | 2 +- requirements/ingest-gcs.in | 2 +- requirements/ingest-gcs.txt | 4 ++-- requirements/ingest-s3.in | 2 +- requirements/ingest-s3.txt | 4 ++-- requirements/test.txt | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index 691bb15686..0845c4db04 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -80,7 +80,7 @@ filelock==3.12.4 # via virtualenv 
fqdn==1.5.1 # via jsonschema -identify==2.5.28 +identify==2.5.29 # via pre-commit idna==3.4 # via diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 06ae89df15..72784d9f6a 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -204,7 +204,7 @@ tqdm==4.66.1 # huggingface-hub # iopath # transformers -transformers==4.33.1 +transformers==4.33.2 # via unstructured-inference typing-extensions==4.7.1 # via diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index f3878298d2..d4b93ce6f8 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -91,7 +91,7 @@ tqdm==4.66.1 # huggingface-hub # sacremoses # transformers -transformers==4.33.1 +transformers==4.33.2 # via -r requirements/huggingface.in typing-extensions==4.7.1 # via diff --git a/requirements/ingest-azure.in b/requirements/ingest-azure.in index d42acf96a4..ae60ef8cd4 100644 --- a/requirements/ingest-azure.in +++ b/requirements/ingest-azure.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt adlfs -fsspec +fsspec==2023.9.1 diff --git a/requirements/ingest-box.in b/requirements/ingest-box.in index 58bbb4a5c1..d180b8d259 100644 --- a/requirements/ingest-box.in +++ b/requirements/ingest-box.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt boxfs -fsspec +fsspec==2023.9.1 diff --git a/requirements/ingest-delta-table.in b/requirements/ingest-delta-table.in index 09703a9372..a60c0d52ef 100644 --- a/requirements/ingest-delta-table.in +++ b/requirements/ingest-delta-table.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt deltalake -fsspec +fsspec==2023.9.1 diff --git a/requirements/ingest-dropbox.in b/requirements/ingest-dropbox.in index 365f182534..b6befc9bd6 100644 --- a/requirements/ingest-dropbox.in +++ b/requirements/ingest-dropbox.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt dropboxdrivefs -fsspec \ No newline at end of file +fsspec==2023.9.1 \ No newline at end of file diff --git 
a/requirements/ingest-gcs.in b/requirements/ingest-gcs.in index de522c3f43..8f63397360 100644 --- a/requirements/ingest-gcs.in +++ b/requirements/ingest-gcs.in @@ -1,5 +1,5 @@ -c constraints.in -c base.txt gcsfs -fsspec +fsspec==2023.9.1 bs4 diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 6bd792a4ac..44c8e081b0 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -36,11 +36,11 @@ frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-gcs.in # gcsfs -gcsfs==2023.9.0 +gcsfs==2023.9.1 # via -r requirements/ingest-gcs.in google-api-core==2.11.1 # via diff --git a/requirements/ingest-s3.in b/requirements/ingest-s3.in index c848714f96..f654805ee4 100644 --- a/requirements/ingest-s3.in +++ b/requirements/ingest-s3.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt s3fs -fsspec +fsspec==2023.9.1 diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index 41b66ac491..2f079fa29b 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -28,7 +28,7 @@ frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-s3.in # s3fs @@ -44,7 +44,7 @@ multidict==6.0.4 # yarl python-dateutil==2.8.2 # via botocore -s3fs==2023.9.0 +s3fs==2023.9.1 # via -r requirements/ingest-s3.in six==1.16.0 # via python-dateutil diff --git a/requirements/test.txt b/requirements/test.txt index 91d93117b5..4f5554e811 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -97,7 +97,7 @@ requests==2.31.0 # via # -c requirements/base.txt # label-studio-sdk -ruff==0.0.289 +ruff==0.0.290 # via -r requirements/test.in six==1.16.0 # via python-dateutil From ead3748fae583ad7e1ac2b1365323dee22dd6d71 Mon Sep 17 00:00:00 2001 From: yuming <305248291@qq.com> Date: Fri, 15 Sep 2023 19:57:47 -0400 Subject: [PATCH 15/16] changelog nit --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index 9e0c97e2c9..ccf6d25db1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,7 @@ * **Add delta table destination connector** New delta table destination connector added to ingest CLI. * **Rename to Source and Destination Connectors in the Documentation.** Maintain naming consistency between Connectors codebase and documentation with the first addition to a destination connector. * **Non-HTML text files now return unstructured-elements as opposed to HTML-elements.** Previously the text based files that went through `partition_html` would return HTML-elements but now we preserve the format from the input using `source_format` argument in the partition call. -* **Adds `PaddleOCR` as an optional alternative to `Tesseract` for processing PDF's, it is installable via `makefile` command `install-paddleocr` +* **Adds `PaddleOCR` as an optional alternative to `Tesseract`** for OCR in processing PDF or Image files, it is installable via `makefile` command `install-paddleocr` ### Features @@ -49,7 +49,7 @@ * Update all connectors to use new downstream architecture * New click type added to parse comma-delimited string inputs * Some CLI options renamed - + ### Features ### Fixes From c3e9222658b1656c872f58ff4cd920e44f52ea99 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Fri, 15 Sep 2023 17:04:32 -0700 Subject: [PATCH 16/16] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 591b0ac105..723e92b777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,7 @@ * **Add delta table destination connector** New delta table destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to a Delta Table. 
* **Rename to Source and Destination Connectors in the Documentation.** Maintain naming consistency between Connectors codebase and documentation with the first addition to a destination connector. * **Non-HTML text files now return unstructured-elements as opposed to HTML-elements.** Previously the text based files that went through `partition_html` would return HTML-elements but now we preserve the format from the input using `source_format` argument in the partition call. -* **Adds `PaddleOCR` as an optional alternative to `Tesseract`** for OCR in processing PDF or Image files, it is installable via `makefile` command `install-paddleocr` +* **Adds `PaddleOCR` as an optional alternative to `Tesseract`** for OCR in processing of PDF or Image files. It is installable via the `makefile` command `install-paddleocr`. For experimental purposes only. ### Features