diff --git a/CHANGELOG.md b/CHANGELOG.md index 26b9ad6eb7..723e92b777 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.10.15-dev15 + ### Enhancements * **Suport for better element categories from the next-generation image-to-text model ("chipper").**. Previously, not all of the classifications from Chipper were being mapped to proper `unstructured` element categories so the consumer of the library would see many `UncategorizedText` elements. This fixes the issue, improving the granularity of the element categories outputs for better downstream processing and chunking. The mapping update is: @@ -24,6 +25,7 @@ * **Add delta table destination connector** New delta table destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to a Delta Table. * **Rename to Source and Destination Connectors in the Documentation.** Maintain naming consistency between Connectors codebase and documentation with the first addition to a destination connector. * **Non-HTML text files now return unstructured-elements as opposed to HTML-elements.** Previously the text based files that went through `partition_html` would return HTML-elements but now we preserve the format from the input using `source_format` argument in the partition call. +* **Adds `PaddleOCR` as an optional alternative to `Tesseract`** for OCR when processing PDF or image files. It is installable via the `Makefile` target `install-paddleocr`. For experimental purposes only. ### Features @@ -47,7 +49,7 @@ * Update all connectors to use new downstream architecture * New click type added to parse comma-delimited string inputs * Some CLI options renamed - + ### Features ### Fixes @@ -66,6 +68,7 @@ * Add Jira Connector to be able to pull issues from a Jira organization * Add `clean_ligatures` function to expand ligatures in text + ### Fixes * `partition_html` breaks on `
` elements. diff --git a/Makefile b/Makefile index d67d6ea4d0..19051c435b 100644 --- a/Makefile +++ b/Makefile @@ -214,6 +214,9 @@ install-local-inference: install install-all-docs install-pandoc: ARCH=${ARCH} ./scripts/install-pandoc.sh +.PHONY: install-paddleocr +install-paddleocr: + ARCH=${ARCH} ./scripts/install-paddleocr.sh ## pip-compile: compiles all base/dev/test requirements .PHONY: pip-compile diff --git a/requirements/constraints.in b/requirements/constraints.in index 2d6fabb38d..27ea8baccb 100644 --- a/requirements/constraints.in +++ b/requirements/constraints.in @@ -32,3 +32,5 @@ safetensors<=0.3.2 # use the known compatible version of weaviate and unstructured.pytesseract unstructured.pytesseract>=0.3.12 weaviate-client==3.23.2 +# Note(yuming) - pinning to avoid conflict with paddle install +matplotlib==3.7.2 diff --git a/requirements/dev.txt b/requirements/dev.txt index 6502a22ecd..0845c4db04 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -80,7 +80,7 @@ filelock==3.12.4 # via virtualenv fqdn==1.5.1 # via jsonschema -identify==2.5.28 +identify==2.5.29 # via pre-commit idna==3.4 # via @@ -176,7 +176,7 @@ jupyter-server==2.7.3 # notebook-shim jupyter-server-terminals==0.4.4 # via jupyter-server -jupyterlab==4.0.5 +jupyterlab==4.0.6 # via notebook jupyterlab-pygments==0.2.2 # via nbconvert diff --git a/requirements/extra-paddleocr.in b/requirements/extra-paddleocr.in new file mode 100644 index 0000000000..a42c551ead --- /dev/null +++ b/requirements/extra-paddleocr.in @@ -0,0 +1,4 @@ +-c constraints.in +-c base.txt + +unstructured.paddleocr==2.6.1.3 diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt new file mode 100644 index 0000000000..afd4927a47 --- /dev/null +++ b/requirements/extra-paddleocr.txt @@ -0,0 +1,219 @@ +# +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: +# +# pip-compile requirements/extra-paddleocr.in +# +attrdict==2.0.1 + # via unstructured-paddleocr
+babel==2.12.1 + # via flask-babel +bce-python-sdk==0.8.90 + # via visualdl +blinker==1.6.2 + # via flask +cachetools==5.3.1 + # via premailer +certifi==2023.7.22 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests +charset-normalizer==3.2.0 + # via + # -c requirements/base.txt + # requests +click==8.1.7 + # via + # -c requirements/base.txt + # flask +contourpy==1.1.0 + # via matplotlib +cssselect==1.2.0 + # via premailer +cssutils==2.7.1 + # via premailer +cycler==0.11.0 + # via matplotlib +cython==3.0.2 + # via unstructured-paddleocr +et-xmlfile==1.1.0 + # via openpyxl +flask==2.3.3 + # via + # flask-babel + # visualdl +flask-babel==3.1.0 + # via visualdl +fonttools==4.42.1 + # via matplotlib +future==0.18.3 + # via bce-python-sdk +idna==3.4 + # via + # -c requirements/base.txt + # requests +imageio==2.31.3 + # via + # imgaug + # scikit-image +imgaug==0.4.0 + # via unstructured-paddleocr +importlib-metadata==6.8.0 + # via flask +importlib-resources==6.0.1 + # via matplotlib +itsdangerous==2.1.2 + # via flask +jinja2==3.1.2 + # via + # flask + # flask-babel +kiwisolver==1.4.5 + # via matplotlib +lanms-neo==1.0.2 + # via unstructured-paddleocr +lazy-loader==0.3 + # via scikit-image +lmdb==1.4.1 + # via unstructured-paddleocr +lxml==4.9.3 + # via + # -c requirements/base.txt + # premailer + # unstructured-paddleocr +markupsafe==2.1.3 + # via + # jinja2 + # werkzeug +matplotlib==3.7.2 + # via + # -c requirements/constraints.in + # imgaug + # visualdl +networkx==3.1 + # via scikit-image +numpy==1.24.4 + # via + # -c requirements/constraints.in + # contourpy + # imageio + # imgaug + # matplotlib + # opencv-contrib-python + # opencv-python + # pandas + # pywavelets + # scikit-image + # scipy + # shapely + # tifffile + # unstructured-paddleocr + # visualdl +opencv-contrib-python==4.8.0.76 + # via unstructured-paddleocr +opencv-python==4.8.0.76 + # via + # imgaug + # unstructured-paddleocr +openpyxl==3.1.2 + # via unstructured-paddleocr 
+packaging==23.1 + # via + # -c requirements/base.txt + # matplotlib + # scikit-image + # visualdl +pandas==2.0.3 + # via visualdl +pdf2image==1.16.3 + # via unstructured-paddleocr +pillow==10.0.1 + # via + # imageio + # imgaug + # matplotlib + # pdf2image + # scikit-image + # visualdl +polygon3==3.0.9.1 + # via unstructured-paddleocr +premailer==3.10.0 + # via unstructured-paddleocr +protobuf==4.23.4 + # via + # -c requirements/constraints.in + # visualdl +psutil==5.9.5 + # via visualdl +pyclipper==1.3.0.post5 + # via unstructured-paddleocr +pycryptodome==3.18.0 + # via bce-python-sdk +pyparsing==3.0.9 + # via + # -c requirements/constraints.in + # matplotlib +python-dateutil==2.8.2 + # via + # matplotlib + # pandas +pytz==2023.3.post1 + # via + # babel + # flask-babel + # pandas +pywavelets==1.4.1 + # via scikit-image +rapidfuzz==3.3.0 + # via unstructured-paddleocr +rarfile==4.0 + # via visualdl +requests==2.31.0 + # via + # -c requirements/base.txt + # premailer + # visualdl +scikit-image==0.21.0 + # via + # imgaug + # unstructured-paddleocr +scipy==1.10.1 + # via + # -c requirements/constraints.in + # imgaug + # scikit-image +shapely==2.0.1 + # via + # imgaug + # unstructured-paddleocr +six==1.16.0 + # via + # attrdict + # bce-python-sdk + # imgaug + # python-dateutil + # visualdl +tifffile==2023.7.10 + # via scikit-image +tqdm==4.66.1 + # via + # -c requirements/base.txt + # unstructured-paddleocr +tzdata==2023.3 + # via pandas +unstructured-paddleocr==2.6.1.3 + # via -r requirements/extra-paddleocr.in +urllib3==1.26.16 + # via + # -c requirements/base.txt + # -c requirements/constraints.in + # requests +visualdl==2.5.3 + # via unstructured-paddleocr +werkzeug==2.3.7 + # via flask +zipp==3.16.2 + # via + # importlib-metadata + # importlib-resources diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 52ef843511..72784d9f6a 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -37,7 +37,7 @@ 
flatbuffers==23.5.26 # via onnxruntime fonttools==4.42.1 # via matplotlib -fsspec==2023.9.0 +fsspec==2023.9.1 # via huggingface-hub huggingface-hub==0.17.1 # via @@ -62,8 +62,10 @@ layoutparser[layoutmodels,tesseract]==0.3.4 # via unstructured-inference markupsafe==2.1.3 # via jinja2 -matplotlib==3.7.3 - # via pycocotools +matplotlib==3.7.2 + # via + # -c requirements/constraints.in + # pycocotools mpmath==1.3.0 # via sympy networkx==3.1 @@ -113,7 +115,7 @@ pdfminer-six==20221105 # pdfplumber pdfplumber==0.10.2 # via layoutparser -pillow==10.0.0 +pillow==10.0.1 # via # layoutparser # matplotlib @@ -202,7 +204,7 @@ tqdm==4.66.1 # huggingface-hub # iopath # transformers -transformers==4.33.1 +transformers==4.33.2 # via unstructured-inference typing-extensions==4.7.1 # via diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 06450e9a16..8a529bc940 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -6,7 +6,7 @@ # lxml==4.9.3 # via python-pptx -pillow==10.0.0 +pillow==10.0.1 # via python-pptx python-pptx==0.6.21 # via -r requirements/extra-pptx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 2f755d84b3..d4b93ce6f8 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -22,7 +22,7 @@ filelock==3.12.4 # huggingface-hub # torch # transformers -fsspec==2023.9.0 +fsspec==2023.9.1 # via huggingface-hub huggingface-hub==0.17.1 # via transformers @@ -91,7 +91,7 @@ tqdm==4.66.1 # huggingface-hub # sacremoses # transformers -transformers==4.33.1 +transformers==4.33.2 # via -r requirements/huggingface.in typing-extensions==4.7.1 # via diff --git a/requirements/ingest-azure.in b/requirements/ingest-azure.in index d42acf96a4..ae60ef8cd4 100644 --- a/requirements/ingest-azure.in +++ b/requirements/ingest-azure.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt adlfs -fsspec +fsspec==2023.9.1 diff --git a/requirements/ingest-azure.txt b/requirements/ingest-azure.txt index 
4f423e965d..9fc26fff17 100644 --- a/requirements/ingest-azure.txt +++ b/requirements/ingest-azure.txt @@ -49,7 +49,7 @@ frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-azure.in # adlfs diff --git a/requirements/ingest-box.in b/requirements/ingest-box.in index 58bbb4a5c1..d180b8d259 100644 --- a/requirements/ingest-box.in +++ b/requirements/ingest-box.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt boxfs -fsspec +fsspec==2023.9.1 diff --git a/requirements/ingest-box.txt b/requirements/ingest-box.txt index bae5a522ab..f93c5a97cb 100644 --- a/requirements/ingest-box.txt +++ b/requirements/ingest-box.txt @@ -23,7 +23,7 @@ charset-normalizer==3.2.0 # requests cryptography==41.0.3 # via boxsdk -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-box.in # boxfs diff --git a/requirements/ingest-delta-table.in b/requirements/ingest-delta-table.in index 09703a9372..a60c0d52ef 100644 --- a/requirements/ingest-delta-table.in +++ b/requirements/ingest-delta-table.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt deltalake -fsspec +fsspec==2023.9.1 diff --git a/requirements/ingest-delta-table.txt b/requirements/ingest-delta-table.txt index b9eb20d1af..0da49842e5 100644 --- a/requirements/ingest-delta-table.txt +++ b/requirements/ingest-delta-table.txt @@ -6,7 +6,7 @@ # deltalake==0.10.2 # via -r requirements/ingest-delta-table.in -fsspec==2023.9.0 +fsspec==2023.9.1 # via -r requirements/ingest-delta-table.in numpy==1.24.4 # via diff --git a/requirements/ingest-dropbox.in b/requirements/ingest-dropbox.in index 365f182534..b6befc9bd6 100644 --- a/requirements/ingest-dropbox.in +++ b/requirements/ingest-dropbox.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt dropboxdrivefs -fsspec \ No newline at end of file +fsspec==2023.9.1 \ No newline at end of file diff --git a/requirements/ingest-dropbox.txt b/requirements/ingest-dropbox.txt index d76cd2f172..b565096026 100644 --- a/requirements/ingest-dropbox.txt 
+++ b/requirements/ingest-dropbox.txt @@ -17,7 +17,7 @@ dropbox==11.36.2 # via dropboxdrivefs dropboxdrivefs==1.3.1 # via -r requirements/ingest-dropbox.in -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-dropbox.in # dropboxdrivefs diff --git a/requirements/ingest-gcs.in b/requirements/ingest-gcs.in index de522c3f43..8f63397360 100644 --- a/requirements/ingest-gcs.in +++ b/requirements/ingest-gcs.in @@ -1,5 +1,5 @@ -c constraints.in -c base.txt gcsfs -fsspec +fsspec==2023.9.1 bs4 diff --git a/requirements/ingest-gcs.txt b/requirements/ingest-gcs.txt index 6bd792a4ac..44c8e081b0 100644 --- a/requirements/ingest-gcs.txt +++ b/requirements/ingest-gcs.txt @@ -36,11 +36,11 @@ frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-gcs.in # gcsfs -gcsfs==2023.9.0 +gcsfs==2023.9.1 # via -r requirements/ingest-gcs.in google-api-core==2.11.1 # via diff --git a/requirements/ingest-s3.in b/requirements/ingest-s3.in index c848714f96..f654805ee4 100644 --- a/requirements/ingest-s3.in +++ b/requirements/ingest-s3.in @@ -1,4 +1,4 @@ -c constraints.in -c base.txt s3fs -fsspec +fsspec==2023.9.1 diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index 41b66ac491..2f079fa29b 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -28,7 +28,7 @@ frozenlist==1.4.0 # via # aiohttp # aiosignal -fsspec==2023.9.0 +fsspec==2023.9.1 # via # -r requirements/ingest-s3.in # s3fs @@ -44,7 +44,7 @@ multidict==6.0.4 # yarl python-dateutil==2.8.2 # via botocore -s3fs==2023.9.0 +s3fs==2023.9.1 # via -r requirements/ingest-s3.in six==1.16.0 # via python-dateutil diff --git a/requirements/test.txt b/requirements/test.txt index 91d93117b5..4f5554e811 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -97,7 +97,7 @@ requests==2.31.0 # via # -c requirements/base.txt # label-studio-sdk -ruff==0.0.289 +ruff==0.0.290 # via -r requirements/test.in six==1.16.0 # via python-dateutil 
diff --git a/scripts/install-paddleocr.sh b/scripts/install-paddleocr.sh new file mode 100755 index 0000000000..8db79acfb1 --- /dev/null +++ b/scripts/install-paddleocr.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +# aarch64 requires a custom build of paddlepaddle +if [ "${ARCH}" = "aarch64" ]; then + python3 -m pip install unstructured.paddlepaddle; +else + python3 -m pip install paddlepaddle; +fi +python3 -m pip install unstructured.paddleocr diff --git a/setup.py b/setup.py index 597d6f84b3..407d94fdc9 100644 --- a/setup.py +++ b/setup.py @@ -153,6 +153,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List # Legacy extra requirements "huggingface": load_requirements("requirements/huggingface.in"), "local-inference": all_doc_reqs, + "paddleocr": load_requirements("requirements/extra-paddleocr.in"), }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt"]},