diff --git a/CHANGELOG.md b/CHANGELOG.md index 63e90a04..aa229625 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.3.1 + +* Pin protobuf version to avoid errors +* Make paddleocr an extra again + ## 0.3.0 * Fix for text block detection diff --git a/requirements/dev.txt b/requirements/dev.txt index 4f083cfd..bf3b0f94 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -67,7 +67,7 @@ ipykernel==6.22.0 # nbclassic # notebook # qtconsole -ipython==8.11.0 +ipython==8.12.0 # via # -r requirements/dev.in # ipykernel @@ -143,7 +143,7 @@ matplotlib-inline==0.1.6 # ipython mistune==2.0.5 # via nbconvert -nbclassic==0.5.3 +nbclassic==0.5.4 # via notebook nbclient==0.7.2 # via nbconvert @@ -304,6 +304,8 @@ traitlets==5.9.0 # nbformat # notebook # qtconsole +typing-extensions==4.5.0 + # via ipython uri-template==1.2.0 # via jsonschema wcwidth==0.2.6 diff --git a/setup.py b/setup.py index 7de0e198..2735e7a8 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,22 @@ "opencv-python==4.6.0.66", "onnxruntime", "transformers", - 'unstructured.PaddleOCR ; platform_machine=="x86_64"', ], + extras_require={ + "tables": [ + 'unstructured.PaddleOCR ; platform_machine=="x86_64"', + # NOTE(crag): workaround issue for error output below + # ERROR test_unstructured/partition/test_common.py - TypeError: Descriptors cannot not + # be created directly. + # If this call came from a _pb2.py file, your generated code is out of date and must be + # regenerated with protoc >= 3.19.0. + # If you cannot immediately regenerate your protos, some other possible workarounds are: + # 1. Downgrade the protobuf package to 3.20.x or lower. + # 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python + # parsing and will be much slower). + "protobuf<3.21", + # NOTE(alan): Pin to get around error: undefined symbol: _dl_sym, version GLIBC_PRIVATE + "paddlepaddle>=2.4", + ] + }, ) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index fb9998eb..48f4028d 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.3.0" # pragma: no cover +__version__ = "0.3.1" # pragma: no cover diff --git a/unstructured_inference/models/paddle_ocr.py b/unstructured_inference/models/paddle_ocr.py index 3fee51e0..66941e2e 100644 --- a/unstructured_inference/models/paddle_ocr.py +++ b/unstructured_inference/models/paddle_ocr.py @@ -1,11 +1,11 @@ -from unstructured_paddleocr import PaddleOCR - -paddle_ocr: PaddleOCR = None +paddle_ocr = None # type: ignore def load_agent(): """Loads the PaddleOCR agent as a global variable to ensure that we only load it once.""" + from unstructured_paddleocr import PaddleOCR + global paddle_ocr paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en", mkl_dnn=True, show_log=False)