From 94952e65f6358084631eccac9f9864ffcb66f2df Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 29 Mar 2023 20:04:30 -0500 Subject: [PATCH 1/6] Pin protobuf to avoid error --- requirements/base.txt | 6 ++++-- setup.py | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index d08b0a8d..f1cd5520 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -115,8 +115,10 @@ pillow==9.4.0 # torchvision portalocker==2.7.0 # via iopath -protobuf==4.22.1 - # via onnxruntime +protobuf==3.20.3 + # via + # onnxruntime + # unstructured-inference (setup.py) pycocotools==2.0.6 # via effdet pycparser==2.21 diff --git a/setup.py b/setup.py index 7de0e198..ade04dfc 100644 --- a/setup.py +++ b/setup.py @@ -60,5 +60,6 @@ "onnxruntime", "transformers", 'unstructured.PaddleOCR ; platform_machine=="x86_64"', + "protobuf==3.20.*", ], ) From b124e571a1a452d57721b1cd82a8e87a96bbaa4f Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 29 Mar 2023 20:09:40 -0500 Subject: [PATCH 2/6] Add pin note --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index ade04dfc..a2e1512e 100644 --- a/setup.py +++ b/setup.py @@ -60,6 +60,8 @@ "onnxruntime", "transformers", 'unstructured.PaddleOCR ; platform_machine=="x86_64"', + # NOTE(alan): protobuf is required by onnxruntime, and causes errors when the latest version + # is used in conjunction with certain other libraries (tensorboard/flow for example). "protobuf==3.20.*", ], ) From 36a7cb778c9452ee60baeb7257dfd511e247fb7d Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 29 Mar 2023 20:11:22 -0500 Subject: [PATCH 3/6] Bump version and release --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63e90a04..64d1f145 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.1 + +* Pin protobuf version to avoid errors + ## 0.3.0 * Fix for text block detection diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index fb9998eb..48f4028d 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.3.0" # pragma: no cover +__version__ = "0.3.1" # pragma: no cover From 6b2d6104a128136f8ea72769254acb5e2607cda9 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 30 Mar 2023 11:45:23 -0500 Subject: [PATCH 4/6] Stop importing paddleocr unless needed --- unstructured_inference/models/paddle_ocr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unstructured_inference/models/paddle_ocr.py b/unstructured_inference/models/paddle_ocr.py index 3fee51e0..66941e2e 100644 --- a/unstructured_inference/models/paddle_ocr.py +++ b/unstructured_inference/models/paddle_ocr.py @@ -1,11 +1,11 @@ -from unstructured_paddleocr import PaddleOCR - -paddle_ocr: PaddleOCR = None +paddle_ocr = None # type: ignore def load_agent(): """Loads the PaddleOCR agent as a global variable to ensure that we only load it once.""" + from unstructured_paddleocr import PaddleOCR + global paddle_ocr paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en", mkl_dnn=True, show_log=False) From 0d8723db583b8452091a7601bcb02dc2f7a53dbd Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 30 Mar 2023 12:32:16 -0500 Subject: [PATCH 5/6] make paddleocr optional --- CHANGELOG.md | 1 + setup.py | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 64d1f145..aa229625 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.3.1 * Pin protobuf version to avoid errors +* Make paddleocr an extra again ## 0.3.0 diff --git a/setup.py b/setup.py index a2e1512e..2735e7a8 100644 --- a/setup.py +++ b/setup.py @@ -59,9 +59,22 @@ "opencv-python==4.6.0.66", "onnxruntime", "transformers", - 'unstructured.PaddleOCR ; platform_machine=="x86_64"', - # NOTE(alan): protobuf is required by onnxruntime, and causes errors when the latest version - # is used in conjunction with certain other libraries (tensorboard/flow for example). - "protobuf==3.20.*", ], + extras_require={ + "tables": [ + 'unstructured.PaddleOCR ; platform_machine=="x86_64"', + # NOTE(crag): workaround issue for error output below + # ERROR test_unstructured/partition/test_common.py - TypeError: Descriptors cannot not + # be created directly. + # If this call came from a _pb2.py file, your generated code is out of date and must be + # regenerated with protoc >= 3.19.0. + # If you cannot immediately regenerate your protos, some other possible workarounds are: + # 1. Downgrade the protobuf package to 3.20.x or lower. + # 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python + # parsing and will be much slower). + "protobuf<3.21", + # NOTE(alan): Pin to get around error: undefined symbol: _dl_sym, version GLIBC_PRIVATE + "paddlepaddle>=2.4", + ] + }, ) From 080965699d5a086d92ac7e0c9b9e87f9bb0be2d3 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Thu, 30 Mar 2023 12:36:36 -0500 Subject: [PATCH 6/6] update reqs --- requirements/base.txt | 6 ++---- requirements/dev.txt | 6 ++++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index f1cd5520..d08b0a8d 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -115,10 +115,8 @@ pillow==9.4.0 # torchvision portalocker==2.7.0 # via iopath -protobuf==3.20.3 - # via - # onnxruntime - # unstructured-inference (setup.py) +protobuf==4.22.1 + # via onnxruntime pycocotools==2.0.6 # via effdet pycparser==2.21 diff --git a/requirements/dev.txt b/requirements/dev.txt index 4f083cfd..bf3b0f94 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -67,7 +67,7 @@ ipykernel==6.22.0 # nbclassic # notebook # qtconsole -ipython==8.11.0 +ipython==8.12.0 # via # -r requirements/dev.in # ipykernel @@ -143,7 +143,7 @@ matplotlib-inline==0.1.6 # ipython mistune==2.0.5 # via nbconvert -nbclassic==0.5.3 +nbclassic==0.5.4 # via notebook nbclient==0.7.2 # via nbconvert @@ -304,6 +304,8 @@ traitlets==5.9.0 # nbformat # notebook # qtconsole +typing-extensions==4.5.0 + # via ipython uri-template==1.2.0 # via jsonschema wcwidth==0.2.6