diff --git a/README.md b/README.md index 1703e97f4..8e721f12d 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that t You can specify the encoding to use to decode the text input. If no value is provided, utf-8 will be used. ``` -curl -X 'POST' +curl -X 'POST' \ 'https://api.unstructured.io/general/v0/general' \ -H 'accept: application/json' \ -H 'Content-Type: multipart/form-data' \ @@ -176,6 +176,23 @@ curl -X 'POST' | jq -C . | less -R ``` +#### Gzipped files + +You can send gzipped files and the API will un-gzip them. + +``` +curl -X 'POST' \ + 'https://api.unstructured.io/general/v0/general' \ + -H 'accept: application/json' \ + -H 'Content-Type: multipart/form-data' \ + -F 'gz_uncompressed_content_type=application/pdf' \ + -F 'files=@sample-docs/layout-parser-paper.pdf.gz' +``` + +If the field `gz_uncompressed_content_type` is set, the API will use its value as the content-type of all files +after uncompressing the .gz files that are sent in a single batch. If not set, the API will use +various heuristics to detect the filetypes after uncompressing from .gz. + #### XML Tags When processing XML documents, set the `xml_keep_tags` parameter to `true` to retain the XML tags in the output. If not specified, it will simply extract the text from within the tags. 
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 57deeaada..3e3393872 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -749,11 +749,11 @@ def general_partition( chunking_strategy = _validate_chunking_strategy(form_params.chunking_strategy) # -- unzip any uploaded files that need it -- - for file_index in range(len(files)): - if files[file_index].content_type == "application/gzip": - files[file_index] = ungz_file( - files[file_index], form_params.gz_uncompressed_content_type - ) + for idx, file in enumerate(files): + is_content_type_gz = file.content_type == "application/gzip" + is_extension_gz = file.filename and file.filename.endswith(".gz") + if is_content_type_gz or is_extension_gz: + files[idx] = ungz_file(file, form_params.gz_uncompressed_content_type) def response_generator(is_multipart: bool): for file in files: diff --git a/requirements/base.txt b/requirements/base.txt index 1e9d118f4..8407de450 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -19,6 +19,7 @@ beautifulsoup4==4.12.3 certifi==2024.2.2 # via # requests + # unstructured # unstructured-client cffi==1.16.0 # via cryptography @@ -28,6 +29,7 @@ charset-normalizer==3.3.2 # via # pdfminer-six # requests + # unstructured # unstructured-client click==8.1.3 # via @@ -38,13 +40,15 @@ coloredlogs==15.0.1 # via onnxruntime contourpy==1.2.0 # via matplotlib -cryptography==42.0.4 +cryptography==42.0.5 # via pdfminer-six cycler==0.12.1 # via matplotlib dataclasses-json==0.6.4 - # via unstructured -dataclasses-json-speakeasy==0.5.11 + # via + # unstructured + # unstructured-client +deepdiff==6.7.1 # via unstructured-client deprecated==1.2.14 # via pikepdf @@ -56,7 +60,7 @@ et-xmlfile==1.1.0 # via openpyxl exceptiongroup==1.2.0 # via anyio -fastapi==0.109.2 +fastapi==0.110.0 # via -r requirements/base.in filelock==3.13.1 # via @@ -65,7 +69,7 @@ filelock==3.13.1 # transformers filetype==1.2.0 # via unstructured 
-flatbuffers==23.5.26 +flatbuffers==24.3.7 # via onnxruntime fonttools==4.49.0 # via matplotlib @@ -75,7 +79,7 @@ fsspec==2024.2.0 # torch h11==0.14.0 # via uvicorn -huggingface-hub==0.20.3 +huggingface-hub==0.21.4 # via # timm # tokenizers @@ -112,10 +116,9 @@ markdown==3.5.2 # via unstructured markupsafe==2.1.5 # via jinja2 -marshmallow==3.20.2 +marshmallow==3.21.1 # via # dataclasses-json - # dataclasses-json-speakeasy # unstructured-client matplotlib==3.8.3 # via pycocotools @@ -163,7 +166,9 @@ opencv-python==4.9.0.80 # unstructured-inference openpyxl==3.1.2 # via unstructured -packaging==23.2 +ordered-set==4.1.0 + # via deepdiff +packaging==24.0 # via # huggingface-hub # marshmallow @@ -174,7 +179,7 @@ packaging==23.2 # transformers # unstructured-client # unstructured-pytesseract -pandas==2.2.0 +pandas==2.2.1 # via # layoutparser # unstructured @@ -182,11 +187,11 @@ pdf2image==1.17.0 # via # layoutparser # unstructured -pdfminer-six==20221105 +pdfminer-six==20231228 # via # pdfplumber # unstructured -pdfplumber==0.10.4 +pdfplumber==0.11.0 # via layoutparser pikepdf==8.13.0 # via unstructured @@ -218,23 +223,24 @@ pycparser==2.21 # via cffi pycryptodome==3.20.0 # via -r requirements/base.in -pydantic==2.6.1 +pydantic==2.6.4 # via fastapi -pydantic-core==2.16.2 +pydantic-core==2.16.3 # via pydantic pypandoc==1.13 # via unstructured -pyparsing==3.1.1 +pyparsing==3.1.2 # via matplotlib -pypdf==4.0.2 +pypdf==4.1.0 # via # -r requirements/base.in # unstructured -pypdfium2==4.27.0 + # unstructured-client +pypdfium2==4.28.0 # via pdfplumber pytesseract==0.3.10 # via layoutparser -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # matplotlib # pandas @@ -258,7 +264,7 @@ pyyaml==6.0.1 # omegaconf # timm # transformers -rapidfuzz==3.6.1 +rapidfuzz==3.6.2 # via # unstructured # unstructured-inference @@ -287,7 +293,7 @@ six==1.16.0 # langdetect # python-dateutil # unstructured-client -sniffio==1.3.0 +sniffio==1.3.1 # via anyio soupsieve==2.5 # via 
beautifulsoup4 @@ -322,7 +328,7 @@ tqdm==4.66.2 # transformers transformers==4.37.1 # via unstructured-inference -typing-extensions==4.9.0 +typing-extensions==4.10.0 # via # anyio # fastapi @@ -339,13 +345,14 @@ typing-extensions==4.9.0 typing-inspect==0.9.0 # via # dataclasses-json - # dataclasses-json-speakeasy # unstructured-client tzdata==2024.1 # via pandas -unstructured[local-inference]==0.12.4 - # via -r requirements/base.in -unstructured-client==0.18.0 +unstructured[local-inference]==0.12.5 + # via + # -r requirements/base.in + # unstructured +unstructured-client==0.21.1 # via unstructured unstructured-inference==0.7.23 # via unstructured @@ -355,7 +362,7 @@ urllib3==2.2.1 # via # requests # unstructured-client -uvicorn==0.27.1 +uvicorn==0.28.0 # via -r requirements/base.in wrapt==1.16.0 # via diff --git a/requirements/test.in b/requirements/test.in index 6ad55c2d9..c507ed49d 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -11,3 +11,4 @@ pytest-mock nbdev jupyter httpx +deepdiff diff --git a/requirements/test.txt b/requirements/test.txt index 2d9b79e89..b94dd2b69 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -92,9 +92,11 @@ contourpy==1.2.0 # via # -r requirements/base.txt # matplotlib -coverage[toml]==7.4.2 - # via pytest-cov -cryptography==42.0.4 +coverage[toml]==7.4.3 + # via + # coverage + # pytest-cov +cryptography==42.0.5 # via # -r requirements/base.txt # pdfminer-six @@ -106,14 +108,16 @@ dataclasses-json==0.6.4 # via # -r requirements/base.txt # unstructured -dataclasses-json-speakeasy==0.5.11 - # via - # -r requirements/base.txt # unstructured-client debugpy==1.8.1 # via ipykernel decorator==5.1.1 # via ipython +deepdiff==6.7.1 + # via + # -r requirements/base.txt + # -r requirements/test.in + # unstructured-client defusedxml==0.7.1 # via nbconvert deprecated==1.2.14 @@ -142,7 +146,7 @@ execnb==0.1.5 # via nbdev executing==2.0.1 # via stack-data -fastapi==0.109.2 +fastapi==0.110.0 # via -r requirements/base.txt 
fastcore==1.5.29 # via @@ -163,7 +167,7 @@ filetype==1.2.0 # unstructured flake8==7.0.0 # via -r requirements/test.in -flatbuffers==23.5.26 +flatbuffers==24.3.7 # via # -r requirements/base.txt # onnxruntime @@ -191,7 +195,7 @@ httpx==0.27.0 # via # -r requirements/test.in # jupyterlab -huggingface-hub==0.20.3 +huggingface-hub==0.21.4 # via # -r requirements/base.txt # timm @@ -216,14 +220,14 @@ iopath==0.1.10 # via # -r requirements/base.txt # layoutparser -ipykernel==6.29.2 +ipykernel==6.29.3 # via # ipywidgets # jupyter # jupyter-console # jupyterlab # qtconsole -ipython==8.22.1 +ipython==8.22.2 # via # execnb # ipykernel @@ -249,7 +253,7 @@ joblib==1.3.2 # via # -r requirements/base.txt # nltk -json5==0.9.17 +json5==0.9.22 # via jupyterlab-server jsonpath-python==1.0.6 # via @@ -266,7 +270,7 @@ jsonschema-specifications==2023.12.1 # via jsonschema jupyter==1.0.0 # via -r requirements/test.in -jupyter-client==8.6.0 +jupyter-client==8.6.1 # via # ipykernel # jupyter-console @@ -275,7 +279,7 @@ jupyter-client==8.6.0 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.7.1 +jupyter-core==5.7.2 # via # ipykernel # jupyter-client @@ -286,24 +290,24 @@ jupyter-core==5.7.1 # nbconvert # nbformat # qtconsole -jupyter-events==0.9.0 +jupyter-events==0.9.1 # via jupyter-server -jupyter-lsp==2.2.2 +jupyter-lsp==2.2.4 # via jupyterlab -jupyter-server==2.12.5 +jupyter-server==2.13.0 # via # jupyter-lsp # jupyterlab # jupyterlab-server # notebook # notebook-shim -jupyter-server-terminals==0.5.2 +jupyter-server-terminals==0.5.3 # via jupyter-server -jupyterlab==4.1.2 +jupyterlab==4.1.4 # via notebook jupyterlab-pygments==0.3.0 # via nbconvert -jupyterlab-server==2.25.3 +jupyterlab-server==2.25.4 # via # jupyterlab # notebook @@ -320,6 +324,7 @@ langdetect==1.0.9 layoutparser[layoutmodels,tesseract]==0.3.4 # via # -r requirements/base.txt + # layoutparser # unstructured-inference lxml==5.1.0 # via @@ -337,11 +342,10 @@ markupsafe==2.1.5 # -r requirements/base.txt # 
jinja2 # nbconvert -marshmallow==3.20.2 +marshmallow==3.21.1 # via # -r requirements/base.txt # dataclasses-json - # dataclasses-json-speakeasy # unstructured-client matplotlib==3.8.3 # via @@ -363,7 +367,7 @@ msg-parser==1.2.0 # via # -r requirements/base.txt # unstructured -mypy==1.8.0 +mypy==1.9.0 # via -r requirements/test.in mypy-extensions==1.0.0 # via @@ -374,13 +378,13 @@ mypy-extensions==1.0.0 # unstructured-client nbclient==0.9.0 # via nbconvert -nbconvert==7.16.1 +nbconvert==7.16.2 # via # jupyter # jupyter-server nbdev==2.3.13 # via -r requirements/test.in -nbformat==5.9.2 +nbformat==5.10.2 # via # jupyter-server # nbclient @@ -396,7 +400,7 @@ nltk==3.8.1 # via # -r requirements/base.txt # unstructured -notebook==7.1.0 +notebook==7.1.1 # via jupyter notebook-shim==0.2.4 # via @@ -443,9 +447,13 @@ openpyxl==3.1.2 # via # -r requirements/base.txt # unstructured +ordered-set==4.1.0 + # via + # -r requirements/base.txt + # deepdiff overrides==7.7.0 # via jupyter-server -packaging==23.2 +packaging==24.0 # via # -r requirements/base.txt # black @@ -468,7 +476,7 @@ packaging==23.2 # transformers # unstructured-client # unstructured-pytesseract -pandas==2.2.0 +pandas==2.2.1 # via # -r requirements/base.txt # layoutparser @@ -484,12 +492,12 @@ pdf2image==1.17.0 # -r requirements/base.txt # layoutparser # unstructured -pdfminer-six==20221105 +pdfminer-six==20231228 # via # -r requirements/base.txt # pdfplumber # unstructured -pdfplumber==0.10.4 +pdfplumber==0.11.0 # via # -r requirements/base.txt # layoutparser @@ -559,11 +567,11 @@ pycparser==2.21 # cffi pycryptodome==3.20.0 # via -r requirements/base.txt -pydantic==2.6.1 +pydantic==2.6.4 # via # -r requirements/base.txt # fastapi -pydantic-core==2.16.2 +pydantic-core==2.16.3 # via # -r requirements/base.txt # pydantic @@ -579,15 +587,16 @@ pypandoc==1.13 # via # -r requirements/base.txt # unstructured -pyparsing==3.1.1 +pyparsing==3.1.2 # via # -r requirements/base.txt # matplotlib -pypdf==4.0.2 +pypdf==4.1.0 # 
via # -r requirements/base.txt # unstructured -pypdfium2==4.27.0 + # unstructured-client +pypdfium2==4.28.0 # via # -r requirements/base.txt # pdfplumber @@ -595,7 +604,7 @@ pytesseract==0.3.10 # via # -r requirements/base.txt # layoutparser -pytest==8.0.1 +pytest==8.1.1 # via # pytest-cov # pytest-mock @@ -603,7 +612,7 @@ pytest-cov==4.1.0 # via -r requirements/test.in pytest-mock==3.12.0 # via -r requirements/test.in -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # -r requirements/base.txt # arrow @@ -658,7 +667,7 @@ qtconsole==5.5.1 # via jupyter qtpy==2.4.1 # via qtconsole -rapidfuzz==3.6.1 +rapidfuzz==3.6.2 # via # -r requirements/base.txt # unstructured @@ -717,7 +726,7 @@ six==1.16.0 # python-dateutil # rfc3339-validator # unstructured-client -sniffio==1.3.0 +sniffio==1.3.1 # via # -r requirements/base.txt # anyio @@ -741,7 +750,7 @@ tabulate==0.9.0 # via # -r requirements/base.txt # unstructured -terminado==0.18.0 +terminado==0.18.1 # via # jupyter-server # jupyter-server-terminals @@ -790,7 +799,7 @@ tqdm==4.66.2 # iopath # nltk # transformers -traitlets==5.14.1 +traitlets==5.14.2 # via # comm # ipykernel @@ -811,9 +820,9 @@ transformers==4.37.1 # via # -r requirements/base.txt # unstructured-inference -types-python-dateutil==2.8.19.20240106 +types-python-dateutil==2.8.19.20240311 # via arrow -typing-extensions==4.9.0 +typing-extensions==4.10.0 # via # -r requirements/base.txt # anyio @@ -835,15 +844,16 @@ typing-inspect==0.9.0 # via # -r requirements/base.txt # dataclasses-json - # dataclasses-json-speakeasy # unstructured-client tzdata==2024.1 # via # -r requirements/base.txt # pandas -unstructured[local-inference]==0.12.4 - # via -r requirements/base.txt -unstructured-client==0.18.0 +unstructured[local-inference]==0.12.5 + # via + # -r requirements/base.txt + # unstructured +unstructured-client==0.21.1 # via # -r requirements/base.txt # unstructured @@ -862,7 +872,7 @@ urllib3==2.2.1 # -r requirements/base.txt # requests # 
unstructured-client -uvicorn==0.27.1 +uvicorn==0.28.0 # via -r requirements/base.txt watchdog==4.0.0 # via nbdev @@ -876,7 +886,7 @@ webencodings==0.5.1 # tinycss2 websocket-client==1.7.0 # via jupyter-server -wheel==0.42.0 +wheel==0.43.0 # via astunparse widgetsnbextension==4.0.10 # via ipywidgets diff --git a/scripts/smoketest.py b/scripts/smoketest.py index 342ce62f9..7096031b5 100644 --- a/scripts/smoketest.py +++ b/scripts/smoketest.py @@ -1,11 +1,15 @@ +import io import os import time +import gzip +import shutil from pathlib import Path +from typing import List, Optional +import tempfile import pytest import requests import pandas as pd -import io API_URL = "http://localhost:8000/general/v0/general" # NOTE(rniko): Skip inference tests if we're running on an emulated architecture @@ -13,25 +17,34 @@ def send_document( - filename, - content_type, - strategy="auto", - output_format="application/json", - pdf_infer_table_structure="false", + filenames: List[str], + filenames_gzipped: Optional[List[str]] = None, + content_type: str = "", + strategy: str = "auto", + output_format: str = "application/json", + pdf_infer_table_structure: str = "false", + uncompressed_content_type: str = "", ): - # Note: `content_type` is not passed into request since fast API will overwrite it. 
- if str(filename).endswith(".gz"): - files = {"files": (str(filename), open(filename, "rb"), "application/gzip")} - else: - files = {"files": (str(filename), open(filename, "rb"))} + if filenames_gzipped is None: + filenames_gzipped = [] + files = [] + for filename in filenames: + files.append(("files", (str(filename), open(filename, "rb"), content_type))) + for filename in filenames_gzipped: + files.append(("files", (str(filename), open(filename, "rb"), "application/gzip"))) + + options = { + "strategy": strategy, + "output_format": output_format, + "pdf_infer_table_structure": pdf_infer_table_structure, + } + if uncompressed_content_type: + options["gz_uncompressed_content_type"] = uncompressed_content_type + return requests.post( API_URL, files=files, - data={ - "strategy": strategy, - "output_format": output_format, - "pdf_infer_table_structure": pdf_infer_table_structure, - }, + data=options, ) @@ -84,25 +97,100 @@ def send_document( ("layout-parser-paper.pdf.gz", "application/gzip"), ], ) -def test_happy_path(example_filename, content_type): +def test_happy_path(example_filename: str, content_type: str): """ For the files in sample-docs, verify that we get a 200 and some structured response """ - test_file = Path("sample-docs") / example_filename + test_file = str(Path("sample-docs") / example_filename) print(f"sending {content_type}") - json_response = send_document(test_file, content_type) + json_response = send_document(filenames=[test_file], content_type=content_type) assert json_response.status_code == 200 assert len(json_response.json()) > 0 assert len("".join(elem["text"] for elem in json_response.json())) > 20 - csv_response = send_document(test_file, content_type, output_format="text/csv") + csv_response = send_document( + filenames=[test_file], content_type=content_type, output_format="text/csv" + ) assert csv_response.status_code == 200 assert len(csv_response.text) > 0 df = pd.read_csv(io.StringIO(csv_response.text)) assert len(df) == 
len(json_response.json()) +@pytest.mark.parametrize("output_format", ["application/json", "text/csv"]) +@pytest.mark.parametrize( + "filenames_to_gzip, filenames_verbatim, uncompressed_content_type", + [ + (["fake-html.html"], [], "text/html"), + (["stanley-cups.csv"], [], "application/csv"), + (["fake.doc"], [], "application/msword"), + # compressed and uncompressed + (["layout-parser-paper-fast.pdf"], ["list-item-example.pdf"], "application/pdf"), + (["fake-email.eml"], ["fake-email-image-embedded.eml"], "message/rfc822"), + # compressed and uncompressed + # empty content-type means that API should detect filetype after decompressing. + (["layout-parser-paper-fast.pdf"], ["list-item-example.pdf"], ""), + (["fake-email.eml"], ["fake-email-image-embedded.eml"], ""), + ], +) +def test_gzip_sending( + output_format: str, + filenames_to_gzip: List[str], + filenames_verbatim: List[str], + uncompressed_content_type: str, +): + temp_files = {} + + for filename in filenames_to_gzip: + gz_file_extension = f"{Path(filename).suffix}.gz" + temp_file = tempfile.NamedTemporaryFile(suffix=gz_file_extension) + full_path = Path("sample-docs") / filename + gzip_file(str(full_path), temp_file.name) + temp_files[filename] = temp_file + filenames_gzipped = [temp_file.name for temp_file in temp_files.values()] + + filenames = [] + for filename in filenames_verbatim: + filenames.append(str(Path("sample-docs") / filename)) + + json_response = send_document( + filenames, + filenames_gzipped, + content_type=uncompressed_content_type, + uncompressed_content_type=uncompressed_content_type, + ) + assert json_response.status_code == 200, json_response.text + json_content = json_response.json() + assert len(json_content) > 0 + if len(filenames_gzipped + filenames) > 1: + for file in json_content: + assert len("".join(elem["text"] for elem in file)) > 20 + else: + assert len("".join(elem["text"] for elem in json_content)) > 20 + + csv_response = send_document( + filenames, + filenames_gzipped, + 
content_type=uncompressed_content_type, + uncompressed_content_type=uncompressed_content_type, + output_format="text/csv", + ) + assert csv_response.status_code == 200 + assert len(csv_response.text) > 0 + df = pd.read_csv(io.StringIO(csv_response.text)) + if len(filenames_gzipped + filenames) > 1: + json_size = 0 + for file in json_content: + json_size += len(file) + assert len(df) == json_size + else: + assert len(df) == len(json_content) + + for filename in filenames_to_gzip: + temp_files[filename].close() + + @pytest.mark.skipif(skip_inference_tests, reason="emulated architecture") def test_strategy_performance(): """ @@ -110,22 +198,24 @@ def test_strategy_performance(): is significantly faster than the hi_res strategy """ performance_ratio = 4 - test_file = Path("sample-docs") / "layout-parser-paper.pdf" + test_file = str(Path("sample-docs") / "layout-parser-paper.pdf") start_time = time.monotonic() - response = send_document(test_file, content_type="application/pdf", strategy="hi_res") + response = send_document( + filenames=[test_file], content_type="application/pdf", strategy="hi_res" + ) hi_res_time = time.monotonic() - start_time assert response.status_code == 200 start_time = time.monotonic() - response = send_document(test_file, content_type="application/pdf", strategy="auto") + response = send_document(filenames=[test_file], content_type="application/pdf", strategy="auto") auto_time = time.monotonic() - start_time assert response.status_code == 200 assert hi_res_time > performance_ratio * auto_time start_time = time.monotonic() - response = send_document(test_file, content_type="application/pdf", strategy="fast") + response = send_document(filenames=[test_file], content_type="application/pdf", strategy="fast") fast_time = time.monotonic() - start_time assert response.status_code == 200 @@ -142,14 +232,14 @@ def test_strategy_performance(): ("hi_res", "False", 0), ], ) -def test_table_support(strategy, pdf_infer_table_structure, expected_table_num): 
+def test_table_support(strategy: str, pdf_infer_table_structure: str, expected_table_num: int): """ Test that table extraction works on hi_res strategy """ - test_file = Path("sample-docs") / "layout-parser-paper.pdf" + test_file = str(Path("sample-docs") / "layout-parser-paper.pdf") response = send_document( - test_file, - "application/pdf", + filenames=[test_file], + content_type="application/pdf", strategy=strategy, pdf_infer_table_structure=pdf_infer_table_structure, ) @@ -166,3 +256,9 @@ def test_table_support(strategy, pdf_infer_table_structure, expected_table_num): # Note(austin) - table output has changed - this line isn't returned # assert "Layouts of scanned modern magazines and scientific reports" in extracted_tables[0] assert "Layouts of history" in extracted_tables[0] + + +def gzip_file(in_filepath: str, out_filepath: str): + with open(in_filepath, "rb") as f_in: + with gzip.open(out_filepath, "wb", compresslevel=1) as f_out: + shutil.copyfileobj(f_in, f_out) diff --git a/scripts/version-sync.sh b/scripts/version-sync.sh index e8888efa5..4a62d26e3 100755 --- a/scripts/version-sync.sh +++ b/scripts/version-sync.sh @@ -13,12 +13,12 @@ function usage { } function getopts-extra () { - declare i=1 + declare -i i=1 # if the next argument is not an option, then append it to array OPTARG while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do OPTARG[i]=${!OPTIND} - i+=1 - OPTIND+=1 + ((i += 1)) + ((OPTIND += 1)) done } diff --git a/test_general/api/test_gzip.py b/test_general/api/test_gzip.py new file mode 100644 index 000000000..84459a6c3 --- /dev/null +++ b/test_general/api/test_gzip.py @@ -0,0 +1,164 @@ +import gzip +import shutil +import io +import tempfile +from pathlib import Path +from typing import List + +import httpx +import pandas as pd +import pytest +from fastapi.testclient import TestClient +from deepdiff import DeepDiff + +from prepline_general.api.app import app + +MAIN_API_ROUTE = "general/v0/general" + + 
+@pytest.mark.parametrize("output_format", ["application/json", "text/csv"]) +@pytest.mark.parametrize( + "filenames_to_gzip, filenames_verbatim, uncompressed_content_type", + [ + (["fake-html.html"], [], "text/html"), + (["stanley-cups.csv"], [], "application/csv"), + (["fake.doc"], [], "application/msword"), + (["layout-parser-paper-fast.pdf"], [], "application/pdf"), + (["fake-email-attachment.eml", "fake-email.eml"], [], "message/rfc822"), + ( + ["fake-email-attachment.eml", "fake-email.eml", "announcement.eml"], + [], + "message/rfc822", + ), + (["layout-parser-paper-fast.pdf", "list-item-example.pdf"], [], "application/pdf"), + # now the same but without explicit content type + # to make the system guess the un-gzipped type based on content. + (["fake-html.html"], [], ""), + (["fake-email-attachment.eml", "fake-email.eml"], [], ""), + (["layout-parser-paper-fast.pdf", "list-item-example.pdf"], [], ""), + # mix of compressed and uncompressed + (["layout-parser-paper-fast.pdf"], ["list-item-example.pdf"], "application/pdf"), + # mix of compressed and uncompressed, and guessing of content type + (["layout-parser-paper-fast.pdf"], ["list-item-example.pdf"], ""), + # have to use OCR which is slow, so minimum cases + (["embedded-images-tables.jpg"], ["english-and-korean.png"], "image/png"), + (["embedded-images-tables.jpg"], ["english-and-korean.png"], ""), + ], +) +def test_gzipped_files_are_parsed_like_original( + output_format: str, + filenames_to_gzip: List[str], + filenames_verbatim: List[str], + uncompressed_content_type: str, +): + """ + Verify that API supports un-gzipping and correctly interprets gz_uncompressed_content_type, + by comparing response to directly parsing the same files. + The one thing which changes is the filenames in metadata, which have to be ignored. 
+ """ + client = TestClient(app) + gz_options = { + "gz_uncompressed_content_type": ( + uncompressed_content_type if uncompressed_content_type else None + ), + "output_format": output_format, + } + response1 = get_gzipped_response( + client, filenames_to_gzip, filenames_verbatim, gz_options, uncompressed_content_type + ) + response2 = call_api( + client, + [], + filenames_to_gzip + filenames_verbatim, + uncompressed_content_type, + {"output_format": output_format}, + ) + compare_responses( + response1, response2, output_format, len(filenames_to_gzip + filenames_verbatim) + ) + + +def compare_responses( + response1: httpx.Response, response2: httpx.Response, output_format: str, files_count: int +) -> None: + if output_format == "application/json": + if files_count == 1: + exclude_regex_paths = r"root\[\d+\]\['metadata'\]\['filename'\]" + else: + exclude_regex_paths = r"root\[\d+\]\[\d+\]\['metadata'\]\['filename'\]" + diff = DeepDiff( + t1=response1.json(), + t2=response2.json(), + exclude_regex_paths=exclude_regex_paths, + ) + assert len(diff) == 0 + else: + df1 = pd.read_csv(io.StringIO(response1.text)) + df2 = pd.read_csv(io.StringIO(response2.text)) + diff = DeepDiff( + t1=df1.to_dict(), t2=df2.to_dict(), exclude_regex_paths=r"root\['filename'\]\[\d+\]" + ) + assert len(diff) == 0 + + +def call_api( + client: TestClient, + filenames_gzipped: List[str], + filenames_verbatim: List[str], + content_type: str, + options: dict, + samples_dir: str = "sample-docs", +) -> httpx.Response: + files = [] + for filename in filenames_gzipped: + full_path = Path(samples_dir) / filename + files.append(("files", (str(full_path), open(full_path, "rb"), "application/gzip"))) + + for filename in filenames_verbatim: + full_path = Path(samples_dir) / filename + files.append(("files", (str(full_path), open(full_path, "rb"), content_type))) + + response = client.post( + MAIN_API_ROUTE, + files=files, + data=options, + ) + assert response.status_code == 200, response.text + assert 
len(response.text) > 0 + return response + + +def get_gzipped_response( + client: TestClient, + filenames_to_gzip: List[str], + filenames_verbatim: List[str], + options: dict, + content_type: str, + samples_dir: str = "sample-docs", +) -> httpx.Response: + """ + G-zips the filenames_to_gzip into temporary .gz file and sends to API, + along with filenames_no_gzip. + """ + temp_files = {} + for filename in filenames_to_gzip: + gz_file_extension = f"{Path(filename).suffix}.gz" + temp_file = tempfile.NamedTemporaryFile(suffix=gz_file_extension) + full_path = Path(samples_dir) / filename + gzip_file(str(full_path), temp_file.name) + temp_files[filename] = temp_file + + filenames_gzipped = [temp_file.name for temp_file in temp_files.values()] + + response = call_api(client, filenames_gzipped, filenames_verbatim, content_type, options) + + for filename in filenames_to_gzip: + temp_files[filename].close() + + return response + + +def gzip_file(in_filepath: str, out_filepath: str): + with open(in_filepath, "rb") as f_in: + with gzip.open(out_filepath, "wb", compresslevel=1) as f_out: + shutil.copyfileobj(f_in, f_out)