Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
5ef6227
test: add first test.
hubert-rutkowski85 Feb 28, 2024
af5270a
test: extend test for two cases of output_format, and situation when …
hubert-rutkowski85 Feb 28, 2024
986be77
test: lint fix.
hubert-rutkowski85 Feb 28, 2024
2d156c1
test: multiple compressed files at once.
hubert-rutkowski85 Feb 28, 2024
d3ffd61
test: refactor structure to one function which handles both single an…
hubert-rutkowski85 Feb 29, 2024
d57a863
test: final structure, which tests everything.
hubert-rutkowski85 Feb 29, 2024
dde2171
test: improve code quality.
hubert-rutkowski85 Feb 29, 2024
77ae648
doc: add example curl for gzips in readme.
hubert-rutkowski85 Feb 29, 2024
54c1de4
chore: black formatting.
hubert-rutkowski85 Feb 29, 2024
05ad722
fix: make the backend be better at recognizing gzip files not only th…
hubert-rutkowski85 Feb 29, 2024
098b9e6
refactor: add typings to smoke tests. Improve gz test so that it does…
hubert-rutkowski85 Feb 29, 2024
5f43c88
test: smoke tests for gzip.
hubert-rutkowski85 Feb 29, 2024
4a347a5
ci: fix for mypy.
hubert-rutkowski85 Feb 29, 2024
06312bd
test: add one more case to smoketests.
hubert-rutkowski85 Feb 29, 2024
0e28379
ci: temporary fix for tests (WILL BE REVERTED)
hubert-rutkowski85 Feb 29, 2024
9a6bad1
chore: pip-compile with proper venv
hubert-rutkowski85 Mar 4, 2024
7b0dee1
refactor: improve loop structure. Grammar fix.
hubert-rutkowski85 Mar 4, 2024
153fb56
chore: review fix.
hubert-rutkowski85 Mar 4, 2024
96c57a0
Merge branch 'main' into 86-ability-to-accept-gzip-compressed-files-h…
hubert-rutkowski85 Mar 4, 2024
27cacf5
chore: bring back requirements from main
hubert-rutkowski85 Mar 12, 2024
0c09dd9
Merge branch 'main' into 86-ability-to-accept-gzip-compressed-files-h…
hubert-rutkowski85 Mar 12, 2024
d20bb91
recompile requirements
awalker4 Mar 12, 2024
307d24c
ci: fix shellcheck lint
hubert-rutkowski85 Mar 13, 2024
4e4d1bb
ci: fix gzip test
hubert-rutkowski85 Mar 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ for PDFs and Images, which are `pdf`, `jpg` and `png`. Again, please note that t
You can specify the encoding to use to decode the text input. If no value is provided, utf-8 will be used.

```
curl -X 'POST'
curl -X 'POST' \
'https://api.unstructured.io/general/v0/general' \
-H 'accept: application/json' \
-H 'Content-Type: multipart/form-data' \
Expand All @@ -176,6 +176,23 @@ curl -X 'POST'
| jq -C . | less -R
```

#### Gzipped files

You can send a gzipped file, and the API will decompress it before processing.

```
curl -X 'POST' \
'https://api.unstructured.io/general/v0/general' \
-H 'accept: application/json' \
-H 'Content-Type: multipart/form-data' \
-F 'gz_uncompressed_content_type=application/pdf' \
-F 'files=@sample-docs/layout-parser-paper.pdf.gz'
```

If the `gz_uncompressed_content_type` field is set, the API uses its value as the content type of every
file obtained by decompressing the .gz files sent in a single batch. If it is not set, the API falls back
to various heuristics to detect the file type of each file after decompressing it from .gz.

#### XML Tags

When processing XML documents, set the `xml_keep_tags` parameter to `true` to retain the XML tags in the output. If not specified, it will simply extract the text from within the tags.
Expand Down
10 changes: 5 additions & 5 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,11 +749,11 @@ def general_partition(
chunking_strategy = _validate_chunking_strategy(form_params.chunking_strategy)

# -- unzip any uploaded files that need it --
for file_index in range(len(files)):
if files[file_index].content_type == "application/gzip":
files[file_index] = ungz_file(
files[file_index], form_params.gz_uncompressed_content_type
)
for idx, file in enumerate(files):
is_content_type_gz = file.content_type == "application/gzip"
is_extension_gz = file.filename and file.filename.endswith(".gz")
if is_content_type_gz or is_extension_gz:
files[idx] = ungz_file(file, form_params.gz_uncompressed_content_type)

def response_generator(is_multipart: bool):
for file in files:
Expand Down
59 changes: 33 additions & 26 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ beautifulsoup4==4.12.3
certifi==2024.2.2
# via
# requests
# unstructured
# unstructured-client
cffi==1.16.0
# via cryptography
Expand All @@ -28,6 +29,7 @@ charset-normalizer==3.3.2
# via
# pdfminer-six
# requests
# unstructured
# unstructured-client
click==8.1.3
# via
Expand All @@ -38,13 +40,15 @@ coloredlogs==15.0.1
# via onnxruntime
contourpy==1.2.0
# via matplotlib
cryptography==42.0.4
cryptography==42.0.5
# via pdfminer-six
cycler==0.12.1
# via matplotlib
dataclasses-json==0.6.4
# via unstructured
dataclasses-json-speakeasy==0.5.11
# via
# unstructured
# unstructured-client
deepdiff==6.7.1
# via unstructured-client
deprecated==1.2.14
# via pikepdf
Expand All @@ -56,7 +60,7 @@ et-xmlfile==1.1.0
# via openpyxl
exceptiongroup==1.2.0
# via anyio
fastapi==0.109.2
fastapi==0.110.0
# via -r requirements/base.in
filelock==3.13.1
# via
Expand All @@ -65,7 +69,7 @@ filelock==3.13.1
# transformers
filetype==1.2.0
# via unstructured
flatbuffers==23.5.26
flatbuffers==24.3.7
# via onnxruntime
fonttools==4.49.0
# via matplotlib
Expand All @@ -75,7 +79,7 @@ fsspec==2024.2.0
# torch
h11==0.14.0
# via uvicorn
huggingface-hub==0.20.3
huggingface-hub==0.21.4
# via
# timm
# tokenizers
Expand Down Expand Up @@ -112,10 +116,9 @@ markdown==3.5.2
# via unstructured
markupsafe==2.1.5
# via jinja2
marshmallow==3.20.2
marshmallow==3.21.1
# via
# dataclasses-json
# dataclasses-json-speakeasy
# unstructured-client
matplotlib==3.8.3
# via pycocotools
Expand Down Expand Up @@ -163,7 +166,9 @@ opencv-python==4.9.0.80
# unstructured-inference
openpyxl==3.1.2
# via unstructured
packaging==23.2
ordered-set==4.1.0
# via deepdiff
packaging==24.0
# via
# huggingface-hub
# marshmallow
Expand All @@ -174,19 +179,19 @@ packaging==23.2
# transformers
# unstructured-client
# unstructured-pytesseract
pandas==2.2.0
pandas==2.2.1
# via
# layoutparser
# unstructured
pdf2image==1.17.0
# via
# layoutparser
# unstructured
pdfminer-six==20221105
pdfminer-six==20231228
# via
# pdfplumber
# unstructured
pdfplumber==0.10.4
pdfplumber==0.11.0
# via layoutparser
pikepdf==8.13.0
# via unstructured
Expand Down Expand Up @@ -218,23 +223,24 @@ pycparser==2.21
# via cffi
pycryptodome==3.20.0
# via -r requirements/base.in
pydantic==2.6.1
pydantic==2.6.4
# via fastapi
pydantic-core==2.16.2
pydantic-core==2.16.3
# via pydantic
pypandoc==1.13
# via unstructured
pyparsing==3.1.1
pyparsing==3.1.2
# via matplotlib
pypdf==4.0.2
pypdf==4.1.0
# via
# -r requirements/base.in
# unstructured
pypdfium2==4.27.0
# unstructured-client
pypdfium2==4.28.0
# via pdfplumber
pytesseract==0.3.10
# via layoutparser
python-dateutil==2.8.2
python-dateutil==2.9.0.post0
# via
# matplotlib
# pandas
Expand All @@ -258,7 +264,7 @@ pyyaml==6.0.1
# omegaconf
# timm
# transformers
rapidfuzz==3.6.1
rapidfuzz==3.6.2
# via
# unstructured
# unstructured-inference
Expand Down Expand Up @@ -287,7 +293,7 @@ six==1.16.0
# langdetect
# python-dateutil
# unstructured-client
sniffio==1.3.0
sniffio==1.3.1
# via anyio
soupsieve==2.5
# via beautifulsoup4
Expand Down Expand Up @@ -322,7 +328,7 @@ tqdm==4.66.2
# transformers
transformers==4.37.1
# via unstructured-inference
typing-extensions==4.9.0
typing-extensions==4.10.0
# via
# anyio
# fastapi
Expand All @@ -339,13 +345,14 @@ typing-extensions==4.9.0
typing-inspect==0.9.0
# via
# dataclasses-json
# dataclasses-json-speakeasy
# unstructured-client
tzdata==2024.1
# via pandas
unstructured[local-inference]==0.12.4
# via -r requirements/base.in
unstructured-client==0.18.0
unstructured[local-inference]==0.12.5
# via
# -r requirements/base.in
# unstructured
unstructured-client==0.21.1
# via unstructured
unstructured-inference==0.7.23
# via unstructured
Expand All @@ -355,7 +362,7 @@ urllib3==2.2.1
# via
# requests
# unstructured-client
uvicorn==0.27.1
uvicorn==0.28.0
# via -r requirements/base.in
wrapt==1.16.0
# via
Expand Down
1 change: 1 addition & 0 deletions requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ pytest-mock
nbdev
jupyter
httpx
deepdiff
Loading