Skip to content

Commit

Permalink
feat: add google cloud storage connector (#746)
Browse files Browse the repository at this point in the history
  • Loading branch information
potter-potter committed Jun 21, 2023
1 parent 21c346d commit 3b472cb
Show file tree
Hide file tree
Showing 34 changed files with 514 additions and 54 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ jobs:
make install-ingest-s3
make install-ingest-azure
make install-ingest-discord
make install-ingest-gcs
make install-ingest-google-drive
make install-ingest-github
make install-ingest-gitlab
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/ingest-test-fixtures-update-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ jobs:
make install-ingest-s3
make install-ingest-azure
make install-ingest-discord
make install-ingest-gcs
make install-ingest-google-drive
make install-ingest-github
make install-ingest-gitlab
Expand Down
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
## 0.7.8-dev0

### Enhancements

* Adds recursive functionality to all fsspec connectors
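* Adds generic `--recursive` ingest flag (the example scripts now use it in place of the connector-specific `--drive-recursive` and `--local-recursive` flags)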

### Features

* Adds Google Cloud Storage (GCS) connector

### Fixes

## 0.7.7

### Enhancements
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ include requirements/base.in
include requirements/huggingface.in
include requirements/local-inference.in
include requirements/ingest-s3.in
include requirements/ingest-gcs.in
include requirements/ingest-azure.in
include requirements/ingest-discord.in
include requirements/ingest-github.in
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ install-ingest-google-drive:
install-ingest-s3:
python3 -m pip install -r requirements/ingest-s3.txt

.PHONY: install-ingest-gcs
install-ingest-gcs:
python3 -m pip install -r requirements/ingest-gcs.txt

.PHONY: install-ingest-azure
install-ingest-azure:
python3 -m pip install -r requirements/ingest-azure.txt
Expand Down Expand Up @@ -117,6 +121,7 @@ pip-compile:
# sphinx docs looks for additional requirements
cp requirements/build.txt docs/requirements.txt
pip-compile --upgrade requirements/ingest-s3.in
pip-compile --upgrade requirements/ingest-gcs.in
pip-compile --upgrade requirements/ingest-azure.in
pip-compile --upgrade requirements/ingest-discord.in
pip-compile --upgrade requirements/ingest-reddit.in
Expand Down
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.6.0
importlib-metadata==6.7.0
# via sphinx
jinja2==3.1.2
# via sphinx
Expand Down
2 changes: 1 addition & 1 deletion examples/ingest/azure/ingest.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env bash

# Processes all the files from abfs://container1/ in azureunstructured1 account,
# using the `unstructured` library.
# using the `unstructured` library.

# Structured outputs are stored in azure-ingest-output/

Expand Down
16 changes: 16 additions & 0 deletions examples/ingest/google_cloud_storage/ingest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

# Recursively processes the files in the nested folder structure under
# gs://utic-test-ingest-fixtures-public/ using the `unstructured` library, in 2 processes.

# Structured outputs are stored in gcs-output/

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/../../.. || exit 1

PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url gs://utic-test-ingest-fixtures-public/ \
--structured-output-dir gcs-output \
--num-processes 2 \
--recursive \
--verbose
2 changes: 1 addition & 1 deletion examples/ingest/google_drive/ingest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--drive-service-account-key "<path to drive service account key>" \
--structured-output-dir google-drive-ingest-output \
--num-processes 2 \
--drive-recursive \
--recursive \
--verbose \
# --extension ".docx" # Ensures only .docx files are processed.

Expand Down
2 changes: 1 addition & 1 deletion examples/ingest/local/ingest.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--local-input-path example-docs \
--structured-output-dir local-ingest-output \
--num-processes 2 \
--local-recursive \
--recursive \
--verbose \

# Alternatively, you can call it using:
Expand Down
4 changes: 2 additions & 2 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#
anyio==3.7.0
# via httpcore
argilla==1.9.0
argilla==1.10.0
# via -r requirements/base.in
backoff==2.2.1
# via argilla
Expand Down Expand Up @@ -51,7 +51,7 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.6.0
importlib-metadata==6.7.0
# via markdown
joblib==1.2.0
# via nltk
Expand Down
2 changes: 1 addition & 1 deletion requirements/build.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.6.0
importlib-metadata==6.7.0
# via sphinx
jinja2==3.1.2
# via sphinx
Expand Down
16 changes: 8 additions & 8 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ idna==3.4
# -c requirements/test.txt
# anyio
# jsonschema
importlib-metadata==6.6.0
importlib-metadata==6.7.0
# via
# -c requirements/base.txt
# jupyter-client
Expand Down Expand Up @@ -113,7 +113,7 @@ jinja2==3.1.2
# nbclassic
# nbconvert
# notebook
jsonpointer==2.3
jsonpointer==2.4
# via jsonschema
jsonschema[format-nongpl]==4.17.3
# via
Expand All @@ -132,7 +132,7 @@ jupyter-client==8.2.0
# qtconsole
jupyter-console==6.6.3
# via jupyter
jupyter-core==5.3.0
jupyter-core==5.3.1
# via
# -c requirements/constraints.in
# ipykernel
Expand Down Expand Up @@ -165,13 +165,13 @@ matplotlib-inline==0.1.6
# via
# ipykernel
# ipython
mistune==2.0.5
mistune==3.0.1
# via nbconvert
nbclassic==1.0.0
# via notebook
nbclient==0.8.0
# via nbconvert
nbconvert==7.5.0
nbconvert==7.6.0
# via
# jupyter
# jupyter-server
Expand Down Expand Up @@ -219,7 +219,7 @@ pip-tools==6.13.0
# via -r requirements/dev.in
pkgutil-resolve-name==1.3.10
# via jsonschema
platformdirs==3.5.3
platformdirs==3.6.0
# via
# -c requirements/test.txt
# jupyter-core
Expand Down Expand Up @@ -359,7 +359,7 @@ typing-extensions==4.6.3
# ipython
uri-template==1.2.0
# via jsonschema
virtualenv==20.23.0
virtualenv==20.23.1
# via pre-commit
wcwidth==0.2.6
# via prompt-toolkit
Expand All @@ -369,7 +369,7 @@ webencodings==0.5.1
# via
# bleach
# tinycss2
websocket-client==1.5.3
websocket-client==1.6.0
# via jupyter-server
wheel==0.40.0
# via
Expand Down
4 changes: 4 additions & 0 deletions requirements/ingest-gcs.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-c constraints.in
-c base.txt
gcsfs
fsspec
105 changes: 105 additions & 0 deletions requirements/ingest-gcs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile requirements/ingest-gcs.in
#
aiohttp==3.8.4
# via gcsfs
aiosignal==1.3.1
# via aiohttp
async-timeout==4.0.2
# via aiohttp
attrs==23.1.0
# via aiohttp
cachetools==5.3.1
# via google-auth
certifi==2023.5.7
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
charset-normalizer==3.1.0
# via
# -c requirements/base.txt
# aiohttp
# requests
decorator==5.1.1
# via gcsfs
frozenlist==1.3.3
# via
# aiohttp
# aiosignal
fsspec==2023.6.0
# via
# -r requirements/ingest-gcs.in
# gcsfs
gcsfs==2023.6.0
# via -r requirements/ingest-gcs.in
google-api-core==2.11.1
# via
# google-cloud-core
# google-cloud-storage
google-auth==2.20.0
# via
# gcsfs
# google-api-core
# google-auth-oauthlib
# google-cloud-core
# google-cloud-storage
google-auth-oauthlib==1.0.0
# via gcsfs
google-cloud-core==2.3.2
# via google-cloud-storage
google-cloud-storage==2.9.0
# via gcsfs
google-crc32c==1.5.0
# via google-resumable-media
google-resumable-media==2.5.0
# via google-cloud-storage
googleapis-common-protos==1.59.1
# via google-api-core
idna==3.4
# via
# -c requirements/base.txt
# requests
# yarl
multidict==6.0.4
# via
# aiohttp
# yarl
oauthlib==3.2.2
# via requests-oauthlib
protobuf==3.20.3
# via
# -c requirements/constraints.in
# google-api-core
pyasn1==0.5.0
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.3.0
# via google-auth
requests==2.31.0
# via
# -c requirements/base.txt
# gcsfs
# google-api-core
# google-cloud-storage
# requests-oauthlib
requests-oauthlib==1.3.1
# via google-auth-oauthlib
rsa==4.9
# via google-auth
six==1.16.0
# via
# -c requirements/base.txt
# google-auth
urllib3==1.26.16
# via
# -c requirements/base.txt
# -c requirements/constraints.in
# google-auth
# requests
yarl==1.9.2
# via aiohttp
6 changes: 3 additions & 3 deletions requirements/ingest-google-drive.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ charset-normalizer==3.1.0
# via
# -c requirements/base.txt
# requests
google-api-core==2.11.0
google-api-core==2.11.1
# via google-api-python-client
google-api-python-client==2.89.0
google-api-python-client==2.90.0
# via -r requirements/ingest-google-drive.in
google-auth==2.20.0
# via
Expand Down Expand Up @@ -47,7 +47,7 @@ pyasn1==0.5.0
# rsa
pyasn1-modules==0.3.0
# via google-auth
pyparsing==3.0.9
pyparsing==3.1.0
# via httplib2
requests==2.31.0
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest-reddit.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ urllib3==1.26.16
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
websocket-client==1.5.3
websocket-client==1.6.0
# via praw
4 changes: 2 additions & 2 deletions requirements/local-inference.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ numpy==1.23.5
# transformers
omegaconf==2.3.0
# via effdet
onnxruntime==1.15.0
onnxruntime==1.15.1
# via unstructured-inference
opencv-python==4.7.0.72
# via
Expand Down Expand Up @@ -136,7 +136,7 @@ pycparser==2.21
# via
# -c requirements/base.txt
# cffi
pyparsing==3.0.9
pyparsing==3.1.0
# via matplotlib
pytesseract==0.3.10
# via layoutparser
Expand Down
Loading

0 comments on commit 3b472cb

Please sign in to comment.