-
Notifications
You must be signed in to change notification settings - Fork 613
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add google cloud storage connector #746
Changes from 12 commits
28d8d60
43ef3bc
56e4399
e2de170
2fa6fb5
0e229ef
8825035
639a5d9
9eda17a
8cf6242
79a4706
eddc385
a976fa7
d0369a1
612aa94
2284baf
66a063c
40c6d77
9c9b349
46aee8e
902076f
07b8536
c3ce8d8
a9f48c0
53a4cc8
aeeb907
fc9775e
b58207c
b45ddeb
d22df8e
959742a
06bd011
173eebe
11b69a7
ad5282e
da99841
d33f0de
6f806af
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/usr/bin/env bash

# Example run of Unstructured's ingest CLI against gs://unstructured_public/:
# walks the bucket's nested folders recursively, using 2 processes.
#
# Structured outputs are stored in gcs-output/

# Locate the directory containing this script, then move three levels up so
# the relative paths used below resolve from there.
here="$(dirname -- "${BASH_SOURCE[0]}")"
script_dir="$(cd -- "$here" &>/dev/null && pwd)"
if ! cd "$script_dir"/../../..; then
  exit 1
fi

# Arguments are collected in an array so each option sits on its own line
# without backslash continuations.
ingest_args=(
  --remote-url gs://unstructured_public/
  --structured-output-dir gcs-output
  --num-processes 2
  --recursive
  --verbose
)

PYTHONPATH=. ./unstructured/ingest/main.py "${ingest_args[@]}"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
-c constraints.in | ||
-c base.txt | ||
gcsfs | ||
fsspec |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
[ | ||
{ | ||
"element_id": "855ecc17dee3ddb9d89d8f48740c9853", | ||
"text": "MIME-Version: 1.0 Date: Fri, 16 Dec 2022 17:04:16 -0500 Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com> Subject: Test Email From: Matthew Robinson <mrobinson@unstructured.io> To: Matthew Robinson <mrobinson@unstructured.io> Content-Type: multipart/alternative; boundary=\"00000000000095c9b205eff92630\"", | ||
"type": "UncategorizedText", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "c3db8e6c584627c190cc8e1750bdac9c", | ||
"text": "-00000000000095c9b205eff92630", | ||
"type": "ListItem", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "b91c2196ba2a3190ec703710671918b2", | ||
"text": "Content-Type: text/plain; charset=\"UTF-8\"", | ||
"type": "Title", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", | ||
"text": "This is a test email to use for unit tests.", | ||
"type": "NarrativeText", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "9c218520320f238595f1fde74bdd137d", | ||
"text": "Important points:", | ||
"type": "Title", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "8522061b991b1db70453502d328fe07e", | ||
"text": "Roses are red", | ||
"type": "ListItem", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "c3c4527761d4e4b8d0a4c4a0d46954c8", | ||
"text": "Violets are blue", | ||
"type": "ListItem", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "c3db8e6c584627c190cc8e1750bdac9c", | ||
"text": "-00000000000095c9b205eff92630", | ||
"type": "ListItem", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "c30942ddb17655a8226bf9d50b5c2fb2", | ||
"text": "Content-Type: text/html; charset=\"UTF-8\"", | ||
"type": "Title", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "9f5297daa98b670a4529a64fb1e29067", | ||
"text": "<div dir=\"ltr\"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>", | ||
"type": "NarrativeText", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "f09df5cef9c41280d2d859b808e5f658", | ||
"text": "-00000000000095c9b205eff92630--", | ||
"type": "ListItem", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
} | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
[ | ||
{ | ||
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396", | ||
"text": "This is a test document to use for unit tests.", | ||
"type": "NarrativeText", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "a9d4657034aa3fdb5177f1325e912362", | ||
"text": "Doylestown, PA 18901", | ||
"type": "Address", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "9c218520320f238595f1fde74bdd137d", | ||
"text": "Important points:", | ||
"type": "Title", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414", | ||
"text": "Hamburgers are delicious", | ||
"type": "ListItem", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "fc1adcb8eaceac694e500a103f9f698f", | ||
"text": "Dogs are the best", | ||
"type": "ListItem", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
}, | ||
{ | ||
"element_id": "0b61e826b1c4ab05750184da72b89f83", | ||
"text": "I love fuzzy blankets", | ||
"type": "ListItem", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/plain", | ||
"page_number": 1 | ||
} | ||
} | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
[ | ||
{ | ||
"element_id": "c08fcabe68ba13b7a7cc6592bd5513a8", | ||
"text": "January 2023(Someone fed my essays into GPT to make something that could answer\nquestions based on them, then asked it where good ideas come from. The\nanswer was ok, but not what I would have said. This is what I would have said.)The way to get new ideas is to notice anomalies: what seems strange,\nor missing, or broken? You can see anomalies in everyday life (much\nof standup comedy is based on this), but the best place to look for\nthem is at the frontiers of knowledge.Knowledge grows fractally.\nFrom a distance its edges look smooth, but when you learn enough\nto get close to one, you'll notice it's full of gaps. These gaps\nwill seem obvious; it will seem inexplicable that no one has tried\nx or wondered about y. In the best case, exploring such gaps yields\nwhole new fractal buds.", | ||
"type": "NarrativeText", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/html", | ||
"page_number": 1 | ||
} | ||
} | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
[ | ||
{ | ||
"element_id": "f8db6c6e535705336195aa2c1d23d414", | ||
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 13\n \n \n", | ||
"type": "Table", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | ||
"page_number": 1, | ||
"page_name": "Stanley Cups", | ||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" | ||
} | ||
}, | ||
{ | ||
"element_id": "20f5163a43ac6eb04a40d269d3ad0663", | ||
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 0\n \n \n", | ||
"type": "Table", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | ||
"page_number": 2, | ||
"page_name": "Stanley Cups Since 67", | ||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>" | ||
} | ||
} | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
[ | ||
{ | ||
"element_id": "f8db6c6e535705336195aa2c1d23d414", | ||
"text": "\n \n \n Team\n Location\n Stanley Cups\n \n \n Blues\n STL\n 1\n \n \n Flyers\n PHI\n 2\n \n \n Maple Leafs\n TOR\n 13\n \n \n", | ||
"type": "Table", | ||
"metadata": { | ||
"data_source": {}, | ||
"filetype": "text/csv", | ||
"page_number": 1, | ||
"text_as_html": "<table border=\"1\" class=\"dataframe\">\n <tbody>\n <tr>\n <td>Team</td>\n <td>Location</td>\n <td>Stanley Cups</td>\n </tr>\n <tr>\n <td>Blues</td>\n <td>STL</td>\n <td>1</td>\n </tr>\n <tr>\n <td>Flyers</td>\n <td>PHI</td>\n <td>2</td>\n </tr>\n <tr>\n <td>Maple Leafs</td>\n <td>TOR</td>\n <td>13</td>\n </tr>\n </tbody>\n</table>" | ||
} | ||
} | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#!/usr/bin/env bash

# Abort on the first failing command during setup.
set -e

# Work from the parent of the directory that contains this script.
here="$(dirname -- "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd -- "$here" &>/dev/null && pwd)"
cd "$SCRIPT_DIR"/.. || exit 1

# Sanity-check the checked-in fixtures before running anything: count the
# regular files that pass find's "-size +2" filter and require exactly 5.
fixture_count="$(find test_unstructured_ingest/expected-structured-output/google-cloud-storage/ -type f -size +2 | wc -l)"
if [[ "$fixture_count" -ne 5 ]]; then
  echo "The test fixtures in test_unstructured_ingest/expected-structured-output/ look suspicious. At least one of the files is too small."
  echo "Did you overwrite test fixtures with bad outputs?"
  exit 1
fi
|
||
PYTHONPATH=. ./unstructured/ingest/main.py \ | ||
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \ | ||
--remote-url gs://unstructured_public/ \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's use this test folder: gs://utic-test-ingest-fixtures There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worked. But had to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. huh, weird. yea, as long as it works locally and in CI, that works for me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. looking at CI, it doesn't look like linux is happy about the
https://github.com/Unstructured-IO/unstructured/actions/runs/5330890729/jobs/9658103678 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah, right, I see the confusion here. (should have noticed before). I think this wasn't working for you locally because you were setting GCP_INGEST_SERVICE_KEY to a filepath. GCP_INGEST_SERVICE_KEY is the actual key itself. This should work with echo on any platform. |
||
--structured-output-dir gcs-output \ | ||
--recursive \ | ||
--preserve-downloads \ | ||
--reprocess | ||
|
||
# Opt-in switch: any value other than "false" replaces the checked-in
# fixtures with the freshly generated output instead of comparing against them.
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

# From here on failures are handled explicitly rather than through errexit.
set +e

# Directory holding the expected (checked-in) structured outputs.
expected_dir=test_unstructured_ingest/expected-structured-output/google-cloud-storage

# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then

  # Copy the *contents* of gcs-output/ recursively. A plain `cp gcs-output*`
  # refuses to copy a directory, and with errexit disabled that failure would
  # go unnoticed, so the copy is recursive and checked explicitly.
  mkdir -p "$expected_dir" || exit 1
  cp -R gcs-output/. "$expected_dir"/ || exit 1

elif ! diff -ru "$expected_dir" gcs-output; then
  echo
  echo "There are differences from the previously checked-in structured outputs."
  echo
  echo "If these differences are acceptable, overwrite the fixtures by setting the env var:"
  echo
  echo "  export OVERWRITE_FIXTURES=true"
  echo
  echo "and then rerun this script."
  echo
  echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
  echo "to update fixtures for CI."
  echo
  exit 1

fi
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I pushed up a gcs test bucket with private and public paths, we should update to use that. I'll follow up with details directly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let's use gs://utic-test-ingest-fixtures
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This would require the user to have an authentication token. Is that what we really want? Or do we want a no-auth example?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh sorry, this is for the example. yes, no auth. I'll see if I can set up a public bucket for this.