diff --git a/api-reference/ingest/destination-connector/astradb.mdx b/api-reference/ingest/destination-connector/astradb.mdx index 2a9653f3..c01b68c9 100644 --- a/api-reference/ingest/destination-connector/astradb.mdx +++ b/api-reference/ingest/destination-connector/astradb.mdx @@ -8,7 +8,7 @@ import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; -Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local destination connector: +Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local source connector: import AstraDBAPISh from '/snippets/destination_connectors/astradb.sh.mdx'; import AstraDBAPIPyV2 from '/snippets/destination_connectors/astradb.v2.py.mdx'; diff --git a/api-reference/ingest/destination-connector/azure-cognitive-search.mdx b/api-reference/ingest/destination-connector/azure-cognitive-search.mdx index e62821b9..c0b03158 100644 --- a/api-reference/ingest/destination-connector/azure-cognitive-search.mdx +++ b/api-reference/ingest/destination-connector/azure-cognitive-search.mdx @@ -8,7 +8,7 @@ import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; -Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local destination connector: +Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. 
This example uses the local source connector: import AzureCSAPISh from '/snippets/destination_connectors/azure_cognitive_search.sh.mdx'; import AzureCSAPIPyV2 from '/snippets/destination_connectors/azure_cognitive_search.v2.py.mdx'; diff --git a/api-reference/ingest/source-connectors/airtable.mdx b/api-reference/ingest/source-connectors/airtable.mdx index 9ccfe366..ce851200 100644 --- a/api-reference/ingest/source-connectors/airtable.mdx +++ b/api-reference/ingest/source-connectors/airtable.mdx @@ -2,23 +2,24 @@ title: Airtable --- -import SharedContentAirtable from '/snippets/sc-shared-text/airtable.mdx'; +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentAirtable from '/snippets/sc-shared-text/airtable-cli-api.mdx'; +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; + -Finally, make sure to set the `--partition-by-api` flag and pass in your API key with `--api-key`: +Now call the Unstructured CLI or Python SDK. The destination connector can be any of the ones supported. This example uses the local destination connector: -import AirtableAPISh from '/snippets/source_connectors/airtable_api.sh.mdx'; -import AirtableAPIPy from '/snippets/source_connectors/airtable_api.py.mdx'; +import AirtableAPISh from '/snippets/source_connectors/airtable.sh.mdx'; +import AirtableAPIPyV2 from '/snippets/source_connectors/airtable.v2.py.mdx'; +import AirtableAPIPyV1 from '/snippets/source_connectors/airtable.v1.py.mdx'; - - - - - - - -Additionally, if you're using Unstructured Serverless API, your locally deployed Unstructured API, or an Unstructured API -deployed on Azure or AWS, you also need to specify the API URL via the `--partition-endpoint` argument. 
\ No newline at end of file + + + \ No newline at end of file diff --git a/api-reference/ingest/source-connectors/astradb.mdx b/api-reference/ingest/source-connectors/astradb.mdx index a3c717c0..895f29e8 100644 --- a/api-reference/ingest/source-connectors/astradb.mdx +++ b/api-reference/ingest/source-connectors/astradb.mdx @@ -12,7 +12,7 @@ import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; -Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local source connector: +Now call the Unstructured CLI or Python SDK. The destination connector can be any of the ones supported. This example uses the local destination connector: import AstraDBAPISh from '/snippets/source_connectors/astradb.sh.mdx'; import AstraDBAPIPyV1 from '/snippets/source_connectors/astradb.v1.py.mdx'; diff --git a/api-reference/ingest/source-connectors/dropbox.mdx b/api-reference/ingest/source-connectors/dropbox.mdx index 88ecebff..0211fbf3 100644 --- a/api-reference/ingest/source-connectors/dropbox.mdx +++ b/api-reference/ingest/source-connectors/dropbox.mdx @@ -12,7 +12,7 @@ import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector: +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. 
This example uses the local destination connector: import DropboxAPISh from '/snippets/source_connectors/dropbox.sh.mdx'; import DropboxAPIPyV2 from '/snippets/source_connectors/dropbox.v2.py.mdx'; diff --git a/api-reference/ingest/source-connectors/hubspot.mdx b/api-reference/ingest/source-connectors/hubspot.mdx index 946fdcbb..cf388f25 100644 --- a/api-reference/ingest/source-connectors/hubspot.mdx +++ b/api-reference/ingest/source-connectors/hubspot.mdx @@ -12,7 +12,7 @@ import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector: +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector: import HubSpotAPISh from '/snippets/source_connectors/hubspot.sh.mdx'; import HubSpotAPIPyV1 from '/snippets/source_connectors/hubspot.v1.py.mdx'; diff --git a/open-source/ingest/destination-connectors/astradb.mdx b/open-source/ingest/destination-connectors/astradb.mdx index 881083f6..9a56da3b 100644 --- a/open-source/ingest/destination-connectors/astradb.mdx +++ b/open-source/ingest/destination-connectors/astradb.mdx @@ -10,7 +10,7 @@ import SharedAstraDB from '/snippets/dc-shared-text/astradb-cli-api.mdx'; -Now call the Unstructured CLI or Python. The destination connector can be any of the ones supported. This example uses the local destination connector. +Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector. This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. 
diff --git a/open-source/ingest/destination-connectors/azure-cognitive-search.mdx b/open-source/ingest/destination-connectors/azure-cognitive-search.mdx index 1ed5f856..1c7e8cf2 100644 --- a/open-source/ingest/destination-connectors/azure-cognitive-search.mdx +++ b/open-source/ingest/destination-connectors/azure-cognitive-search.mdx @@ -10,7 +10,7 @@ import SharedAzureCS from '/snippets/dc-shared-text/azure-cognitive-search-cli-a -Now call the Unstructured CLI or Python. The destination connector can be any of the ones supported. This example uses the local destination connector. +Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector. This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. diff --git a/open-source/ingest/destination-connectors/dropbox.mdx b/open-source/ingest/destination-connectors/dropbox.mdx index c2b546fc..9145d8cf 100644 --- a/open-source/ingest/destination-connectors/dropbox.mdx +++ b/open-source/ingest/destination-connectors/dropbox.mdx @@ -10,7 +10,7 @@ import SharedDropbox from '/snippets/dc-shared-text/dropbox-cli-api.mdx'; -Now call the Unstructured Ingest CLI or Unstructured Ingest Python. The source connector can be any of the ones supported. This example uses the local source connector. +Now call the Unstructured Ingest CLI or Unstructured Ingest Python. The source connector can be any of the ones supported. This example uses the local source connector. This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. 
diff --git a/open-source/ingest/destination-connectors/mongodb.mdx b/open-source/ingest/destination-connectors/mongodb.mdx index 7e6025f3..1964bdc8 100644 --- a/open-source/ingest/destination-connectors/mongodb.mdx +++ b/open-source/ingest/destination-connectors/mongodb.mdx @@ -10,7 +10,7 @@ import SharedMongoDB from '/snippets/dc-shared-text/mongodb-cli-api.mdx'; -Now call the Unstructured Ingest CLI or Unstructured Ingest Python. The source connector can be any of the ones supported. This example uses the local source connector. +Now call the Unstructured Ingest CLI or Unstructured Ingest Python. The source connector can be any of the ones supported. This example uses the local source connector. This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. diff --git a/open-source/ingest/destination-connectors/s3.mdx b/open-source/ingest/destination-connectors/s3.mdx index f367160e..548ed21b 100644 --- a/open-source/ingest/destination-connectors/s3.mdx +++ b/open-source/ingest/destination-connectors/s3.mdx @@ -10,7 +10,7 @@ import SharedS3 from '/snippets/dc-shared-text/s3-cli-api.mdx'; -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector. +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector. This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. 
diff --git a/open-source/ingest/destination-connectors/sftp.mdx b/open-source/ingest/destination-connectors/sftp.mdx index c736cf38..91b722aa 100644 --- a/open-source/ingest/destination-connectors/sftp.mdx +++ b/open-source/ingest/destination-connectors/sftp.mdx @@ -10,7 +10,7 @@ import SharedSFTP from '/snippets/dc-shared-text/sftp-cli-api.mdx'; -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector. +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector. This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. diff --git a/open-source/ingest/destination-connectors/singlestore.mdx b/open-source/ingest/destination-connectors/singlestore.mdx index 5226ae0d..f055f597 100644 --- a/open-source/ingest/destination-connectors/singlestore.mdx +++ b/open-source/ingest/destination-connectors/singlestore.mdx @@ -10,7 +10,7 @@ import SharedSingleStore from '/snippets/dc-shared-text/singlestore-cli-api.mdx' -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector. +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector. This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. 
diff --git a/open-source/ingest/destination-connectors/weaviate.mdx b/open-source/ingest/destination-connectors/weaviate.mdx index ec5143b4..5333628b 100644 --- a/open-source/ingest/destination-connectors/weaviate.mdx +++ b/open-source/ingest/destination-connectors/weaviate.mdx @@ -10,7 +10,7 @@ import SharedWeaviate from '/snippets/dc-shared-text/weaviate-cli-api.mdx'; -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector: +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector: This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. diff --git a/open-source/ingest/source-connectors/airtable.mdx b/open-source/ingest/source-connectors/airtable.mdx index 6cd92cc2..861f632c 100644 --- a/open-source/ingest/source-connectors/airtable.mdx +++ b/open-source/ingest/source-connectors/airtable.mdx @@ -2,21 +2,28 @@ title: Airtable --- -import SharedContentAirtable from '/snippets/sc-shared-text/airtable.mdx'; +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentAirtable from '/snippets/sc-shared-text/airtable-cli-api.mdx'; +Now call the Unstructured CLI or Python. The destination connector can be any of the ones supported. This example uses the local destination connector. + +This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page. 
import AirtableSh from '/snippets/source_connectors/airtable.sh.mdx'; -import AirtablePy from '/snippets/source_connectors/airtable.py.mdx'; +import AirtablePyV2 from '/snippets/source_connectors/airtable.v2.py.mdx'; +import AirtablePyV1 from '/snippets/source_connectors/airtable.v1.py.mdx'; - - - - + + +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; -For a full list of the options that the Unstructured Ingest CLI accepts check `unstructured-ingest airtable --help`. + \ No newline at end of file diff --git a/open-source/ingest/source-connectors/astradb.mdx b/open-source/ingest/source-connectors/astradb.mdx index 89d973ad..4e234123 100644 --- a/open-source/ingest/source-connectors/astradb.mdx +++ b/open-source/ingest/source-connectors/astradb.mdx @@ -10,7 +10,7 @@ import SharedContentAstraDB from '/snippets/sc-shared-text/astradb-cli-api.mdx'; -Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector. +Now call the Unstructured CLI or Python. The destination connector can be any of the ones supported. This example uses the local destination connector. This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page. diff --git a/open-source/ingest/source-connectors/azure.mdx b/open-source/ingest/source-connectors/azure.mdx index 075b288f..d1c8f4cd 100644 --- a/open-source/ingest/source-connectors/azure.mdx +++ b/open-source/ingest/source-connectors/azure.mdx @@ -10,7 +10,7 @@ import SharedContentAzure from '/snippets/sc-shared-text/azure-cli-api.mdx'; -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector. +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. 
The destination connector can be any of the ones supported. This example uses the local destination connector. This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page. diff --git a/open-source/ingest/source-connectors/dropbox.mdx b/open-source/ingest/source-connectors/dropbox.mdx index a099aca7..c125c575 100644 --- a/open-source/ingest/source-connectors/dropbox.mdx +++ b/open-source/ingest/source-connectors/dropbox.mdx @@ -10,7 +10,7 @@ import SharedContentDropbox from '/snippets/sc-shared-text/dropbox-cli-api.mdx'; -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector. +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector. This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page. diff --git a/open-source/ingest/source-connectors/mongodb.mdx b/open-source/ingest/source-connectors/mongodb.mdx index c23f2136..2bd5dffe 100644 --- a/open-source/ingest/source-connectors/mongodb.mdx +++ b/open-source/ingest/source-connectors/mongodb.mdx @@ -10,7 +10,7 @@ import SharedContentMongoDB from '/snippets/sc-shared-text/mongodb-cli-api.mdx'; -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector: +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector: This example sends data to Unstructured API services for processing by default. 
To process data locally instead, see the instructions at the end of this page. diff --git a/open-source/ingest/source-connectors/one-drive.mdx b/open-source/ingest/source-connectors/one-drive.mdx index 9e01e05a..0304fb0e 100644 --- a/open-source/ingest/source-connectors/one-drive.mdx +++ b/open-source/ingest/source-connectors/one-drive.mdx @@ -10,7 +10,7 @@ import SharedContentOneDrive from '/snippets/sc-shared-text/onedrive-cli-api.mdx -Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector: +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector: This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page. diff --git a/snippets/general-shared-text/airtable-cli-api.mdx b/snippets/general-shared-text/airtable-cli-api.mdx new file mode 100644 index 00000000..971a0804 --- /dev/null +++ b/snippets/general-shared-text/airtable-cli-api.mdx @@ -0,0 +1,14 @@ +The Airtable connector dependencies: + +```bash CLI, Python +pip install "unstructured-ingest[airtable]" +``` + +import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx'; + + + +These environment variables: + +- `AIRTABLE_TOKEN` - The Airtable personal access token, represented by `--personal-access-token` (CLI) or `personal_access_token` (Python). +- `AIRTABLE_PATHS` - The list of Airtable paths to process, represented by `--list-of-paths` (CLI) or `list_of_paths` (Python). 
\ No newline at end of file diff --git a/snippets/general-shared-text/airtable.mdx b/snippets/general-shared-text/airtable.mdx new file mode 100644 index 00000000..f4f38ac4 --- /dev/null +++ b/snippets/general-shared-text/airtable.mdx @@ -0,0 +1,27 @@ +The Airtable connector prerequisites: + +- An [Airtable](https://www.airtable.com/) account. [Create a free Airtable account](https://airtable.com/signup). +- An Airtable personal access token. [Create a personal access token](https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens). +- The ID of the Airtable base to access. [Create a base](https://www.airtable.com/guides/build/create-a-base). [Get a base's ID](https://support.airtable.com/docs/finding-airtable-ids#finding-base-url-ids). +- The ID of the table to access in the base. [Create a table](https://www.airtable.com/guides/build/create-a-table). [Get a table's ID](https://support.airtable.com/docs/finding-airtable-ids#finding-base-url-ids). +- The ID of the view to access in the table. [Create a view](https://www.airtable.com/guides/build/create-custom-views-of-data). [Get a view's ID](https://support.airtable.com/docs/finding-airtable-ids#finding-base-url-ids). + +By default, Unstructured processes all tables from all bases within an Airtable organization. You can limit the +tables that Unstructured ingests data from within Airtable by specifying a list of Airtable paths. +An Airtable path uses the following structure: `base_id/table_id(optional)/view_id(optional)` + +For example, given the following Airtable URL: + +```text +https://airtable.com/appr9nKeXLAtg6bgn/tblZ8uT1GY7NLbWit/viwDcpzf9dP0Gqz5J +``` + +- The base's ID is `appr9nKeXLAtg6bgn`. The base's path is `appr9nKeXLAtg6bgn`. +- The table's ID is `tblZ8uT1GY7NLbWit`. The table's path is `appr9nKeXLAtg6bgn/tblZ8uT1GY7NLbWit`. +- The view's ID is `viwDcpzf9dP0Gqz5J`. The view's path is `appr9nKeXLAtg6bgn/tblZ8uT1GY7NLbWit/viwDcpzf9dP0Gqz5J`. 
+ +You can call the Airtable API to get lists of available IDs for Airtable bases, tables, and views in bulk, as follows: + +- [Base IDs](https://airtable.com/developers/web/api/list-bases) +- [Table and view IDs](https://airtable.com/developers/web/api/get-base-schema) +- [Base, table, and view IDs](https://pyairtable.readthedocs.io/en/latest/metadata.html) diff --git a/snippets/sc-shared-text/airtable-cli-api.mdx b/snippets/sc-shared-text/airtable-cli-api.mdx new file mode 100644 index 00000000..73ce1d28 --- /dev/null +++ b/snippets/sc-shared-text/airtable-cli-api.mdx @@ -0,0 +1,9 @@ +Connect Airtable to your preprocessing pipeline, and use the Unstructured Ingest CLI or the Unstructured Ingest Python library to batch process all your data and store structured outputs locally on your filesystem. + +You will need: + +import AirtableShared from '/snippets/general-shared-text/airtable.mdx'; +import AirtableSharedCLIAPI from '/snippets/general-shared-text/airtable-cli-api.mdx'; + + + \ No newline at end of file diff --git a/snippets/sc-shared-text/airtable.mdx b/snippets/sc-shared-text/airtable.mdx deleted file mode 100644 index 5ca3cebb..00000000 --- a/snippets/sc-shared-text/airtable.mdx +++ /dev/null @@ -1,22 +0,0 @@ - -Connect Airtable to your preprocessing pipeline, and batch process all your documents using `unstructured-ingest` to -store structured outputs locally on your filesystem. - -Make sure to have the Airtable dependencies installed: - - ```bash Shell - pip install "unstructured-ingest[airtable]" - ``` - -Before connecting your preprocessing pipeline to Airtable, obtain a personal access token to authenticate into Airtable. -Check [Airtable documentation](https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens) for more info. - -Unless otherwise specified, Unstructured will process all tables within each and every base within an Airtable org. 
-Optionally, you can choose to specify the locations to ingest data from within Airtable using the `--list-of-paths` argument -(`list_of_paths` in Python example). -An Airtable path has the following structure: `base_id/table_id(optional)/view_id(optional)/` - -Refer to Airtable documentation to learn how you can obtain ids in bulk: -* [base ids](https://airtable.com/developers/web/api/list-bases) -* [table and view ids](https://airtable.com/developers/web/api/get-base-schema) -* [base, table and view ids](https://pyairtable.readthedocs.io/en/latest/metadata.html) diff --git a/snippets/source_connectors/airtable.py.mdx b/snippets/source_connectors/airtable.py.mdx deleted file mode 100644 index 5688c007..00000000 --- a/snippets/source_connectors/airtable.py.mdx +++ /dev/null @@ -1,30 +0,0 @@ -```python Python -import os - -from unstructured_ingest.connector.airtable import AirtableAccessConfig, SimpleAirtableConfig -from unstructured_ingest.interfaces import ( - PartitionConfig, - ProcessorConfig, - ReadConfig, -) -from unstructured_ingest.runner import AirtableRunner - -if __name__ == "__main__": - runner = AirtableRunner( - processor_config=ProcessorConfig( - verbose=True, - output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR"), - num_processes=2, - ), - read_config=ReadConfig(), - partition_config=PartitionConfig( - strategy="hi_res", - ), - connector_config=SimpleAirtableConfig( - access_config=AirtableAccessConfig( - personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN") - ), - ), - ) - runner.run() -``` diff --git a/snippets/source_connectors/airtable.sh.mdx b/snippets/source_connectors/airtable.sh.mdx index 81c0d71f..9a33a1a0 100644 --- a/snippets/source_connectors/airtable.sh.mdx +++ b/snippets/source_connectors/airtable.sh.mdx @@ -1,12 +1,13 @@ -```bash Shell +```bash CLI #!/usr/bin/env bash unstructured-ingest \ airtable \ - --metadata-exclude filename,file_directory,metadata.data_source.date_processed \ --personal-access-token 
$AIRTABLE_PERSONAL_ACCESS_TOKEN \ --output-dir $LOCAL_FILE_OUTPUT_DIR \ --num-processes 2 \ --reprocess \ - --strategy hi_res + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL ``` diff --git a/snippets/source_connectors/airtable_api.py.mdx b/snippets/source_connectors/airtable.v1.py.mdx similarity index 86% rename from snippets/source_connectors/airtable_api.py.mdx rename to snippets/source_connectors/airtable.v1.py.mdx index decac453..7d89fd94 100644 --- a/snippets/source_connectors/airtable_api.py.mdx +++ b/snippets/source_connectors/airtable.v1.py.mdx @@ -1,4 +1,4 @@ -```python Python +```python Python Ingest v1 import os from unstructured_ingest.connector.airtable import AirtableAccessConfig, SimpleAirtableConfig @@ -19,6 +19,7 @@ if __name__ == "__main__": read_config=ReadConfig(), partition_config=PartitionConfig( partition_by_api=True, + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), api_key=os.getenv("UNSTRUCTURED_API_KEY"), strategy="hi_res", ), @@ -26,6 +27,7 @@ if __name__ == "__main__": access_config=AirtableAccessConfig( personal_access_token=os.getenv("AIRTABLE_PERSONAL_ACCESS_TOKEN") ), + list_of_paths=[os.getenv("AIRTABLE_PATHS")] ), ) runner.run() diff --git a/snippets/source_connectors/airtable.v2.py.mdx b/snippets/source_connectors/airtable.v2.py.mdx new file mode 100644 index 00000000..fdc397d8 --- /dev/null +++ b/snippets/source_connectors/airtable.v2.py.mdx @@ -0,0 +1,43 @@ +```python Python Ingest v2 +import os + +from unstructured_ingest.v2.pipeline.pipeline import Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig + +from unstructured_ingest.v2.processes.connectors.airtable import ( + AirtableIndexerConfig, + AirtableDownloaderConfig, + AirtableConnectionConfig, + AirtableAccessConfig +) + +from unstructured_ingest.v2.processes.connectors.local import LocalUploaderConfig +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from 
unstructured_ingest.v2.processes.chunker import ChunkerConfig + +# Chunking is optional. + +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=AirtableIndexerConfig(list_of_paths=[os.getenv("AIRTABLE_PATHS")]), + downloader_config=AirtableDownloaderConfig(download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR")), + source_connection_config=AirtableConnectionConfig( + access_config=AirtableAccessConfig( + personal_access_token=os.getenv("AIRTABLE_TOKEN") + ) + ), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() +``` \ No newline at end of file diff --git a/snippets/source_connectors/airtable_api.sh.mdx b/snippets/source_connectors/airtable_api.sh.mdx deleted file mode 100644 index c0e03daa..00000000 --- a/snippets/source_connectors/airtable_api.sh.mdx +++ /dev/null @@ -1,14 +0,0 @@ -```bash Shell -#!/usr/bin/env bash - -unstructured-ingest \ - airtable \ - --metadata-exclude filename,file_directory,metadata.data_source.date_processed \ - --personal-access-token $AIRTABLE_PERSONAL_ACCESS_TOKEN \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --num-processes 2 \ - --reprocess \ - --partition-by-api \ - --api-key $UNSTRUCTURED_API_KEY \ - --strategy hi_res -```