From 5ec98bd323702fbbf376825b7484b4ec481c6c76 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 24 Oct 2024 09:25:01 -0700 Subject: [PATCH] Slack v2 source connector --- .../ingest/source-connectors/slack.mdx | 26 +++++----- .../ingest/source-connectors/slack.mdx | 23 ++++++--- .../general-shared-text/slack-cli-api.mdx | 25 ++++++++++ snippets/general-shared-text/slack.mdx | 15 ++++++ snippets/sc-shared-text/slack-cli-api.mdx | 9 ++++ snippets/sc-shared-text/slack.mdx | 17 ------- snippets/source_connectors/slack.sh.mdx | 18 ++++--- .../{slack.py.mdx => slack.v1.py.mdx} | 11 +++-- snippets/source_connectors/slack.v2.py.mdx | 48 +++++++++++++++++++ snippets/source_connectors/slack_api.py.mdx | 31 ------------ snippets/source_connectors/slack_api.sh.mdx | 15 ------ 11 files changed, 145 insertions(+), 93 deletions(-) create mode 100644 snippets/general-shared-text/slack-cli-api.mdx create mode 100644 snippets/general-shared-text/slack.mdx create mode 100644 snippets/sc-shared-text/slack-cli-api.mdx delete mode 100644 snippets/sc-shared-text/slack.mdx rename snippets/source_connectors/{slack.py.mdx => slack.v1.py.mdx} (70%) create mode 100644 snippets/source_connectors/slack.v2.py.mdx delete mode 100644 snippets/source_connectors/slack_api.py.mdx delete mode 100644 snippets/source_connectors/slack_api.sh.mdx diff --git a/api-reference/ingest/source-connectors/slack.mdx b/api-reference/ingest/source-connectors/slack.mdx index 4edba749..a0361dd5 100644 --- a/api-reference/ingest/source-connectors/slack.mdx +++ b/api-reference/ingest/source-connectors/slack.mdx @@ -2,22 +2,24 @@ title: Slack --- -import SharedContentSlack from '/snippets/sc-shared-text/slack.mdx'; +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentSlack from '/snippets/sc-shared-text/slack-cli-api.mdx'; +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; + -Make sure to set the `--partition-by-api` flag and pass in your API 
key with `--api-key`: +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector: -import SlackAPISh from '/snippets/source_connectors/slack_api.sh.mdx'; -import SlackAPIPy from '/snippets/source_connectors/slack_api.py.mdx'; +import SlackAPISh from '/snippets/source_connectors/slack.sh.mdx'; +import SlackAPIPyV2 from '/snippets/source_connectors/slack.v2.py.mdx'; +import SlackAPIPyV1 from '/snippets/source_connectors/slack.v1.py.mdx'; - - - - - - -Additionally, if you're using Unstructured Serverless API, your locally deployed Unstructured API, or an Unstructured API -deployed on Azure or AWS, you also need to specify the API URL via the `--partition-endpoint` argument. \ No newline at end of file + + + \ No newline at end of file diff --git a/open-source/ingest/source-connectors/slack.mdx b/open-source/ingest/source-connectors/slack.mdx index 48d786d6..529340fe 100644 --- a/open-source/ingest/source-connectors/slack.mdx +++ b/open-source/ingest/source-connectors/slack.mdx @@ -2,19 +2,28 @@ title: Slack --- -import SharedContentSlack from '/snippets/sc-shared-text/slack.mdx'; +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentSlack from '/snippets/sc-shared-text/slack-cli-api.mdx'; +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector. + +This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page. 
+ import SlackSh from '/snippets/source_connectors/slack.sh.mdx'; -import SlackPy from '/snippets/source_connectors/slack.py.mdx'; +import SlackPyV2 from '/snippets/source_connectors/slack.v2.py.mdx'; +import SlackPyV1 from '/snippets/source_connectors/slack.v1.py.mdx'; - - - - + + -For a full list of the options the Unstructured Ingest CLI accepts check `unstructured-ingest slack --help`. \ No newline at end of file +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; + + \ No newline at end of file diff --git a/snippets/general-shared-text/slack-cli-api.mdx b/snippets/general-shared-text/slack-cli-api.mdx new file mode 100644 index 00000000..e7a2df44 --- /dev/null +++ b/snippets/general-shared-text/slack-cli-api.mdx @@ -0,0 +1,25 @@ +The Slack connector dependencies: + +```bash +pip install "unstructured-ingest[slack]" +``` + +import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx'; + + + +These environment variables: + +- `SLACK_BOT_USER_OAUTH_TOKEN` - The OAuth token for the Slack app, represented by `--token` (CLI) or `token` (Python). + +To specify the starting and ending date and time range for the channels to be processed: + +- For the CLI, use one of the following supported formats: + + - `YYYY-MM-DD` + - `YYYY-MM-DDTHH:MM:SS` + - `YYYY-MM-DDTHH:MM:SSZ` + - `YYYY-MM-DD+HH:MM:SS` + - `YYYY-MM-DD-HH:MM:SS` + +- For Python, pass a `datetime.datetime` object (for example, `datetime(year=2024, month=10, day=22)`). diff --git a/snippets/general-shared-text/slack.mdx b/snippets/general-shared-text/slack.mdx new file mode 100644 index 00000000..c3a14f15 --- /dev/null +++ b/snippets/general-shared-text/slack.mdx @@ -0,0 +1,15 @@ +The Slack prerequisites: + +- A Slack app. Create a Slack app by following [Step 1: Creating an app](https://api.slack.com/quickstart#creating). +- The app must have the `channels:history` OAuth scope. 
Give the app this scope by following [Step 2: Requesting scopes](https://api.slack.com/quickstart#scopes). +- The app must be installed and authorized for the target Slack workspace. Install and authorize the app by following [Step 3: Installing and authorizing the app](https://api.slack.com/quickstart#installing). +- The app's access token. Get this token by following [Step 3: Installing and authorizing the app](https://api.slack.com/quickstart#installing). +- Add the app to the target channels in the Slack workspace. To do this from the channel, open the channel's details page, click the **Integrations** tab, click **Add apps**, and follow the on-screen directions to install the app. +- The channel ID for each target channel. To get this ID, open the channel's details page, and look for the **Channel ID** field on the **About** tab. +- The starting and ending date and time range for the channels to be processed. Supported formats include: + + - `YYYY-MM-DD` + - `YYYY-MM-DDTHH:MM:SS` + - `YYYY-MM-DDTHH:MM:SSZ` + - `YYYY-MM-DD+HH:MM:SS` + - `YYYY-MM-DD-HH:MM:SS` \ No newline at end of file diff --git a/snippets/sc-shared-text/slack-cli-api.mdx b/snippets/sc-shared-text/slack-cli-api.mdx new file mode 100644 index 00000000..046f9e26 --- /dev/null +++ b/snippets/sc-shared-text/slack-cli-api.mdx @@ -0,0 +1,9 @@ +Connect Slack to your preprocessing pipeline, and use the Unstructured Ingest CLI or the Unstructured Ingest Python library to batch process all your documents and store structured outputs locally on your filesystem. 
+ +You will need: + +import SharedSlack from '/snippets/general-shared-text/slack.mdx'; +import SharedSlackCLIAPI from '/snippets/general-shared-text/slack-cli-api.mdx'; + + + \ No newline at end of file diff --git a/snippets/sc-shared-text/slack.mdx b/snippets/sc-shared-text/slack.mdx deleted file mode 100644 index adb47454..00000000 --- a/snippets/sc-shared-text/slack.mdx +++ /dev/null @@ -1,17 +0,0 @@ -Connect Slack to your preprocessing pipeline, and batch process all your documents using `unstructured-ingest` to store structured outputs locally on your filesystem. - -First, install the Slack dependencies as shown here. - -```bash -pip install "unstructured-ingest[slack]" -``` - -To authenticate the Slack source connector provide the following: -* `token`: Bot token used to access Slack API, must have `channels:history` scope for the bot user. - -Provide a comma-delimited list of Slack channel IDs to pull messages from, these can be a public or private channels. -Your bot user must be in the channels for them to be ingested. - -Optionally, you can specify: -* `start-date`: Start date/time in formats `YYYY-MM-DD`, `YYYY-MM-DDTHH:MM:SS`, `YYYY-MM-DD+HH:MM:SS`, or `YYYY-MM-DDTHH:MM:SStz` -* `end-date`: End date/time in formats `YYYY-MM-DD`, `YYYY-MM-DDTHH:MM:SS`, `YYYY-MM-DD+HH:MM:SS`, or `YYYY-MM-DDTHH:MM:SStz` diff --git a/snippets/source_connectors/slack.sh.mdx b/snippets/source_connectors/slack.sh.mdx index ee534ffc..48cefb74 100644 --- a/snippets/source_connectors/slack.sh.mdx +++ b/snippets/source_connectors/slack.sh.mdx @@ -1,13 +1,19 @@ -```bash Shell +```bash CLI #!/usr/bin/env bash +# Chunking and embedding are optional. 
+ unstructured-ingest \ slack \ - --channels 12345678 \ - --token 12345678 \ + --token $SLACK_BOT_USER_OAUTH_TOKEN \ + --channels C03FVNHR70A,C03FVNRG43D \ + --start-date 2024-10-22 \ + --end-date 2024-10-23 \ --download-dir $LOCAL_FILE_DOWNLOAD_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --start-date 2023-04-01T01:00:00-08:00 \ - --end-date 2023-04-02 \ - --strategy hi_res + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL ``` diff --git a/snippets/source_connectors/slack.py.mdx b/snippets/source_connectors/slack.v1.py.mdx similarity index 70% rename from snippets/source_connectors/slack.py.mdx rename to snippets/source_connectors/slack.v1.py.mdx index 9fec0c40..d23cda87 100644 --- a/snippets/source_connectors/slack.py.mdx +++ b/snippets/source_connectors/slack.v1.py.mdx @@ -1,5 +1,6 @@ -```python Python +```python Python Ingest v1 import os +from datetime import datetime from unstructured_ingest.connector.slack import SimpleSlackConfig, SlackAccessConfig from unstructured_ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig @@ -18,11 +19,11 @@ if __name__ == "__main__": ), connector_config=SimpleSlackConfig( access_config=SlackAccessConfig( - token=os.getenv("SLACK_TOKEN"), + token=os.getenv("SLACK_BOT_USER_OAUTH_TOKEN"), ), - channels=["12345678"], - start_date="2023-04-01T01:00:00-08:00", - end_date="2023-04-02,", + channels=["C03FVNHR70A", "C03FVNRG43D"], + start_date=datetime(year=2024, month=10, day=22), + end_date=datetime(year=2024, month=10, day=23) ), ) runner.run() diff --git a/snippets/source_connectors/slack.v2.py.mdx b/snippets/source_connectors/slack.v2.py.mdx new file mode 100644 index 00000000..5ec86e46 --- /dev/null +++ b/snippets/source_connectors/slack.v2.py.mdx @@ -0,0 +1,48 @@ +```python Python Ingest v2 +import os +from datetime import datetime + +from unstructured_ingest.v2.pipeline.pipeline import 
Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig + +from unstructured_ingest.v2.processes.connectors.slack import ( + SlackIndexerConfig, + SlackDownloaderConfig, + SlackConnectionConfig, + SlackAccessConfig +) + +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from unstructured_ingest.v2.processes.chunker import ChunkerConfig +from unstructured_ingest.v2.processes.embedder import EmbedderConfig +from unstructured_ingest.v2.processes.connectors.local import LocalUploaderConfig + +# Chunking and embedding are optional. + +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=SlackIndexerConfig( + channels=["C03FVNHR70A", "C03FVNRG43D"], + start_date=datetime(year=2024, month=10, day=22), + end_date=datetime(year=2024, month=10, day=23) + ), + downloader_config=SlackDownloaderConfig(download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR")), + source_connection_config=SlackConnectionConfig( + access_config=SlackAccessConfig(token=os.getenv("SLACK_BOT_USER_OAUTH_TOKEN")) + ), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() +``` \ No newline at end of file diff --git a/snippets/source_connectors/slack_api.py.mdx b/snippets/source_connectors/slack_api.py.mdx deleted file mode 100644 index 776a766e..00000000 --- a/snippets/source_connectors/slack_api.py.mdx +++ /dev/null @@ -1,31 +0,0 @@ -```python Python -import os - -from unstructured_ingest.connector.slack import SimpleSlackConfig, SlackAccessConfig -from 
unstructured_ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig -from unstructured_ingest.runner import SlackRunner - -if __name__ == "__main__": - runner = SlackRunner( - processor_config=ProcessorConfig( - verbose=True, - output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR"), - num_processes=2, - ), - read_config=ReadConfig(), - partition_config=PartitionConfig( - partition_by_api=True, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - strategy="hi_res", - ), - connector_config=SimpleSlackConfig( - access_config=SlackAccessConfig( - token=os.getenv("SLACK_TOKEN"), - ), - channels=["12345678"], - start_date="2023-04-01T01:00:00-08:00", - end_date="2023-04-02,", - ), - ) - runner.run() -``` diff --git a/snippets/source_connectors/slack_api.sh.mdx b/snippets/source_connectors/slack_api.sh.mdx deleted file mode 100644 index eb6bb8ed..00000000 --- a/snippets/source_connectors/slack_api.sh.mdx +++ /dev/null @@ -1,15 +0,0 @@ -```bash Shell -#!/usr/bin/env bash - -unstructured-ingest \ - slack \ - --channels 12345678 \ - --token 12345678 \ - --download-dir $LOCAL_FILE_DOWNLOAD_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --start-date 2023-04-01T01:00:00-08:00 \ - --end-date 2023-04-02 \ - --partition-by-api \ - --api-key $UNSTRUCTURED_API_KEY \ - --strategy hi_res -```