From d0030cda7f8c720251f5e25fcf4cc4e7e01b1f6a Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Mon, 14 Apr 2025 15:56:25 -0700 Subject: [PATCH] Ingest: Discord source connector --- docs.json | 1 + ingestion/source-connectors/discord.mdx | 27 ++++++++++ .../general-shared-text/discord-cli-api.mdx | 13 +++++ snippets/general-shared-text/discord.mdx | 2 + snippets/sc-shared-text/discord-cli-api.mdx | 9 ++++ snippets/source_connectors/discord.sh.mdx | 12 +++++ snippets/source_connectors/discord.v2.py.mdx | 49 +++++++++++++++++++ 7 files changed, 113 insertions(+) create mode 100644 ingestion/source-connectors/discord.mdx create mode 100644 snippets/general-shared-text/discord-cli-api.mdx create mode 100644 snippets/general-shared-text/discord.mdx create mode 100644 snippets/sc-shared-text/discord-cli-api.mdx create mode 100644 snippets/source_connectors/discord.sh.mdx create mode 100644 snippets/source_connectors/discord.v2.py.mdx diff --git a/docs.json b/docs.json index e8a04195..07947983 100644 --- a/docs.json +++ b/docs.json @@ -299,6 +299,7 @@ "ingestion/source-connectors/couchbase", "ingestion/source-connectors/databricks-volumes", "ingestion/source-connectors/delta-table", + "ingestion/source-connectors/discord", "ingestion/source-connectors/dropbox", "ingestion/source-connectors/elastic-search", "ingestion/source-connectors/github", diff --git a/ingestion/source-connectors/discord.mdx b/ingestion/source-connectors/discord.mdx new file mode 100644 index 00000000..bfecb7a2 --- /dev/null +++ b/ingestion/source-connectors/discord.mdx @@ -0,0 +1,27 @@ +--- +title: Discord +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + +<NewDocument /> + +import SharedContentDiscord from '/snippets/sc-shared-text/discord-cli-api.mdx'; + +<SharedContentDiscord /> + +Now call the Unstructured CLI or Python. The destination connector can be any of the ones supported. This example uses the local destination connector. + +This example sends data to Unstructured for processing by default. 
To process data locally instead, see the instructions at the end of this page. + +import DiscordSh from '/snippets/source_connectors/discord.sh.mdx'; +import DiscordPyV2 from '/snippets/source_connectors/discord.v2.py.mdx'; + +<CodeGroup> + <DiscordSh /> + <DiscordPyV2 /> +</CodeGroup> + +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; + +<SharedPartitionByAPIOSS /> diff --git a/snippets/general-shared-text/discord-cli-api.mdx b/snippets/general-shared-text/discord-cli-api.mdx new file mode 100644 index 00000000..eea10946 --- /dev/null +++ b/snippets/general-shared-text/discord-cli-api.mdx @@ -0,0 +1,13 @@ +The Discord connector dependencies: + +```bash CLI, Python +pip install "unstructured-ingest[discord]" +``` + +import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx'; + +<AdditionalIngestDependencies /> + +The following environment variables: + +- `DISCORD_ACCESS_TOKEN` - The Discord access token, represented by `--token` (CLI) or `token` (Python). \ No newline at end of file diff --git a/snippets/general-shared-text/discord.mdx b/snippets/general-shared-text/discord.mdx new file mode 100644 index 00000000..1e0ac01b --- /dev/null +++ b/snippets/general-shared-text/discord.mdx @@ -0,0 +1,2 @@ +- A Discord [access token](https://discord.com/developers/docs/topics/oauth2). +- The target Discord [channel IDs](https://support.discord.com/hc/articles/206346498-Where-can-I-find-my-User-Server-Message-ID#h_01HRSTXPS5FMK2A5SMVSX4JW4E). \ No newline at end of file diff --git a/snippets/sc-shared-text/discord-cli-api.mdx b/snippets/sc-shared-text/discord-cli-api.mdx new file mode 100644 index 00000000..1b7138b7 --- /dev/null +++ b/snippets/sc-shared-text/discord-cli-api.mdx @@ -0,0 +1,9 @@ +Connect Discord to your preprocessing pipeline, and use the Unstructured CLI or Python to batch process all your documents and store structured outputs locally on your filesystem. + +The requirements are as follows. 
+ +import SharedDiscord from '/snippets/general-shared-text/discord.mdx'; +import SharedDiscordCLIAPI from '/snippets/general-shared-text/discord-cli-api.mdx'; + +<SharedDiscord /> +<SharedDiscordCLIAPI /> diff --git a/snippets/source_connectors/discord.sh.mdx b/snippets/source_connectors/discord.sh.mdx new file mode 100644 index 00000000..d5152321 --- /dev/null +++ b/snippets/source_connectors/discord.sh.mdx @@ -0,0 +1,12 @@ +```bash CLI +#!/usr/bin/env bash + +unstructured-ingest \ + discord \ + --token $DISCORD_ACCESS_TOKEN \ + --channels 475182341782896651,847950191108554762 \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL +``` \ No newline at end of file diff --git a/snippets/source_connectors/discord.v2.py.mdx b/snippets/source_connectors/discord.v2.py.mdx new file mode 100644 index 00000000..9e308699 --- /dev/null +++ b/snippets/source_connectors/discord.v2.py.mdx @@ -0,0 +1,49 @@ +```python Python Ingest +import os + +from unstructured_ingest.pipeline.pipeline import Pipeline +from unstructured_ingest.interfaces import ProcessorConfig + +from unstructured_ingest.processes.connectors.discord import ( + DiscordIndexerConfig, + DiscordDownloaderConfig, + DiscordConnectionConfig, + DiscordAccessConfig +) + +from unstructured_ingest.processes.partitioner import PartitionerConfig +from unstructured_ingest.processes.chunker import ChunkerConfig +from unstructured_ingest.processes.embedder import EmbedderConfig + +from unstructured_ingest.processes.connectors.local import LocalUploaderConfig + +# Chunking and embedding are optional. 
+ +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=DiscordIndexerConfig( + channels=[ + "475182341782896651", + "847950191108554762" + ] + ), + downloader_config=DiscordDownloaderConfig(download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR")), + source_connection_config=DiscordConnectionConfig( + access_config=DiscordAccessConfig(token=os.getenv("DISCORD_ACCESS_TOKEN")) + ), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() +``` \ No newline at end of file