Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion snippets/destination_connectors/astradb.sh.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@ unstructured-ingest \
astradb \
--api-endpoint $ASTRA_DB_API_ENDPOINT \
--token $ASTRA_DB_APPLICATION_TOKEN \
--collection-name $ASTRA_DB_COLLECTION \
--keyspace $ASTRA_DB_KEYSPACE \
--collection-name $ASTRA_DB_COLLECTION
--flatten-metadata

12 changes: 8 additions & 4 deletions snippets/destination_connectors/astradb.v2.py.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@ from unstructured_ingest.v2.processes.embedder import EmbedderConfig

if __name__ == "__main__":
Pipeline.from_configs(
context=ProcessorConfig(),
context=ProcessorConfig(
),
indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
downloader_config=LocalDownloaderConfig(),
source_connection_config=LocalConnectionConfig(),
partitioner_config=PartitionerConfig(
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
strategy="hi_res",
additional_partition_args={
"split_pdf_page": True,
"split_pdf_allow_failed": True,
Expand All @@ -46,10 +46,14 @@ if __name__ == "__main__":
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN")
)
),
stager_config=AstraDBUploadStagerConfig(),
stager_config=AstraDBUploadStagerConfig(
flatten_metadata=True
),
uploader_config=AstraDBUploaderConfig(
collection_name=os.getenv("ASTRA_DB_COLLECTION"),
keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
collection_name=os.getenv("ASTRA_DB_COLLECTION")
batch_size=20,
record_id_key="record_id"
)
).run()
```
10 changes: 9 additions & 1 deletion snippets/general-shared-text/astradb-cli-api.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,12 @@ These environment variables:
- `ASTRA_DB_API_ENDPOINT` - The API endpoint for the Astra DB database, represented by `--api-endpoint` (CLI) or `api_endpoint` (Python). To get the endpoint, see the **Database Details > API Endpoint** value on your database's **Overview** tab.
- `ASTRA_DB_APPLICATION_TOKEN` - The database application token value for the database, represented by `--token` (CLI) or `token` (Python). To get the token, see the **Database Details > Application Tokens** box on your database's **Overview** tab.
- `ASTRA_DB_KEYSPACE` - The name of the keyspace for the database, represented by `--keyspace` (CLI) or `keyspace` (Python).
- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python).
- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python).

Additional settings include:

- For the source connector only, `--fields` (CLI) or `fields` (Python): Optionally, a comma-separated list (CLI) or an array of strings (Python) of fields
to include in the output. The default is to include all fields, if not otherwise specified.
- For the destination connector only, `--flatten-metadata` (CLI) or `flatten_metadata=True` (Python): Optionally, whether to "flatten" the metadata. Specifically, the metadata key values are
brought to the top level of the element, and the `metadata` key itself is removed. To not flatten the metadata (the default), specify `--no-flatten-metadata` (CLI) or
`flatten_metadata=False` (Python). The default is to not flatten the metadata if not otherwise specified.
4 changes: 2 additions & 2 deletions snippets/source_connectors/astradb.sh.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ unstructured-ingest \
astradb \
--api-endpoint $ASTRA_DB_API_ENDPOINT \
--token $ASTRA_DB_APPLICATION_TOKEN \
--keyspace $ASTRA_DB_KEYSPACE \
--collection-name $ASTRA_DB_COLLECTION \
--keyspace $ASTRA_DB_KEYSPACE \
--fields record_id,content \
--download-dir $LOCAL_FILE_DOWNLOAD_DIR \
--partition-by-api \
--api-key $UNSTRUCTURED_API_KEY \
--partition-endpoint $UNSTRUCTURED_API_URL \
--strategy hi_res \
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}"
```
34 changes: 23 additions & 11 deletions snippets/source_connectors/astradb.v2.py.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@ import os

from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig

from unstructured_ingest.v2.processes.connectors.astradb import (
AstraDBAccessConfig,
AstraDBConnectionConfig,
AstraDBDownloaderConfig,
AstraDBIndexerConfig,
AstraDBDownloaderConfig,
AstraDBConnectionConfig,
AstraDBAccessConfig
)

from unstructured_ingest.v2.processes.connectors.local import LocalConnectionConfig
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.connectors.local import LocalUploaderConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

# Chunking and embedding are optional.

Expand All @@ -19,23 +23,31 @@ if __name__ == "__main__":
context=ProcessorConfig(),
indexer_config=AstraDBIndexerConfig(
collection_name=os.getenv("ASTRA_DB_COLLECTION"),
keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
batch_size=20
),
downloader_config=AstraDBDownloaderConfig(
collection_name=os.getenv("ASTRA_DB_COLLECTION"),
keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR"),
fields=["record_id", "content"]
),
source_connection_config=AstraDBConnectionConfig(
access_config=AstraDBAccessConfig(
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
),
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN")
)
),
partitioner_config=PartitionerConfig(
partition_by_api=True,
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
additional_partition_args={
"split_pdf_page": True,
"split_pdf_allow_failed": True,
"split_pdf_concurrency_level": 15
}
),
uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")),
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
destination_connection_config=LocalConnectionConfig()
).run()
```