From 008d96b3502322c9c9c215c353fcb164f2e2cb54 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 24 Oct 2024 10:35:49 -0700 Subject: [PATCH] SQLite v2 source connector --- .../ingest/source-connectors/sqlite.mdx | 23 ++++++++ mint.json | 2 + .../ingest/source-connectors/sqlite.mdx | 27 ++++++++++ snippets/sc-shared-text/sqlite-cli-api.mdx | 9 ++++ snippets/source_connectors/sqlite.sh.mdx | 18 +++++++ snippets/source_connectors/sqlite.v2.py.mdx | 52 +++++++++++++++++++ 6 files changed, 131 insertions(+) create mode 100644 api-reference/ingest/source-connectors/sqlite.mdx create mode 100644 open-source/ingest/source-connectors/sqlite.mdx create mode 100644 snippets/sc-shared-text/sqlite-cli-api.mdx create mode 100644 snippets/source_connectors/sqlite.sh.mdx create mode 100644 snippets/source_connectors/sqlite.v2.py.mdx diff --git a/api-reference/ingest/source-connectors/sqlite.mdx b/api-reference/ingest/source-connectors/sqlite.mdx new file mode 100644 index 00000000..4181f557 --- /dev/null +++ b/api-reference/ingest/source-connectors/sqlite.mdx @@ -0,0 +1,23 @@ +--- +title: SQLite +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentSQLite from '/snippets/sc-shared-text/sqlite-cli-api.mdx'; +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; + + + + +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector: + +import SQLiteAPISh from '/snippets/source_connectors/sqlite.sh.mdx'; +import SQLiteAPIPyV2 from '/snippets/source_connectors/sqlite.v2.py.mdx'; + + + + + \ No newline at end of file diff --git a/mint.json b/mint.json index 1e705d55..670baf0f 100644 --- a/mint.json +++ b/mint.json @@ -174,6 +174,7 @@ "open-source/ingest/source-connectors/sftp", "open-source/ingest/source-connectors/sharepoint", "open-source/ingest/source-connectors/slack", + "open-source/ingest/source-connectors/sqlite", "open-source/ingest/source-connectors/wikipedia" ] }, @@ -325,6 +326,7 @@ "api-reference/ingest/source-connectors/sftp", "api-reference/ingest/source-connectors/sharepoint", "api-reference/ingest/source-connectors/slack", + "api-reference/ingest/source-connectors/sqlite", "api-reference/ingest/source-connectors/wikipedia" ] }, diff --git a/open-source/ingest/source-connectors/sqlite.mdx b/open-source/ingest/source-connectors/sqlite.mdx new file mode 100644 index 00000000..cbdbdda5 --- /dev/null +++ b/open-source/ingest/source-connectors/sqlite.mdx @@ -0,0 +1,27 @@ +--- +title: SQLite +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentSQLite from '/snippets/sc-shared-text/sqlite-cli-api.mdx'; + + + +Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector. + +This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page. + +import SQLiteSh from '/snippets/source_connectors/sqlite.sh.mdx'; +import SQLitePyV2 from '/snippets/source_connectors/sqlite.v2.py.mdx'; + + + + + + +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; + + diff --git a/snippets/sc-shared-text/sqlite-cli-api.mdx b/snippets/sc-shared-text/sqlite-cli-api.mdx new file mode 100644 index 00000000..cb0c8dd5 --- /dev/null +++ b/snippets/sc-shared-text/sqlite-cli-api.mdx @@ -0,0 +1,9 @@ +Connect SQLite to your preprocessing pipeline, and use the Unstructured Ingest CLI or the Unstructured Ingest Python library to batch process all your documents and store structured outputs locally on your filesystem. + +You will need: + +import SharedSQLite from '/snippets/general-shared-text/sqlite.mdx'; +import SharedSQLiteCLIAPI from '/snippets/general-shared-text/sqlite-cli-api.mdx'; + + + \ No newline at end of file diff --git a/snippets/source_connectors/sqlite.sh.mdx b/snippets/source_connectors/sqlite.sh.mdx new file mode 100644 index 00000000..98c0acd9 --- /dev/null +++ b/snippets/source_connectors/sqlite.sh.mdx @@ -0,0 +1,18 @@ +```bash CLI +#!/usr/bin/env bash + +# Chunking and embedding are optional. + +unstructured-ingest \ + sqlite \ + --database-path $SQLITE_DB_PATH \ + --table-name elements \ + --id-column id \ + --download-dir $LOCAL_FILE_DOWNLOAD_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL +``` \ No newline at end of file diff --git a/snippets/source_connectors/sqlite.v2.py.mdx b/snippets/source_connectors/sqlite.v2.py.mdx new file mode 100644 index 00000000..2b5f81f4 --- /dev/null +++ b/snippets/source_connectors/sqlite.v2.py.mdx @@ -0,0 +1,52 @@ +```python Python Ingest v2 +import os + +from unstructured_ingest.v2.pipeline.pipeline import Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig + +from unstructured_ingest.v2.processes.connectors.sql.sqlite import ( + SQLiteIndexerConfig, + SQLiteDownloaderConfig, + SQLiteConnectionConfig, + SQLiteAccessConfig +) + +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from unstructured_ingest.v2.processes.chunker import ChunkerConfig +from unstructured_ingest.v2.processes.embedder import EmbedderConfig + +from unstructured_ingest.v2.processes.connectors.local import ( + LocalConnectionConfig, + LocalUploaderConfig +) + +# Chunking and embedding are optional. + +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=SQLiteIndexerConfig( + table_name="elements", + id_column="id" + ), + downloader_config=SQLiteDownloaderConfig(download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR")), + source_connection_config=SQLiteConnectionConfig( + access_config=SQLiteAccessConfig(), + database_path=os.getenv("SQLITE_DB_PATH") + ), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + destination_connection_config=LocalConnectionConfig(), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() +``` \ No newline at end of file