From d86199b7e97c574d2a1c2acdab45c37298a516aa Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 24 Oct 2024 09:59:01 -0700 Subject: [PATCH] PostgreSQL v2 source connector --- .../ingest/source-connectors/postgresql.mdx | 23 ++++++++ mint.json | 3 + .../ingest/source-connectors/postgresql.mdx | 28 ++++++++++ .../sc-shared-text/postgresql-cli-api.mdx | 9 +++ snippets/source_connectors/postgresql.sh.mdx | 22 ++++++++ .../source_connectors/postgresql.v2.py.mdx | 55 +++++++++++++++++++ 6 files changed, 140 insertions(+) create mode 100644 api-reference/ingest/source-connectors/postgresql.mdx create mode 100644 open-source/ingest/source-connectors/postgresql.mdx create mode 100644 snippets/sc-shared-text/postgresql-cli-api.mdx create mode 100644 snippets/source_connectors/postgresql.sh.mdx create mode 100644 snippets/source_connectors/postgresql.v2.py.mdx diff --git a/api-reference/ingest/source-connectors/postgresql.mdx b/api-reference/ingest/source-connectors/postgresql.mdx new file mode 100644 index 00000000..60f5cea5 --- /dev/null +++ b/api-reference/ingest/source-connectors/postgresql.mdx @@ -0,0 +1,23 @@ +--- +title: PostgreSQL +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + +<NewDocument /> + +import SharedContentPostgreSQL from '/snippets/sc-shared-text/postgresql-cli-api.mdx'; +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; + +<SharedContentPostgreSQL /> +<SharedAPIKeyURL /> + +Now call the Unstructured CLI or Python SDK. The destination connector can be any of the ones supported. 
This example uses the local destination connector: + +import PostgreSQLAPISh from '/snippets/source_connectors/postgresql.sh.mdx'; +import PostgreSQLAPIPyV2 from '/snippets/source_connectors/postgresql.v2.py.mdx'; + +<CodeGroup> + <PostgreSQLAPISh /> + <PostgreSQLAPIPyV2 /> +</CodeGroup> \ No newline at end of file diff --git a/mint.json b/mint.json index 1e705d55..08b97b67 100644 --- a/mint.json +++ b/mint.json @@ -168,6 +168,7 @@ "open-source/ingest/source-connectors/one-drive", "open-source/ingest/source-connectors/opensearch", "open-source/ingest/source-connectors/outlook", + "open-source/ingest/source-connectors/postgresql", "open-source/ingest/source-connectors/reddit", "open-source/ingest/source-connectors/s3", "open-source/ingest/source-connectors/salesforce", @@ -319,9 +320,11 @@ "api-reference/ingest/source-connectors/one-drive", "api-reference/ingest/source-connectors/opensearch", "api-reference/ingest/source-connectors/outlook", + "api-reference/ingest/source-connectors/postgresql", "api-reference/ingest/source-connectors/reddit", "api-reference/ingest/source-connectors/s3", "api-reference/ingest/source-connectors/salesforce", + "api-reference/ingest/source-connectors/slack", "api-reference/ingest/source-connectors/sftp", "api-reference/ingest/source-connectors/sharepoint", "api-reference/ingest/source-connectors/slack", diff --git a/open-source/ingest/source-connectors/postgresql.mdx b/open-source/ingest/source-connectors/postgresql.mdx new file mode 100644 index 00000000..e25842f6 --- /dev/null +++ b/open-source/ingest/source-connectors/postgresql.mdx @@ -0,0 +1,28 @@ +--- +title: PostgreSQL +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + +<NewDocument /> + +import SharedContentPostgreSQL from '/snippets/sc-shared-text/postgresql-cli-api.mdx'; + +<SharedContentPostgreSQL /> + +Now call the Unstructured CLI or Python. The destination connector can be any of the ones supported. This example uses the local destination connector: + +This example sends data to Unstructured API services for processing by default. 
To process files locally instead, see the instructions at the end of this page. + +import PostgreSQLSh from '/snippets/source_connectors/postgresql.sh.mdx'; +import PostgreSQLPyV2 from '/snippets/source_connectors/postgresql.v2.py.mdx'; + +<CodeGroup> + <PostgreSQLSh /> + <PostgreSQLPyV2 /> +</CodeGroup> + +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; + +<SharedPartitionByAPIOSS /> + diff --git a/snippets/sc-shared-text/postgresql-cli-api.mdx b/snippets/sc-shared-text/postgresql-cli-api.mdx new file mode 100644 index 00000000..f9486f50 --- /dev/null +++ b/snippets/sc-shared-text/postgresql-cli-api.mdx @@ -0,0 +1,9 @@ +Connect PostgreSQL to your preprocessing pipeline, and use the Unstructured Ingest CLI or the Unstructured Ingest Python library to batch process all your documents and store structured outputs locally on your filesystem. + +You will need: + +import SharedPostgreSQL from '/snippets/general-shared-text/postgresql.mdx'; +import SharedPostgreSQLCLIAPI from '/snippets/general-shared-text/postgresql-cli-api.mdx'; + +<SharedPostgreSQL /> +<SharedPostgreSQLCLIAPI /> diff --git a/snippets/source_connectors/postgresql.sh.mdx b/snippets/source_connectors/postgresql.sh.mdx new file mode 100644 index 00000000..6abf59d7 --- /dev/null +++ b/snippets/source_connectors/postgresql.sh.mdx @@ -0,0 +1,22 @@ +```bash CLI +#!/usr/bin/env bash + +# Chunking and embedding are optional. 
+ +unstructured-ingest \ + postgres \ + --host $PGHOST \ + --port $PGPORT \ + --database $PGDATABASE \ + --table-name elements \ + --id-column id \ + --username $PGUSER \ + --password $PGPASSWORD \ + --download-dir $LOCAL_FILE_DOWNLOAD_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL +``` \ No newline at end of file diff --git a/snippets/source_connectors/postgresql.v2.py.mdx b/snippets/source_connectors/postgresql.v2.py.mdx new file mode 100644 index 00000000..e0cf9408 --- /dev/null +++ b/snippets/source_connectors/postgresql.v2.py.mdx @@ -0,0 +1,55 @@ +```python Python Ingest v2 +import os + +from unstructured_ingest.v2.pipeline.pipeline import Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig + +from unstructured_ingest.v2.processes.connectors.sql.postgres import ( + PostgresIndexerConfig, + PostgresDownloaderConfig, + PostgresConnectionConfig, + PostgresAccessConfig +) + +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from unstructured_ingest.v2.processes.chunker import ChunkerConfig +from unstructured_ingest.v2.processes.embedder import EmbedderConfig + +from unstructured_ingest.v2.processes.connectors.local import ( + LocalConnectionConfig, + LocalUploaderConfig +) + +# Chunking and embedding are optional. 
+ +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=PostgresIndexerConfig( + table_name="elements", + id_column="id" + ), + downloader_config=PostgresDownloaderConfig(download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR")), + source_connection_config=PostgresConnectionConfig( + access_config=PostgresAccessConfig(password=os.getenv("PGPASSWORD")), + host=os.getenv("PGHOST"), + port=os.getenv("PGPORT"), + username=os.getenv("PGUSER"), + database=os.getenv("PGDATABASE") + ), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + destination_connection_config=LocalConnectionConfig(), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() +``` \ No newline at end of file