Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions api-reference/ingest/source-connectors/postgresql.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
---
title: PostgreSQL
---

import NewDocument from '/snippets/general-shared-text/new-document.mdx';

<NewDocument />

import SharedContentPostgreSQL from '/snippets/sc-shared-text/postgresql-cli-api.mdx';
import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';

<SharedContentPostgreSQL/>
<SharedAPIKeyURL/>

Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. You can use any supported destination connector; this example uses the local destination connector:

import PostgreSQLAPISh from '/snippets/source_connectors/postgresql.sh.mdx';
import PostgreSQLAPIPyV2 from '/snippets/source_connectors/postgresql.v2.py.mdx';

<CodeGroup>
<PostgreSQLAPISh />
<PostgreSQLAPIPyV2 />
</CodeGroup>
3 changes: 3 additions & 0 deletions mint.json
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@
"open-source/ingest/source-connectors/one-drive",
"open-source/ingest/source-connectors/opensearch",
"open-source/ingest/source-connectors/outlook",
"open-source/ingest/source-connectors/postgresql",
"open-source/ingest/source-connectors/reddit",
"open-source/ingest/source-connectors/s3",
"open-source/ingest/source-connectors/salesforce",
Expand Down Expand Up @@ -319,9 +320,11 @@
"api-reference/ingest/source-connectors/one-drive",
"api-reference/ingest/source-connectors/opensearch",
"api-reference/ingest/source-connectors/outlook",
"api-reference/ingest/source-connectors/postgresql",
"api-reference/ingest/source-connectors/reddit",
"api-reference/ingest/source-connectors/s3",
"api-reference/ingest/source-connectors/salesforce",
"api-reference/ingest/source-connectors/slack",
"api-reference/ingest/source-connectors/sftp",
"api-reference/ingest/source-connectors/sharepoint",
"api-reference/ingest/source-connectors/slack",
Expand Down
28 changes: 28 additions & 0 deletions open-source/ingest/source-connectors/postgresql.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
title: PostgreSQL
---

import NewDocument from '/snippets/general-shared-text/new-document.mdx';

<NewDocument />

import SharedContentPostgreSQL from '/snippets/sc-shared-text/postgresql-cli-api.mdx';

<SharedContentPostgreSQL/>

Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. You can use any supported destination connector; this example uses the local destination connector.

This example sends data to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page.

import PostgreSQLSh from '/snippets/source_connectors/postgresql.sh.mdx';
import PostgreSQLPyV2 from '/snippets/source_connectors/postgresql.v2.py.mdx';

<CodeGroup>
<PostgreSQLSh />
<PostgreSQLPyV2 />
</CodeGroup>

import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';

<SharedPartitionByAPIOSS/>

9 changes: 9 additions & 0 deletions snippets/sc-shared-text/postgresql-cli-api.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Connect PostgreSQL to your preprocessing pipeline, and use the Unstructured Ingest CLI or the Unstructured Ingest Python library to batch process all your documents and store structured outputs locally on your filesystem.

You will need:

import SharedPostgreSQL from '/snippets/general-shared-text/postgresql.mdx';
import SharedPostgreSQLCLIAPI from '/snippets/general-shared-text/postgresql-cli-api.mdx';

<SharedPostgreSQL />
<SharedPostgreSQLCLIAPI />
22 changes: 22 additions & 0 deletions snippets/source_connectors/postgresql.sh.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
```bash CLI
#!/usr/bin/env bash

# Chunking and embedding are optional.
#
# Every variable expansion is double-quoted so that a value containing spaces
# or shell metacharacters (for example a password or a directory path) reaches
# the CLI as a single, unmodified argument.

unstructured-ingest \
  postgres \
    --host "$PGHOST" \
    --port "$PGPORT" \
    --database "$PGDATABASE" \
    --table-name elements \
    --id-column id \
    --username "$PGUSER" \
    --password "$PGPASSWORD" \
    --download-dir "$LOCAL_FILE_DOWNLOAD_DIR" \
    --chunking-strategy by_title \
    --embedding-provider huggingface \
    --output-dir "$LOCAL_FILE_OUTPUT_DIR" \
    --partition-by-api \
    --api-key "$UNSTRUCTURED_API_KEY" \
    --partition-endpoint "$UNSTRUCTURED_API_URL"
```
55 changes: 55 additions & 0 deletions snippets/source_connectors/postgresql.v2.py.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
```python Python Ingest v2
import os

from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig

from unstructured_ingest.v2.processes.connectors.sql.postgres import (
PostgresIndexerConfig,
PostgresDownloaderConfig,
PostgresConnectionConfig,
PostgresAccessConfig
)

from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

from unstructured_ingest.v2.processes.connectors.local import (
LocalConnectionConfig,
LocalUploaderConfig
)

# Chunking and embedding are optional.

if __name__ == "__main__":
    # Assemble every pipeline stage from its config object, then run it.
    Pipeline.from_configs(
        context=ProcessorConfig(),
        # Source table and the column used to identify its rows.
        indexer_config=PostgresIndexerConfig(
            table_name="elements",
            id_column="id",
        ),
        downloader_config=PostgresDownloaderConfig(
            download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR"),
        ),
        # Connection settings are read from environment variables so that no
        # credentials are hard-coded in the script.
        source_connection_config=PostgresConnectionConfig(
            access_config=PostgresAccessConfig(password=os.getenv("PGPASSWORD")),
            host=os.getenv("PGHOST"),
            port=os.getenv("PGPORT"),
            username=os.getenv("PGUSER"),
            database=os.getenv("PGDATABASE"),
        ),
        # Partitioning is delegated to the Unstructured API endpoint named by
        # UNSTRUCTURED_API_URL rather than performed locally.
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
            additional_partition_args={
                "split_pdf_page": True,
                "split_pdf_allow_failed": True,
                "split_pdf_concurrency_level": 15,
            },
        ),
        # Optional stages: chunking and embedding.
        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
        # Destination: the local connector, writing to LOCAL_FILE_OUTPUT_DIR.
        destination_connection_config=LocalConnectionConfig(),
        uploader_config=LocalUploaderConfig(
            output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR"),
        ),
    ).run()
```