diff --git a/mint.json b/mint.json index 2353fc97..0e633107 100644 --- a/mint.json +++ b/mint.json @@ -472,7 +472,6 @@ "platform/sources/overview", "platform/sources/azure-blob-storage", "platform/sources/confluence", - "platform/sources/couchbase", "platform/sources/databricks-volumes", "platform/sources/dropbox", "platform/sources/elasticsearch", @@ -501,6 +500,7 @@ "platform/destinations/mongodb", "platform/destinations/onedrive", "platform/destinations/pinecone", + "platform/destinations/postgresql", "platform/destinations/qdrant", "platform/destinations/s3", "platform/destinations/weaviate" @@ -518,7 +518,6 @@ "platform/api/sources/overview", "platform/api/sources/azure-blob-storage", "platform/api/sources/confluence", - "platform/api/sources/couchbase", "platform/api/sources/databricks-volumes", "platform/api/sources/dropbox", "platform/api/sources/elasticsearch", diff --git a/platform/api/sources/overview.mdx b/platform/api/sources/overview.mdx index 917f10b3..c52b6e30 100644 --- a/platform/api/sources/overview.mdx +++ b/platform/api/sources/overview.mdx @@ -14,7 +14,6 @@ To create a source connector, you must also provide a request body that contains For the list of specific settings, see: - [Azure](/platform/api/sources/azure-blob-storage) (`source_type=azure`) -- [Couchbase](/platform/api/sources/couchbase) (`source_type=couchbase`) - [Confluence](/platform/api/sources/confluence) (`source_type=confluence`) - [Databricks Volumes](/platform/api/sources/databricks-volumes) (`source_type=databricks_volumes`) - [Dropbox](/platform/api/sources/dropbox) (`source_type=dropbox`) diff --git a/platform/api/sources/postgresql.mdx b/platform/api/sources/postgresql.mdx new file mode 100644 index 00000000..40caacdd --- /dev/null +++ b/platform/api/sources/postgresql.mdx @@ -0,0 +1,30 @@ +--- +title: PostgreSQL +--- + +Ingest your files into Unstructured from PostgreSQL. + +The requirements are as follows. 
+ +import PostgreSQLPrerequisites from '/snippets/general-shared-text/postgresql.mdx'; + + + +To create or change a PostgreSQL source connector, see the following examples. + +import PostgreSQLAPIRESTCreate from '/snippets/source_connectors/postgresql_rest_create.mdx'; +import PostgreSQLAPIRESTChange from '/snippets/source_connectors/postgresql_rest_change.mdx'; + + + + + + +Replace the preceding placeholders as follows: + +import PostgreSQLAPIPlaceholders from '/snippets/general-shared-text/postgresql-api-placeholders.mdx'; + + + +To change a connector, replace `` with the source connector's unique ID. +To get this ID, see [List source connectors](/platform/api/overview#list-source-connectors). \ No newline at end of file diff --git a/platform/connectors.mdx b/platform/connectors.mdx index 8c97f03e..669749df 100644 --- a/platform/connectors.mdx +++ b/platform/connectors.mdx @@ -13,7 +13,6 @@ The Unstructured Platform supports connecting to the following source and destin - [Azure](/platform/sources/azure-blob-storage) - [Confluence](/platform/sources/confluence) -- [Couchbase](/platform/sources/couchbase) - [Databricks Volumes](/platform/sources/databricks-volumes) - [Dropbox](/platform/sources/dropbox) - [Elasticsearch](/platform/sources/elasticsearch) @@ -44,6 +43,7 @@ If your source is not listed here, you might still be able to connect Unstructur - [MongoDB](/platform/destinations/mongodb) - [OneDrive](/platform/destinations/onedrive) - [Pinecone](/platform/destinations/pinecone) +- [PostgreSQL](/platform/destinations/postgresql) - [Qdrant](/platform/destinations/qdrant) - [S3](/platform/destinations/s3) - [Weaviate](/platform/destinations/weaviate) diff --git a/platform/destinations/overview.mdx b/platform/destinations/overview.mdx index aaedf126..0bc5e40f 100644 --- a/platform/destinations/overview.mdx +++ b/platform/destinations/overview.mdx @@ -28,6 +28,7 @@ To create a destination connector: - [MongoDB](/platform/destinations/mongodb) - 
[OneDrive](/platform/destinations/onedrive) - [Pinecone](/platform/destinations/pinecone) + - [PostgreSQL](/platform/destinations/postgresql) - [Qdrant](/platform/destinations/qdrant) - [S3](/platform/destinations/s3) - [Weaviate](/platform/destinations/weaviate) diff --git a/platform/destinations/postgresql.mdx b/platform/destinations/postgresql.mdx index 8c96f4a5..a6861935 100644 --- a/platform/destinations/postgresql.mdx +++ b/platform/destinations/postgresql.mdx @@ -1,46 +1,26 @@ --- title: PostgreSQL -description: This page contains the information to store processed data to a PostgreSQL database. --- -## Prerequisites - -* PostgreSQL Server Hostname - -* Database Name and Port Number - -* Username and Password for Database Access - - -For more information, please refer to [PostgreSQL documentation](https://www.postgresql.org/docs/). - - - -Ensure that the index schema is compatible with the data you intend to write. If you need guidance on structuring your schema, consult the [Sample Index Schema](/open-source/ingest/destination-connectors/sql#sample-index-schema) for reference. - - -## Step-by-Step Guide - -![Destination Connector PostgreSQL](/img/platform/Destination-PostgreSQL.png) - -1. **Access the Create Destination Page**. Navigate to the “Destinations” section within the platform’s side navigation menu and click on “New Destination” to initiate the setup of a new destination for your processed data. - -2. **Select Destination Type**. Select **PostgreSQL** destination connector from the `Type` dropdown menu. - -3. **Configure Destination Details** - - -* `Name` (_required_): Assign a descriptive name to the new destination connector. - -* `Host` (_required_): Enter the hostname or IP address of the PostgreSQL server. - -* `Database` (_required_): Provide the name of the PostgreSQL database. - -* `Port`: Specify the port number for the PostgreSQL server (default is 5432). - -* `Username`: Input the username for the PostgreSQL database access. 
- -* `Password`: Enter the password associated with the username. - - -4. **Submit**. Review all the details entered to ensure accuracy. Click ‘Submit’ to finalize the creation of the Destination Connector. The newly completed PostgreSQL connector will be listed on the Destinations dashboard. \ No newline at end of file +Send processed data from Unstructured to PostgreSQL. + +The requirements are as follows. + +import PostgreSQLPrerequisites from '/snippets/general-shared-text/postgresql.mdx'; + + + +To create the destination connector: + +1. On the sidebar, click **Connectors**. +2. Click **Destinations**. +3. Click **New** or **Create Connector**. +4. Give the connector a unique **Name**. +5. In the **Provider** area, click **PostgreSQL**. +6. Click **Continue**. +7. Follow the on-screen instructions to fill in the fields as described later on this page. +8. Click **Save and Test**. + +import PostgreSQLFields from '/snippets/general-shared-text/postgresql-platform.mdx'; + + \ No newline at end of file diff --git a/platform/sources/overview.mdx b/platform/sources/overview.mdx index ad6690c4..b705718c 100644 --- a/platform/sources/overview.mdx +++ b/platform/sources/overview.mdx @@ -20,7 +20,6 @@ To create a source connector: - [Azure](/platform/sources/azure-blob-storage) - [Confluence](/platform/sources/confluence) - - [Couchbase](/platform/sources/couchbase) - [Databricks Volumes](/platform/sources/databricks-volumes) - [Dropbox](/platform/sources/dropbox) - [Elasticsearch](/platform/sources/elasticsearch) diff --git a/platform/sources/postgresql.mdx b/platform/sources/postgresql.mdx new file mode 100644 index 00000000..405da5b6 --- /dev/null +++ b/platform/sources/postgresql.mdx @@ -0,0 +1,26 @@ +--- +title: PostgreSQL +--- + +Ingest your files into Unstructured from PostgreSQL. + +The requirements are as follows. + +import PostgreSQLPrerequisites from '/snippets/general-shared-text/postgresql.mdx'; + + + +To create the source connector: + +1. 
On the sidebar, click **Connectors**. +2. Click **Sources**. +3. Click **New** or **Create Connector**. +4. Give the connector a unique **Name**. +5. In the **Provider** area, click **PostgreSQL**. +6. Click **Continue**. +7. Follow the on-screen instructions to fill in the fields as described later on this page. +8. Click **Save and Test**. + +import PostgreSQLFields from '/snippets/general-shared-text/postgresql-platform.mdx'; + + \ No newline at end of file diff --git a/snippets/destination_connectors/postgresql_rest_change.mdx b/snippets/destination_connectors/postgresql_rest_change.mdx index a8c2141c..9a841d86 100644 --- a/snippets/destination_connectors/postgresql_rest_change.mdx +++ b/snippets/destination_connectors/postgresql_rest_change.mdx @@ -13,7 +13,7 @@ curl --request 'PUT' --location \ "username": "", "password": "", "table_name": "", - "batch_size": "" + "batch_size": } }' ``` \ No newline at end of file diff --git a/snippets/destination_connectors/postgresql_rest_create.mdx b/snippets/destination_connectors/postgresql_rest_create.mdx index 46c1d070..fd5c1960 100644 --- a/snippets/destination_connectors/postgresql_rest_create.mdx +++ b/snippets/destination_connectors/postgresql_rest_create.mdx @@ -15,7 +15,7 @@ curl --request 'POST' --location \ "username": "", "password": "", "table_name": "", - "batch_size": "" + "batch_size": } }' ``` \ No newline at end of file diff --git a/snippets/general-shared-text/postgresql-api-placeholders.mdx b/snippets/general-shared-text/postgresql-api-placeholders.mdx index 8dc7c998..d6fedb4e 100644 --- a/snippets/general-shared-text/postgresql-api-placeholders.mdx +++ b/snippets/general-shared-text/postgresql-api-placeholders.mdx @@ -5,4 +5,6 @@ - `` (required) - The username. - `` (required) - The user's password. - `` (required) - The name of the table in the database. -- `` - The maximum number of rows to transmit at a time. The default is `100` if not otherwise specified. 
\ No newline at end of file +- `` - The maximum number of rows to transmit at a time. The default is `100` if not otherwise specified. +- `` (required, source connector only) - The name of the ID column in the table. +- For `fields` (source connector only), set one or more `` values, with each value representing the name of a column to process (including the specified `` column). The default is all columns if not otherwise specified. \ No newline at end of file diff --git a/snippets/general-shared-text/postgresql-platform.mdx b/snippets/general-shared-text/postgresql-platform.mdx new file mode 100644 index 00000000..f8b1f719 --- /dev/null +++ b/snippets/general-shared-text/postgresql-platform.mdx @@ -0,0 +1,12 @@ +Fill in the following fields: + +- **Name** (_required_): A unique name for this connector. +- **Host** (_required_): The host name of the target PostgreSQL instance. +- **DB Name** (_required_): The name of the target database on the instance. +- **Port** (_required_): The port number on the instance. +- **Username** (_required_): The name of the target user with the appropriate access to the instance. +- **Password** (_required_): The password for the user. +- **Table Name** (_required_): The name of the target table in the database. +- **Batch Size**: The maximum number of rows to transmit at a time. The default is `100` if not otherwise specified. +- **ID Column** (_required_, source connector only): The name of the ID column in the table. +- **Columns** (_required_, source connector only): A comma-separated list of column names to process (including the specified **ID Column**). The default is all columns if not otherwise specified. 
\ No newline at end of file diff --git a/snippets/general-shared-text/postgresql.mdx b/snippets/general-shared-text/postgresql.mdx index 313c5085..84e38771 100644 --- a/snippets/general-shared-text/postgresql.mdx +++ b/snippets/general-shared-text/postgresql.mdx @@ -1,4 +1,7 @@ -The following video shows for example how to get these settings by using [Amazon RDS for PostgreSQL](https://aws.amazon.com/rds/postgresql/): +- For the [Unstructured Platform](/platform/overview), local PostgreSQL installations are not supported. +- For [Unstructured Ingest](/ingestion/overview), local and non-local PostgreSQL installations are supported. + +The following video shows how to set up [Amazon RDS for PostgreSQL](https://aws.amazon.com/rds/postgresql/): + import AllowIPAddressRanges from '/snippets/general-shared-text/ip-address-ranges.mdx'; -- A PostgreSQL instance. [Install PostgreSQL](https://www.postgresql.org/docs/current/tutorial-install.html). -- The host name and port number for the instance. These values are in the `postgresql.conf` file's `listen_addresses` and `port` settings. This file should be on the same machine as the instance. These values might also already be set as environment variables named `PGHOST` and `PGPORT` on the same machine as the instance. +- A PostgreSQL instance. + + - [Create an Amazon RDS for PostgreSQL instance](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/CHAP_GettingStarted.CreatingConnecting.PostgreSQL.html). + - [Create an Azure Database for PostgreSQL server](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/how-to-deploy-on-azure-free-account). + - [Install PostgreSQL locally](https://www.postgresql.org/docs/current/tutorial-install.html). + +- The host name and port number for the instance. + + - For Amazon RDS for PostgreSQL, learn how to [get the host name and port number](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_ConnectToPostgreSQLInstance.html#postgresql-endpoint). 
+ - For Azure Database for PostgreSQL, learn how to [get the host](https://learn.microsoft.com/azure/postgresql/flexible-server/quickstart-create-server-portal#connect-to-the-azure-database-for-postgresql-flexible-server-database-using-psql). The port number is `5432`. + - For local PostgreSQL installations, these values are in the `postgresql.conf` file's `listen_addresses` and `port` settings. This file should be on the same machine as the instance. These values might also already be set as environment variables named `PGHOST` and `PGPORT` on the same machine as the instance. + - For other installation types, see your PostgreSQL provider's documentation. + - Depending on your network security requirements, you might need to allow access to your instance only from specific IP addresses. To learn how to allow these IP address ranges, see your PostgreSQL provider's documentation, for example with - [Amazon RDS for PostgreSQL](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Overview.RDSSecurityGroups.html). + [Amazon RDS for PostgreSQL](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Overview.RDSSecurityGroups.html) or + [Azure Database for PostgreSQL](https://learn.microsoft.com/azure/postgresql/flexible-server/how-to-manage-firewall-portal#create-a-firewall-rule-after-server-is-created). -- A database in the instance. [Create a database](https://www.postgresql.org/docs/current/tutorial-createdb.html). -- A table in the database. [Create a table](https://www.postgresql.org/docs/current/tutorial-table.html). +- A database in the instance. + + - For Amazon RDS for PostgreSQL and Azure Database for PostgreSQL, the default database name is `postgres` unless a custom database name was specified during the instance creation process. + - For local PostgreSQL installations, learn how to [create a database](https://www.postgresql.org/docs/current/tutorial-createdb.html). + - For other installation types, see your PostgreSQL provider's documentation. 
+ +- A table in the database. Learn how to [create a table](https://www.postgresql.org/docs/current/tutorial-table.html). The table's schema must match the schema of the documents that Unstructured produces. Unstructured cannot provide a schema that is guaranteed to work in all @@ -37,6 +69,7 @@ import AllowIPAddressRanges from '/snippets/general-shared-text/ip-address-range id UUID PRIMARY KEY, record_id VARCHAR, element_id VARCHAR, + record_id TEXT, text TEXT, embeddings DECIMAL [], parent_id VARCHAR, @@ -53,8 +86,9 @@ import AllowIPAddressRanges from '/snippets/general-shared-text/ip-address-range id UUID PRIMARY KEY, record_id VARCHAR, element_id VARCHAR, + record_id TEXT, text TEXT, - embeddings vector(384), + embeddings vector(3072), parent_id VARCHAR, page_number INTEGER, is_continuation BOOLEAN, @@ -69,6 +103,16 @@ import AllowIPAddressRanges from '/snippets/general-shared-text/ip-address-range - [CREATE TABLE](https://github.com/pgvector/pgvector) for PostgreSQL with pgvector - [Unstructured document elements and metadata](/api-reference/api-services/document-elements) -- A user in the database, and a password for the user. [Create a user](https://www.postgresql.org/docs/current/sql-createuser.html). -- Database access for the user. [Give database access to a user](https://www.postgresql.org/docs/current/sql-grant.html). +- A user in the database, and a password for the user. + + - For Amazon RDS for PostgreSQL, learn how to [create a user](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.Roles.html). + - For Azure Database for PostgreSQL, learn how to [create a user](https://learn.microsoft.com/azure/postgresql/flexible-server/how-to-create-users). + - For local PostgreSQL installations, learn how to [create a user](https://www.postgresql.org/docs/current/sql-createuser.html). + - For other installation types, see your PostgreSQL provider's documentation. + +- Database access for the user. 
+ - For Amazon RDS for PostgreSQL, learn how to [control user access](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.Access.html). + - For Azure Database for PostgreSQL, learn how to [control user access](https://www.postgresql.org/docs/current/sql-createuser.html). + - For local PostgreSQL installations, learn how to [give database access to a user](https://www.postgresql.org/docs/current/sql-grant.html). + - For other installation types, see your PostgreSQL provider's documentation. \ No newline at end of file diff --git a/snippets/source_connectors/postgresql_rest_change.mdx b/snippets/source_connectors/postgresql_rest_change.mdx new file mode 100644 index 00000000..331eaa4a --- /dev/null +++ b/snippets/source_connectors/postgresql_rest_change.mdx @@ -0,0 +1,24 @@ +```bash REST (Change) +curl --request 'PUT' --location \ +"$UNSTRUCTURED_API_URL/sources" \ +--header 'accept: application/json' \ +--header "unstructured-api-key: $UNSTRUCTURED_API_KEY" \ +--header 'content-type: application/json' \ +--data \ +'{ + "config": { + "host": "", + "database": "", + "port": "", + "username": "", + "password": "", + "table_name": "", + "batch_size": , + "id_column": "", + "fields": [ + "", + "" + ] + } +}' +``` \ No newline at end of file diff --git a/snippets/source_connectors/postgresql_rest_create.mdx b/snippets/source_connectors/postgresql_rest_create.mdx new file mode 100644 index 00000000..7562cc84 --- /dev/null +++ b/snippets/source_connectors/postgresql_rest_create.mdx @@ -0,0 +1,26 @@ +```bash REST (Create) +curl --request 'POST' --location \ +"$UNSTRUCTURED_API_URL/sources" \ +--header 'accept: application/json' \ +--header "unstructured-api-key: $UNSTRUCTURED_API_KEY" \ +--header 'content-type: application/json' \ +--data \ +'{ + "name": "", + "type": "postgres", + "config": { + "host": "", + "database": "", + "port": "", + "username": "", + "password": "", + "table_name": "", + "batch_size": , + "id_column": 
"", + "fields": [ + "", + "" + ] + } +}' +``` \ No newline at end of file