diff --git a/api-reference/ingest/destination-connector/neo4j.mdx b/api-reference/ingest/destination-connector/neo4j.mdx new file mode 100644 index 00000000..5fbade58 --- /dev/null +++ b/api-reference/ingest/destination-connector/neo4j.mdx @@ -0,0 +1,30 @@ +--- +title: Neo4j +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentNeo4j from '/snippets/dc-shared-text/neo4j-cli-api.mdx'; +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; + + + + +Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local source connector: + +import Neo4jAPISh from '/snippets/destination_connectors/neo4j.sh.mdx'; +import Neo4jAPIPyV2 from '/snippets/destination_connectors/neo4j.v2.py.mdx'; + + + + + + +## Graph Output + +import Neo4jGraphFormat from '/snippets/general-shared-text/neo4j-graph.mdx'; + + + diff --git a/api-reference/ingest/ingest-dependencies.mdx b/api-reference/ingest/ingest-dependencies.mdx index fd4841c9..c80c78a7 100644 --- a/api-reference/ingest/ingest-dependencies.mdx +++ b/api-reference/ingest/ingest-dependencies.mdx @@ -71,6 +71,7 @@ To add support for additional connectors, run the following: | `pip install "unstructured-ingest[kafka]"` | Apache Kafka | | `pip install "unstructured-ingest[milvus]"` | Milvus | | `pip install "unstructured-ingest[mongodb]"` | MongoDB | +| `pip install "unstructured-ingest[neo4j]"` | Neo4j | | `pip install "unstructured-ingest[notion]"` | Notion | | `pip install "unstructured-ingest[onedrive]"` | OneDrive | | `pip install "unstructured-ingest[opensearch]"` | OpenSearch | diff --git a/mint.json b/mint.json index 788d43ef..b999a3d3 100644 --- a/mint.json +++ b/mint.json @@ -223,6 +223,7 @@ "open-source/ingest/destination-connectors/milvus", "open-source/ingest/destination-connectors/mongodb", "open-source/ingest/destination-connectors/motherduck", + 
"open-source/ingest/destination-connectors/neo4j", "open-source/ingest/destination-connectors/onedrive", "open-source/ingest/destination-connectors/opensearch", "open-source/ingest/destination-connectors/pinecone", @@ -384,6 +385,7 @@ "api-reference/ingest/destination-connector/milvus", "api-reference/ingest/destination-connector/mongodb", "api-reference/ingest/destination-connector/motherduck", + "api-reference/ingest/destination-connector/neo4j", "api-reference/ingest/destination-connector/onedrive", "api-reference/ingest/destination-connector/opensearch", "api-reference/ingest/destination-connector/pinecone", diff --git a/open-source/ingest/destination-connectors/neo4j.mdx b/open-source/ingest/destination-connectors/neo4j.mdx new file mode 100644 index 00000000..ef8443b0 --- /dev/null +++ b/open-source/ingest/destination-connectors/neo4j.mdx @@ -0,0 +1,33 @@ +--- +title: Neo4j +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedNeo4j from '/snippets/dc-shared-text/neo4j-cli-api.mdx'; + + + +Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector. + +This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. 
+ +import Neo4jAPISh from '/snippets/destination_connectors/neo4j.sh.mdx'; +import Neo4jAPIPyV2 from '/snippets/destination_connectors/neo4j.v2.py.mdx'; + + + + + + +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; + + + +## Graph Output + +import Neo4jGraphFormat from '/snippets/general-shared-text/neo4j-graph.mdx'; + + diff --git a/snippets/dc-shared-text/neo4j-cli-api.mdx b/snippets/dc-shared-text/neo4j-cli-api.mdx new file mode 100644 index 00000000..5c5422bb --- /dev/null +++ b/snippets/dc-shared-text/neo4j-cli-api.mdx @@ -0,0 +1,9 @@ +Batch process all your records to store structured outputs in a Neo4j account. + +The requirements are as follows. + +import SharedNeo4j from '/snippets/general-shared-text/neo4j.mdx'; +import SharedNeo4jCLIAPI from '/snippets/general-shared-text/neo4j-cli-api.mdx'; + + + diff --git a/snippets/destination_connectors/neo4j.sh.mdx b/snippets/destination_connectors/neo4j.sh.mdx new file mode 100644 index 00000000..46a580ca --- /dev/null +++ b/snippets/destination_connectors/neo4j.sh.mdx @@ -0,0 +1,22 @@ +```bash CLI +#!/usr/bin/env bash + +# Chunking and embedding are optional. 
+# NEO4J_URI takes the form <scheme>://<host>:<port>, for example bolt://localhost:7687. +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --strategy hi_res \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + neo4j \ + --username $NEO4J_USERNAME \ + --password $NEO4J_PASSWORD \ + --uri $NEO4J_URI \ + --database $NEO4J_DATABASE \ + --batch-size 100 +``` \ No newline at end of file diff --git a/snippets/destination_connectors/neo4j.v2.py.mdx b/snippets/destination_connectors/neo4j.v2.py.mdx new file mode 100644 index 00000000..b390a46c --- /dev/null +++ b/snippets/destination_connectors/neo4j.v2.py.mdx @@ -0,0 +1,51 @@ +```python Python Ingest v2 +import os + +from unstructured_ingest.v2.pipeline.pipeline import Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig + +from unstructured_ingest.v2.processes.connectors.neo4j import ( + Neo4jAccessConfig, + Neo4jConnectionConfig, + Neo4jUploadStagerConfig, + Neo4jUploaderConfig +) +from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalConnectionConfig, + LocalDownloaderConfig +) +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from unstructured_ingest.v2.processes.chunker import ChunkerConfig +from unstructured_ingest.v2.processes.embedder import EmbedderConfig + +# Chunking and embedding are optional. 
+ +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + destination_connection_config=Neo4jConnectionConfig( + access_config=Neo4jAccessConfig(password=os.getenv("NEO4J_PASSWORD")), + username=os.getenv("NEO4J_USERNAME"), + uri=os.getenv("NEO4J_URI"), + database=os.getenv("NEO4J_DATABASE"), + ), + stager_config=Neo4jUploadStagerConfig(), + uploader_config=Neo4jUploaderConfig(batch_size=100) + ).run() +``` \ No newline at end of file diff --git a/snippets/general-shared-text/neo4j-cli-api.mdx b/snippets/general-shared-text/neo4j-cli-api.mdx new file mode 100644 index 00000000..6e84be2e --- /dev/null +++ b/snippets/general-shared-text/neo4j-cli-api.mdx @@ -0,0 +1,16 @@ +The Neo4j connector dependencies: + +```bash CLI, Python +pip install "unstructured-ingest[neo4j]" +``` + +import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx'; + + + +The following environment variables: + +- `NEO4J_USERNAME` - The name of the target user with access to the target Neo4j deployment, represented by `--username` (CLI) or `username` (Python). +- `NEO4J_PASSWORD` - The user's password, represented by `--password` (CLI) or `password` (Python). +- `NEO4J_URI` - The connection URI for the deployment, represented by `--uri` (CLI) or `uri` (Python). 
+- `NEO4J_DATABASE` - The name of the database in the deployment, represented by `--database` (CLI) or `database` (Python). \ No newline at end of file diff --git a/snippets/general-shared-text/neo4j-graph.mdx b/snippets/general-shared-text/neo4j-graph.mdx new file mode 100644 index 00000000..fe1d43e9 --- /dev/null +++ b/snippets/general-shared-text/neo4j-graph.mdx @@ -0,0 +1,64 @@ +The graph output of the Neo4j destination connector is represented in the following diagram: + +```mermaid +graph BT + subgraph dn [Document Node] + D[Document] + end + style dn stroke-dasharray: 5 + + subgraph en [Element Nodes] + UE1[UnstructuredElement] + UE2[UnstructuredElement] + UE3[UnstructuredElement] + UE4[UnstructuredElement] + UE5[UnstructuredElement] + UE6[UnstructuredElement] + end + style en stroke-dasharray: 5 + + UE1 -->|PART_OF_DOCUMENT| D + UE2 -->|PART_OF_DOCUMENT| D + UE3 -->|PART_OF_DOCUMENT| D + UE4 -->|PART_OF_DOCUMENT| D + UE5 -->|PART_OF_DOCUMENT| D + UE6 -->|PART_OF_DOCUMENT| D + + subgraph cn [Chunk Nodes] + C1[Chunk] + C2[Chunk] + C3[Chunk] + C4[Chunk] + end + style cn stroke-dasharray: 5 + + C1 -->|NEXT_CHUNK| C2 + C2 -->|NEXT_CHUNK| C3 + C3 -->|NEXT_CHUNK| C4 + + C1 -->|PART_OF_DOCUMENT| D + C2 -->|PART_OF_DOCUMENT| D + C3 -->|PART_OF_DOCUMENT| D + C4 -->|PART_OF_DOCUMENT| D + + UE1 -.->|PART_OF_CHUNK| C1 + UE2 -.->|PART_OF_CHUNK| C1 + UE3 -.->|PART_OF_CHUNK| C2 + UE4 -.->|PART_OF_CHUNK| C3 + UE5 -.->|PART_OF_CHUNK| C4 + UE6 -.->|PART_OF_CHUNK| C4 +``` + +[View the preceding diagram in full-screen 
mode](https://mermaid.live/view#pako:eNqFlN9vgjAQx_-Vps-6REEfeFiyFZYli7hskCyTxXS0ihFaU9oHo_7vq_IjgIzyxN330157d70TjDmh0IFbgQ8JeA4iBvSXq9_CQRhYuTxWGWUS-Br9KQC39pYOyki5VB5Tel2XS8H3dExwnmAh8NEBs4LohKA6hJfSOkJe7hh6k1XI9C4qlkpQUjK1Oh1UrUHVHlRng-p8QO1kgRqzoC8JxuPH8_vTR7BevqzdJQoXnh-cgVvf0wRYJsA2ATMTMP8f6FQz1tVEiWL7Vi3RpHBW5rRtWm3TbpmdnMbGnKIipb73FazRa-i_nXXAKvC9ZFWHuJfs6nrIUCVkKBIy1AjZpgTfGuWhwVRnnDT6ZFC3-vVpo0v6dKvRJH263eiRXh2OYEZFhndEj5nTlY6gTPSriaCjfwndYJXKCEbsolGsJP88shg6-onRERRcbRPobHCaa0sdCJbU3WHdbFmFHDD75jyrIUp2kotFMddu4-3yB3k-fcg). + +In the preceding diagram: + +- The `Document` node represents the source file. +- The `UnstructuredElement` nodes represent the source file's Unstructured `Element` objects, before chunking. +- The `Chunk` nodes represent the source file's Unstructured `Element` objects, after chunking. +- Each `UnstructuredElement` node has a `PART_OF_DOCUMENT` relationship with the `Document` node. +- Each `Chunk` node also has a `PART_OF_DOCUMENT` relationship with the `Document` node. +- Each `UnstructuredElement` node has a `PART_OF_CHUNK` relationship with a `Chunk` node. +- Each `Chunk` node, except for the "last" `Chunk` node, has a `NEXT_CHUNK` relationship with its "next" `Chunk` node. + +Learn more about [document elements](/platform/document-elements) and [chunking](/platform/chunking). \ No newline at end of file diff --git a/snippets/general-shared-text/neo4j.mdx b/snippets/general-shared-text/neo4j.mdx new file mode 100644 index 00000000..ebeb1020 --- /dev/null +++ b/snippets/general-shared-text/neo4j.mdx @@ -0,0 +1,28 @@ +- A [Neo4j deployment](https://neo4j.com/deployment-center/). +- The username and password for the user who has access to the Neo4j deployment. The default user is typically `neo4j`. + + - For a Neo4j AuraDB instance, the default user's password is typically set when the instance is created. 
+ - For an AWS Marketplace, Microsoft Azure Marketplace, or Google Cloud Marketplace deployment of Neo4j, the default user is typically set during the deployment process. + - For a local Neo4j deployment, you can [set the default user's initial password](https://neo4j.com/docs/operations-manual/current/configuration/set-initial-password/) or [recover an admin user and its password](https://neo4j.com/docs/operations-manual/current/authentication-authorization/password-and-user-recovery/). + +- The connection URI for the Neo4j deployment, which starts with `neo4j://`, `neo4j+s://`, `bolt://`, or `bolt+s://`; followed by `localhost` or the host name; and sometimes ending with a colon and the port number (such as `:7687`). For example: + + - For a Neo4j AuraDB deployment, browse to the target Neo4j instance in the Neo4j Aura account and click **Connect > Drivers** to get the connection URI, which follows the format `neo4j+s://<instance-id>.databases.neo4j.io`. A port number is not used or needed. + - For an AWS Marketplace, Microsoft Azure Marketplace, or Google Cloud Marketplace deployment of Neo4j, see + [Neo4j on AWS](https://neo4j.com/docs/operations-manual/current/cloud-deployments/neo4j-aws/), + [Neo4j on Azure](https://neo4j.com/docs/operations-manual/current/cloud-deployments/neo4j-azure/), or + [Neo4j on GCP](https://neo4j.com/docs/operations-manual/current/cloud-deployments/neo4j-gcp/) + for details about how to get the connection URI. + - For a local Neo4j deployment, the URI is typically `bolt://localhost:7687`. + - For other Neo4j deployment types, see the deployment provider's documentation. + + [Learn more](https://neo4j.com/docs/browser-manual/current/operations/dbms-connection). + +- The name of the target database in the Neo4j deployment. A default Neo4j deployment typically contains two standard databases: one named `neo4j` for user data and another + named `system` for system data and metadata. 
Some Neo4j deployment types support more than these two databases per deployment; + Neo4j AuraDB instances do not. + + - [Create additional databases](https://neo4j.com/docs/operations-manual/current/database-administration/standard-databases/create-databases/) + for a local Neo4j deployment that uses Enterprise Edition; or for Neo4j on AWS, Neo4j on Azure, or Neo4j on GCP deployments. + - [Get a list of additional available databases](https://neo4j.com/docs/operations-manual/current/database-administration/standard-databases/listing-databases/) + for a local Neo4j deployment that uses Enterprise Edition; or for Neo4j on AWS, Neo4j on Azure, or Neo4j on GCP deployments. \ No newline at end of file