diff --git a/api-reference/ingest/destination-connector/duckdb.mdx b/api-reference/ingest/destination-connector/duckdb.mdx
new file mode 100644
index 00000000..897beda0
--- /dev/null
+++ b/api-reference/ingest/destination-connector/duckdb.mdx
@@ -0,0 +1,24 @@
+---
+title: DuckDB
+---
+
+import NewDocument from '/snippets/general-shared-text/new-document.mdx';
+
+
+
+import SharedContentDuckDB from '/snippets/dc-shared-text/duckdb-cli-api.mdx';
+import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';
+
+
+
+
+Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local source connector:
+
+import DuckDBAPISh from '/snippets/destination_connectors/duckdb.sh.mdx';
+import DuckDBAPIPyV2 from '/snippets/destination_connectors/duckdb.v2.py.mdx';
+
+
+
+
+
+
diff --git a/api-reference/ingest/destination-connector/motherduck.mdx b/api-reference/ingest/destination-connector/motherduck.mdx
new file mode 100644
index 00000000..997f889f
--- /dev/null
+++ b/api-reference/ingest/destination-connector/motherduck.mdx
@@ -0,0 +1,24 @@
+---
+title: MotherDuck
+---
+
+import NewDocument from '/snippets/general-shared-text/new-document.mdx';
+
+
+
+import SharedContentMotherDuck from '/snippets/dc-shared-text/motherduck-cli-api.mdx';
+import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';
+
+
+
+
+Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local source connector:
+
+import MotherDuckAPISh from '/snippets/destination_connectors/motherduck.sh.mdx';
+import MotherDuckAPIPyV2 from '/snippets/destination_connectors/motherduck.v2.py.mdx';
+
+
+
+
+
+
diff --git a/api-reference/ingest/ingest-dependencies.mdx b/api-reference/ingest/ingest-dependencies.mdx
index de63557e..fd4841c9 100644
--- a/api-reference/ingest/ingest-dependencies.mdx
+++ b/api-reference/ingest/ingest-dependencies.mdx
@@ -60,6 +60,7 @@ To add support for additional connectors, run the following:
| `pip install "unstructured-ingest[delta-table]"` | Delta Tables |
| `pip install "unstructured-ingest[discord]"` | Discord |
| `pip install "unstructured-ingest[dropbox]"` | Dropbox |
+| `pip install "unstructured-ingest[duckdb]"` | DuckDB, MotherDuck |
| `pip install "unstructured-ingest[elasticsearch]"` | Elasticsearch |
| `pip install "unstructured-ingest[gcs]"` | Google Cloud Storage |
| `pip install "unstructured-ingest[github]"` | GitHub |
diff --git a/mint.json b/mint.json
index 0e633107..51d3789c 100644
--- a/mint.json
+++ b/mint.json
@@ -213,6 +213,7 @@
"open-source/ingest/destination-connectors/databricks-volumes",
"open-source/ingest/destination-connectors/delta-table",
"open-source/ingest/destination-connectors/dropbox",
+ "open-source/ingest/destination-connectors/duckdb",
"open-source/ingest/destination-connectors/elasticsearch",
"open-source/ingest/destination-connectors/google-cloud-service",
"open-source/ingest/destination-connectors/kafka",
@@ -221,6 +222,7 @@
"open-source/ingest/destination-connectors/local",
"open-source/ingest/destination-connectors/milvus",
"open-source/ingest/destination-connectors/mongodb",
+ "open-source/ingest/destination-connectors/motherduck",
"open-source/ingest/destination-connectors/onedrive",
"open-source/ingest/destination-connectors/opensearch",
"open-source/ingest/destination-connectors/pinecone",
@@ -372,6 +374,7 @@
"api-reference/ingest/destination-connector/databricks-volumes",
"api-reference/ingest/destination-connector/delta-table",
"api-reference/ingest/destination-connector/dropbox",
+ "api-reference/ingest/destination-connector/duckdb",
"api-reference/ingest/destination-connector/elasticsearch",
"api-reference/ingest/destination-connector/google-cloud-service",
"api-reference/ingest/destination-connector/kafka",
@@ -380,6 +383,7 @@
"api-reference/ingest/destination-connector/local",
"api-reference/ingest/destination-connector/milvus",
"api-reference/ingest/destination-connector/mongodb",
+ "api-reference/ingest/destination-connector/motherduck",
"api-reference/ingest/destination-connector/onedrive",
"api-reference/ingest/destination-connector/opensearch",
"api-reference/ingest/destination-connector/pinecone",
diff --git a/open-source/ingest/destination-connectors/duckdb.mdx b/open-source/ingest/destination-connectors/duckdb.mdx
new file mode 100644
index 00000000..c05d53cb
--- /dev/null
+++ b/open-source/ingest/destination-connectors/duckdb.mdx
@@ -0,0 +1,27 @@
+---
+title: DuckDB
+---
+
+import NewDocument from '/snippets/general-shared-text/new-document.mdx';
+
+
+
+import SharedDuckDB from '/snippets/dc-shared-text/duckdb-cli-api.mdx';
+
+
+
+Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector.
+
+This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page.
+
+import DuckDBAPISh from '/snippets/destination_connectors/duckdb.sh.mdx';
+import DuckDBAPIPyV2 from '/snippets/destination_connectors/duckdb.v2.py.mdx';
+
+
+
+
+
+
+import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';
+
+
diff --git a/open-source/ingest/destination-connectors/motherduck.mdx b/open-source/ingest/destination-connectors/motherduck.mdx
new file mode 100644
index 00000000..d562121a
--- /dev/null
+++ b/open-source/ingest/destination-connectors/motherduck.mdx
@@ -0,0 +1,27 @@
+---
+title: MotherDuck
+---
+
+import NewDocument from '/snippets/general-shared-text/new-document.mdx';
+
+
+
+import SharedMotherDuck from '/snippets/dc-shared-text/motherduck-cli-api.mdx';
+
+
+
+Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector.
+
+This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page.
+
+import MotherDuckAPISh from '/snippets/destination_connectors/motherduck.sh.mdx';
+import MotherDuckAPIPyV2 from '/snippets/destination_connectors/motherduck.v2.py.mdx';
+
+
+
+
+
+
+import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';
+
+
diff --git a/snippets/dc-shared-text/duckdb-cli-api.mdx b/snippets/dc-shared-text/duckdb-cli-api.mdx
new file mode 100644
index 00000000..5a3c49bc
--- /dev/null
+++ b/snippets/dc-shared-text/duckdb-cli-api.mdx
@@ -0,0 +1,9 @@
+Batch process all your records to store structured outputs in a DuckDB installation.
+
+The requirements are as follows.
+
+import SharedDuckDB from '/snippets/general-shared-text/duckdb.mdx';
+import SharedDuckDBCLIAPI from '/snippets/general-shared-text/duckdb-cli-api.mdx';
+
+
+
diff --git a/snippets/dc-shared-text/motherduck-cli-api.mdx b/snippets/dc-shared-text/motherduck-cli-api.mdx
new file mode 100644
index 00000000..f2676580
--- /dev/null
+++ b/snippets/dc-shared-text/motherduck-cli-api.mdx
@@ -0,0 +1,9 @@
+Batch process all your records to store structured outputs in a MotherDuck account.
+
+The requirements are as follows.
+
+import SharedMotherDuck from '/snippets/general-shared-text/motherduck.mdx';
+import SharedMotherDuckCLIAPI from '/snippets/general-shared-text/motherduck-cli-api.mdx';
+
+
+
diff --git a/snippets/destination_connectors/duckdb.sh.mdx b/snippets/destination_connectors/duckdb.sh.mdx
new file mode 100644
index 00000000..365817fd
--- /dev/null
+++ b/snippets/destination_connectors/duckdb.sh.mdx
@@ -0,0 +1,19 @@
+```bash CLI
+#!/usr/bin/env bash
+
+# Chunking and embedding are optional.
+
+unstructured-ingest \
+ local \
+ --input-path $LOCAL_FILE_INPUT_DIR \
+ --chunking-strategy by_title \
+ --embedding-provider huggingface \
+ --partition-by-api \
+ --api-key $UNSTRUCTURED_API_KEY \
+ --partition-endpoint $UNSTRUCTURED_API_URL \
+ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
+ duckdb \
+ --database $DUCKDB_DATABASE \
+ --db-schema $DUCKDB_DB_SCHEMA \
+ --table $DUCKDB_TABLE
+```
\ No newline at end of file
diff --git a/snippets/destination_connectors/duckdb.v2.py.mdx b/snippets/destination_connectors/duckdb.v2.py.mdx
new file mode 100644
index 00000000..e582a6a1
--- /dev/null
+++ b/snippets/destination_connectors/duckdb.v2.py.mdx
@@ -0,0 +1,51 @@
+```python Python Ingest v2
+import os
+
+from unstructured_ingest.v2.pipeline.pipeline import Pipeline
+from unstructured_ingest.v2.interfaces import ProcessorConfig
+
+from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
+ DuckDBAccessConfig,
+ DuckDBConnectionConfig,
+ DuckDBUploadStagerConfig,
+ DuckDBUploaderConfig
+)
+from unstructured_ingest.v2.processes.connectors.local import (
+ LocalIndexerConfig,
+ LocalConnectionConfig,
+ LocalDownloaderConfig
+)
+from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
+from unstructured_ingest.v2.processes.chunker import ChunkerConfig
+from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+
+# Chunking and embedding are optional.
+
+if __name__ == "__main__":
+ Pipeline.from_configs(
+ context=ProcessorConfig(),
+ indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
+ downloader_config=LocalDownloaderConfig(),
+ source_connection_config=LocalConnectionConfig(),
+ partitioner_config=PartitionerConfig(
+ partition_by_api=True,
+ api_key=os.getenv("UNSTRUCTURED_API_KEY"),
+ partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
+ additional_partition_args={
+ "split_pdf_page": True,
+ "split_pdf_allow_failed": True,
+ "split_pdf_concurrency_level": 15
+ }
+ ),
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
+ destination_connection_config=DuckDBConnectionConfig(
+ access_config=DuckDBAccessConfig(),
+ database=os.getenv("DUCKDB_DATABASE"),
+ db_schema=os.getenv("DUCKDB_DB_SCHEMA"),
+ table=os.getenv("DUCKDB_TABLE")
+ ),
+ stager_config=DuckDBUploadStagerConfig(),
+ uploader_config=DuckDBUploaderConfig(batch_size=50)
+ ).run()
+```
\ No newline at end of file
diff --git a/snippets/destination_connectors/motherduck.sh.mdx b/snippets/destination_connectors/motherduck.sh.mdx
new file mode 100644
index 00000000..638bb2b9
--- /dev/null
+++ b/snippets/destination_connectors/motherduck.sh.mdx
@@ -0,0 +1,20 @@
+```bash CLI
+#!/usr/bin/env bash
+
+# Chunking and embedding are optional.
+
+unstructured-ingest \
+ local \
+ --input-path $LOCAL_FILE_INPUT_DIR \
+ --chunking-strategy by_title \
+ --embedding-provider huggingface \
+ --partition-by-api \
+ --api-key $UNSTRUCTURED_API_KEY \
+ --partition-endpoint $UNSTRUCTURED_API_URL \
+ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
+ motherduck \
+ --md-token $MOTHERDUCK_MD_TOKEN \
+ --database $MOTHERDUCK_DATABASE \
+ --db-schema $MOTHERDUCK_DB_SCHEMA \
+ --table $MOTHERDUCK_TABLE
+```
\ No newline at end of file
diff --git a/snippets/destination_connectors/motherduck.v2.py.mdx b/snippets/destination_connectors/motherduck.v2.py.mdx
new file mode 100644
index 00000000..5e657b14
--- /dev/null
+++ b/snippets/destination_connectors/motherduck.v2.py.mdx
@@ -0,0 +1,51 @@
+```python Python Ingest v2
+import os
+
+from unstructured_ingest.v2.pipeline.pipeline import Pipeline
+from unstructured_ingest.v2.interfaces import ProcessorConfig
+
+from unstructured_ingest.v2.processes.connectors.duckdb.motherduck import (
+ MotherDuckAccessConfig,
+ MotherDuckConnectionConfig,
+ MotherDuckUploadStagerConfig,
+ MotherDuckUploaderConfig
+)
+from unstructured_ingest.v2.processes.connectors.local import (
+ LocalIndexerConfig,
+ LocalConnectionConfig,
+ LocalDownloaderConfig
+)
+from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
+from unstructured_ingest.v2.processes.chunker import ChunkerConfig
+from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+
+# Chunking and embedding are optional.
+
+if __name__ == "__main__":
+ Pipeline.from_configs(
+ context=ProcessorConfig(),
+ indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
+ downloader_config=LocalDownloaderConfig(),
+ source_connection_config=LocalConnectionConfig(),
+ partitioner_config=PartitionerConfig(
+ partition_by_api=True,
+ api_key=os.getenv("UNSTRUCTURED_API_KEY"),
+ partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
+ additional_partition_args={
+ "split_pdf_page": True,
+ "split_pdf_allow_failed": True,
+ "split_pdf_concurrency_level": 15
+ }
+ ),
+ chunker_config=ChunkerConfig(chunking_strategy="by_title"),
+ embedder_config=EmbedderConfig(embedding_provider="huggingface"),
+ destination_connection_config=MotherDuckConnectionConfig(
+ access_config=MotherDuckAccessConfig(md_token=os.getenv("MOTHERDUCK_MD_TOKEN")),
+ database=os.getenv("MOTHERDUCK_DATABASE"),
+ db_schema=os.getenv("MOTHERDUCK_DB_SCHEMA"),
+ table=os.getenv("MOTHERDUCK_TABLE")
+ ),
+ stager_config=MotherDuckUploadStagerConfig(),
+ uploader_config=MotherDuckUploaderConfig(batch_size=50)
+ ).run()
+```
\ No newline at end of file
diff --git a/snippets/general-shared-text/duckdb-cli-api.mdx b/snippets/general-shared-text/duckdb-cli-api.mdx
new file mode 100644
index 00000000..7fbf71f9
--- /dev/null
+++ b/snippets/general-shared-text/duckdb-cli-api.mdx
@@ -0,0 +1,15 @@
+The DuckDB connector dependencies:
+
+```bash CLI, Python
+pip install "unstructured-ingest[duckdb]"
+```
+
+import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx';
+
+
+
+The following environment variables:
+
+- `DUCKDB_DATABASE` - The path to the target DuckDB persistent database file with the extension `.db` or `.duckdb`, represented by `--database` (CLI) or `database` (Python).
+- `DUCKDB_DB_SCHEMA` - The name of the target schema in the database, represented by `--db-schema` (CLI) or `db_schema` (Python).
+- `DUCKDB_TABLE` - The name of the target table in the schema, represented by `--table` (CLI) or `table` (Python).
\ No newline at end of file
diff --git a/snippets/general-shared-text/duckdb.mdx b/snippets/general-shared-text/duckdb.mdx
new file mode 100644
index 00000000..daf4620d
--- /dev/null
+++ b/snippets/general-shared-text/duckdb.mdx
@@ -0,0 +1,80 @@
+- A [DuckDB installation](https://duckdb.org/docs/installation).
+- A [persistent database](https://duckdb.org/docs/connect/overview.html#persistent-database), for example by running the
+ [DuckDB CLI](https://duckdb.org/docs/api/cli) command `duckdb <database-file-name>.db` or
+ `duckdb <database-file-name>.duckdb`, replacing `<database-file-name>` with the name of the target file.
+- The path to the target persistent database file.
+- A schema in the target database.
+
+ - [Create a schema](https://duckdb.org/docs/sql/statements/create_schema.html).
+ - You can list available schemas and their parent catalogs by running the following DuckDB CLI command:
+
+ ```sql
+ SELECT * FROM information_schema.schemata;
+ ```
+
+ The DuckDB connector uses the default schema name of `main` if not otherwise specified.
+
+- A table in the target schema.
+
+ - [Create a table](https://duckdb.org/docs/sql/statements/create_table).
+ - You can list available tables in a schema by running the following DuckDB CLI commands, replacing the target catalog and schema names:
+
+ ```sql
+ USE <catalog-name>.<schema-name>;
+ SHOW TABLES;
+ ```
+
+ The DuckDB connector uses the default table name of `elements` if not otherwise specified.
+
+ For maximum compatibility, Unstructured recommends the following table schema:
+
+ ```sql
+ CREATE TABLE elements (
+ id VARCHAR,
+ element_id VARCHAR,
+ text TEXT,
+ embeddings FLOAT[],
+ type VARCHAR,
+ system VARCHAR,
+ layout_width DECIMAL,
+ layout_height DECIMAL,
+ points TEXT,
+ url TEXT,
+ version VARCHAR,
+ date_created INTEGER,
+ date_modified INTEGER,
+ date_processed DOUBLE,
+ permissions_data TEXT,
+ record_locator TEXT,
+ category_depth INTEGER,
+ parent_id VARCHAR,
+ attached_filename VARCHAR,
+ filetype VARCHAR,
+ last_modified TIMESTAMP,
+ file_directory VARCHAR,
+ filename VARCHAR,
+ languages VARCHAR[],
+ page_number VARCHAR,
+ links TEXT,
+ page_name VARCHAR,
+ link_urls VARCHAR[],
+ link_texts VARCHAR[],
+ sent_from VARCHAR[],
+ sent_to VARCHAR[],
+ subject VARCHAR,
+ section VARCHAR,
+ header_footer_type VARCHAR,
+ emphasized_text_contents VARCHAR[],
+ emphasized_text_tags VARCHAR[],
+ text_as_html TEXT,
+ regex_metadata TEXT,
+ detection_class_prob DECIMAL
+ );
+ ```
+
+ You can list the schema of a table by running the following DuckDB CLI commands, replacing the target catalog, schema, and table names:
+
+ ```sql
+ USE <catalog-name>.<schema-name>;
+ DESCRIBE TABLE <table-name>;
+ ```
\ No newline at end of file
diff --git a/snippets/general-shared-text/motherduck-cli-api.mdx b/snippets/general-shared-text/motherduck-cli-api.mdx
new file mode 100644
index 00000000..5f99dd75
--- /dev/null
+++ b/snippets/general-shared-text/motherduck-cli-api.mdx
@@ -0,0 +1,16 @@
+The MotherDuck connector dependencies:
+
+```bash CLI, Python
+pip install "unstructured-ingest[duckdb]"
+```
+
+import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx';
+
+
+
+The following environment variables:
+
+- `MOTHERDUCK_MD_TOKEN` - The access token for the target MotherDuck account, represented by `--md-token` (CLI) or `md_token` (Python).
+- `MOTHERDUCK_DATABASE` - The name of the target database in the account, represented by `--database` (CLI) or `database` (Python).
+- `MOTHERDUCK_DB_SCHEMA` - The name of the target schema in the database, represented by `--db-schema` (CLI) or `db_schema` (Python).
+- `MOTHERDUCK_TABLE` - The name of the target table in the schema, represented by `--table` (CLI) or `table` (Python).
\ No newline at end of file
diff --git a/snippets/general-shared-text/motherduck.mdx b/snippets/general-shared-text/motherduck.mdx
new file mode 100644
index 00000000..ae844314
--- /dev/null
+++ b/snippets/general-shared-text/motherduck.mdx
@@ -0,0 +1,86 @@
+- A [MotherDuck account](https://app.motherduck.com).
+- A [MotherDuck access token](https://motherduck.com/docs/key-tasks/authenticating-and-connecting-to-motherduck/authenticating-to-motherduck/#creating-an-access-token) for the account.
+- A database in the account.
+
+ - [Create a database](https://motherduck.com/docs/sql-reference/motherduck-sql-reference/create-database/).
+ - [List available databases](https://motherduck.com/docs/key-tasks/database-operations/basics-operations/#listing-databases).
+
+ You can run commands to manage MotherDuck databases, schemas, tables, and more in the
+ [MotherDuck UI](https://motherduck.com/docs/getting-started/motherduck-quick-tour/) or for example by connecting to MotherDuck with the
+ [DuckDB CLI](https://motherduck.com/docs/key-tasks/authenticating-and-connecting-to-motherduck/connecting-to-motherduck/).
+
+- A schema in the target database.
+
+ - [Create a schema](https://duckdb.org/docs/sql/statements/create_schema.html).
+ - You can list available schemas and their parent catalogs by running the following command in the MotherDuck UI or the DuckDB CLI:
+
+ ```sql
+ SELECT * FROM information_schema.schemata;
+ ```
+
+ The MotherDuck connector uses the default schema name of `main` if not otherwise specified.
+
+- A table in the target schema.
+
+ - [Create a table](https://duckdb.org/docs/sql/statements/create_table).
+ - You can list available tables in a schema by running the following commands in the MotherDuck UI or the DuckDB CLI, replacing the target catalog and schema names:
+
+ ```sql
+ USE <catalog-name>.<schema-name>;
+ SHOW TABLES;
+ ```
+
+ The MotherDuck connector uses the default table name of `elements` if not otherwise specified.
+
+ For maximum compatibility, Unstructured recommends the following table schema:
+
+ ```sql
+ CREATE TABLE elements (
+ id VARCHAR,
+ element_id VARCHAR,
+ text TEXT,
+ embeddings FLOAT[],
+ type VARCHAR,
+ system VARCHAR,
+ layout_width DECIMAL,
+ layout_height DECIMAL,
+ points TEXT,
+ url TEXT,
+ version VARCHAR,
+ date_created INTEGER,
+ date_modified INTEGER,
+ date_processed DOUBLE,
+ permissions_data TEXT,
+ record_locator TEXT,
+ category_depth INTEGER,
+ parent_id VARCHAR,
+ attached_filename VARCHAR,
+ filetype VARCHAR,
+ last_modified TIMESTAMP,
+ file_directory VARCHAR,
+ filename VARCHAR,
+ languages VARCHAR[],
+ page_number VARCHAR,
+ links TEXT,
+ page_name VARCHAR,
+ link_urls VARCHAR[],
+ link_texts VARCHAR[],
+ sent_from VARCHAR[],
+ sent_to VARCHAR[],
+ subject VARCHAR,
+ section VARCHAR,
+ header_footer_type VARCHAR,
+ emphasized_text_contents VARCHAR[],
+ emphasized_text_tags VARCHAR[],
+ text_as_html TEXT,
+ regex_metadata TEXT,
+ detection_class_prob DECIMAL
+ );
+ ```
+
+ You can list the schema of a table by running the following commands in the MotherDuck UI or the DuckDB CLI, replacing the target catalog, schema, and table names:
+
+ ```sql
+ USE <catalog-name>.<schema-name>;
+ DESCRIBE TABLE <table-name>;
+ ```
\ No newline at end of file