In [19]:
import pyarrow as pa

from haystack import Document, Pipeline
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.embedders.fastembed import FastembedDocumentEmbedder, FastembedTextEmbedder

from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever, LanceDBFTSRetriever


In [24]:
lance_db_path = 'lancedb'
metadata_schema = pa.struct([
    ('author', pa.string()),
    ('title', pa.string()),
    ('url', pa.string())
])
document_store = LanceDBDocumentStore(database=lance_db_path, table_name="documents", metadata_schema=metadata_schema, embedding_dims=384)

pa.field("id", pa.string(), nullable=False),

embedding_retriever = LanceDBEmbeddingRetriever(document_store)

In [25]:
import lancedb
import pyarrow as pa

# connect to LanceDB Cloud
db = lancedb.connect(
    uri=lance_db_path
)

# create an empty table with schema
data = [
    {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
    {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
    {"vector": [10.2, 100.8], "item": "baz", "price": 30.0},
    {"vector": [1.4, 9.5], "item": "fred", "price": 40.0},
]

schema = pa.schema([
    pa.field("vector", pa.list_(pa.float32(), 2)),
    pa.field("item", pa.utf8()),
    pa.field("price", pa.float32()),
])

table_name = "basic_ingestion_example"
table = db.create_table(table_name, schema=schema, mode="overwrite")
# Add data
table.add(data)

[90m[[0m2025-11-21T09:24:26Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /mnt/c/Users/Me/PycharmProjects/Data-Science/haystack/Document Stores/LanceDB/lancedb/basic_ingestion_example.lance, it will be created


AddResult(version=2)

In [21]:
# Source - https://stackoverflow.com/q
# Posted by Hans
# Retrieved 2025-11-21, License - CC BY-SA 4.0
import pandas as pd
df1 = pd.DataFrame(True, index=[0], columns=[0], dtype='bool[pyarrow]')
# Source - https://stackoverflow.com/q
# Posted by Hans
# Retrieved 2025-11-21, License - CC BY-SA 4.0

df2 = pd.DataFrame(True, index=[0], columns=[0], dtype='bool')


In [22]:
# Source - https://stackoverflow.com/q
# Posted by Hans
# Retrieved 2025-11-21, License - CC BY-SA 4.0

df1.values.dtype,df2.values.dtype


(dtype('bool'), dtype('bool'))

In [23]:
pa.schema(
        [
            pa.field("id", pa.string(), nullable=False),
            pa.field("vector", pa.list_(pa.float32(), list_size=3)),
            pa.field("content", pa.string()),
            pa.field("dataframe", pa.string()),  # Using a string, so we can jam the dataframe in as json.
            pa.field("blob", pa.binary())
        ]
)

id: string not null
vector: fixed_size_list<item: float>[3]
  child 0, item: float
content: string
dataframe: string
blob: binary

In [12]:
pa.binary(1)

FixedSizeBinaryType(fixed_size_binary[1])

In [14]:
import lancedb

In [18]:
db = lancedb.connect("play")
schema = pa.schema(
        [
            pa.field("id", pa.string(), nullable=False),
            pa.field("vector", pa.list_(pa.float32(), list_size=300)),
            pa.field("content", pa.string()),
            pa.field("dataframe", pa.string()),  # Using a string, so we can jam the dataframe in as json.
            pa.field("blob", pa.binary()),
        ]
    )
table = db.create_table(name="lol", schema=schema, on_bad_vectors="fill", fill_value=0)
table

ValueError: Table 'lol' already exists

In [17]:
from haystack import Document
d = Document("asa")

print(d.to_dict(flatten=False))

{'id': 'asa', 'content': None, 'blob': None, 'meta': {}, 'score': None, 'embedding': None, 'sparse_embedding': None}


In [None]:
column_data = {}
for field in schema:
    values = []
    for doc in doc_dicts:
        val = doc.get(field.name)
        values.append(val)
    column_data[field.name] = values

new_data = pa.table(column_data, schema=schema)