# Sample use of a chroma db


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

from be.data.utils import get_resource
from loguru import logger as lg

### Minimal

https://docs.trychroma.com/getting-started


In [None]:
import chromadb

In [None]:
# Look up the "chroma_fol" resource through the project's get_resource helper.
# NOTE(review): presumably a folder path (it is str()-ed when handed to the
# client below) — confirm against be.data.utils.
chroma_fol = get_resource("chroma_fol")

In [None]:
# Open a Chroma client whose database lives under `chroma_fol`; the path is
# passed as a plain string.
client = chromadb.PersistentClient(path=str(chroma_fol))

In [None]:
# client.heartbeat() # returns a nanosecond heartbeat. Useful for making sure the client remains connected.
# client.reset() # Empties and completely resets the database. ⚠️ This is destructive and not reversible.

In [None]:
# The client is persistent, so on any run after the first "my_collection"
# already exists and create_collection would raise. get_or_create_collection
# keeps this cell re-runnable and returns the same collection either way.
collection = client.get_or_create_collection(name="my_collection")

In [None]:
# Insert two text documents under explicit IDs. Only raw text and ids are
# passed here; no embeddings or metadata are supplied.
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges",
    ],
    ids=["id1", "id2"],
)

In [None]:
# Run one free-text query against the collection, keep the raw result in
# `results`, and print it as-is.
results = collection.query(
    query_texts=[
        "This is a query document about hawaii"
    ],  # Chroma will embed this for you
    n_results=2,  # how many results to return
)
print(results)

In [None]:
# A notebook cell only displays its last expression, so peek() is printed
# explicitly; as a bare statement its result was silently discarded.
print(collection.peek())  # the first 10 items in the collection
collection.count()  # returns the number of items in the collection
# collection.modify(name="new_name") # Rename the collection

### Guides

https://docs.trychroma.com/guides


#### Adding data

https://docs.trychroma.com/guides#adding-data-to-a-collection


In [None]:
# add documents with metadata
# "id1" and "id2" were already inserted by the minimal example earlier in this
# notebook, and add() does not overwrite records whose IDs already exist.
# upsert() inserts new IDs and updates existing ones, so the documents and
# metadata below are actually stored.
collection.upsert(
    documents=["lorem ipsum...", "doc2", "doc3"],
    metadatas=[
        {"chapter": "3", "verse": "16"},
        {"chapter": "3", "verse": "5"},
        {"chapter": "29", "verse": "11"},
    ],
    ids=["id1", "id2", "id3"],
)

# if you supply the embeddings, Chroma will not calculate them
# embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2], ...],

# actually documents are optional, you can store IDs and embeddings only

#### Querying

https://docs.trychroma.com/guides#querying-a-collection


In [None]:
# query using embeddings
# NOTE(review): the 3-dimensional vectors are placeholders. The collection was
# filled from raw documents earlier in this notebook, so its embedding
# dimension presumably differs and this call would be rejected — replace with
# real embeddings before running.
# NOTE(review): "metadata_field" and "search_string" are placeholders as well;
# the metadata stored in this notebook only uses the keys "chapter" and "verse".
collection.query(
    query_embeddings=[[11.1, 12.1, 13.1], [1.1, 2.3, 3.2]],
    n_results=10,
    where={"metadata_field": "is_equal_to_this"},
    where_document={"$contains": "search_string"},
)

In [None]:
# query using text, they will be embedded on the spot
# Two query texts are sent in one call, each with the same filters.
# NOTE(review): "metadata_field" and "search_string" are placeholders; the
# metadata stored in this notebook only uses the keys "chapter" and "verse",
# so this presumably matches nothing until the filters are adapted.
collection.query(
    query_texts=["doc10", "thus spake zarathustra"],
    n_results=10,
    where={"metadata_field": "is_equal_to_this"},
    where_document={"$contains": "search_string"},
)

In [None]:
# fetch records by ID with get(), combined with a metadata filter
# NOTE(review): none of the metadata stored in this notebook has a "style"
# key, so this presumably returns no records until the filter is adapted.
collection.get(
    ids=["id1", "id2", "id3"],
    where={"style": "style1"},
)

# .get also supports the where and where_document filters. If no ids are supplied, it will return all items in the collection that match the where and where_document filters.

In [None]:
# Choosing which data is returned

# When using get or query you can use the include parameter to specify which
# data you want returned - any of embeddings, documents, metadatas, and for
# query, distances.
# By default, Chroma will return the documents, metadatas and in the case of
# query, the distances of the results.
# Embeddings are excluded by default for performance; the ids are always returned.
# You can specify which of these you want returned by passing a list of
# field names to the `include` parameter of the query or get method.

# A notebook cell only displays its last expression, so the get() result is
# printed explicitly; as a bare statement it was silently discarded.
print(collection.get(include=["documents"]))

# NOTE(review): the 3-dimensional vectors are placeholders and presumably do
# not match the collection's embedding dimension — replace with real
# embeddings before running.
collection.query(
    query_embeddings=[[11.1, 12.1, 13.1], [1.1, 2.3, 3.2]],
    include=["documents"],
)

#### Using Where filters

https://docs.trychroma.com/guides#using-where-filters


### Hashed


In [None]:
import hashlib
from typing import Any

from chromadb.api.types import ID, Document, OneOrMany


class HashedCollection:
    """Wrap a collection so documents can be stored under content-derived IDs.

    ``add`` fills in missing IDs with the MD5 hex digest of each document's
    UTF-8 text. Any attribute not defined on the wrapper is looked up on the
    wrapped collection.
    """

    def __init__(self, collection) -> None:
        """Keep a reference to the collection that calls are delegated to."""
        self.collection = collection

    def generate_id(self, document) -> ID:
        """Return the MD5 hex digest of ``document`` encoded as UTF-8.

        The digest is only a content fingerprint, so it is flagged as not used
        for security; the resulting hex string is unchanged by that flag.
        """
        return hashlib.md5(
            document.encode("utf-8"), usedforsecurity=False
        ).hexdigest()

    def add(
        self,
        documents: OneOrMany[Document],
        ids: OneOrMany[ID] | None = None,
        *args,
        **kwargs,
    ) -> None:
        """Add documents to the wrapped collection, hashing IDs when absent.

        Args:
            documents: A single document string or a list of document strings.
            ids: A single ID or a list of IDs, one per document. When ``None``,
                each ID is derived from its document with ``generate_id``.
            *args: Extra positional arguments forwarded to ``collection.add``.
            **kwargs: Extra keyword arguments forwarded to ``collection.add``.

        Raises:
            ValueError: If the number of ids differs from the number of
                documents.
        """
        # A bare string is one document. Without this normalisation it would
        # be iterated (and measured) character by character below.
        if isinstance(documents, str):
            documents = [documents]
        if isinstance(ids, str):
            ids = [ids]

        # generate IDs if they are not provided
        if ids is None:
            ids = [self.generate_id(doc) for doc in documents]
        elif len(ids) != len(documents):
            raise ValueError("The number of ids must match the number of documents")

        # Same argument binding as before; the positional unpacking is simply
        # written ahead of the keywords, where it is actually applied.
        # NOTE(review): prefer keywords for the extras — positionals may collide
        # with the wrapped add()'s own leading parameters; confirm its signature.
        self.collection.add(*args, documents=documents, ids=ids, **kwargs)

    def __getattr__(self, name) -> Any:
        """Delegate attributes missing on the wrapper to the wrapped collection."""
        # __getattr__ only runs after normal lookup failed. If "collection"
        # itself is missing (an instance created without __init__, e.g. by
        # copy or pickle), reading self.collection here would re-enter this
        # method until RecursionError, so fail with a normal AttributeError.
        if name == "collection":
            raise AttributeError(name)
        return getattr(self.collection, name)

    # def get_doc