# Chroma DB

## Initialize a connection to Chroma DB

First, you need to pull and run a ChromaDB Docker image:
```
docker pull ...
```

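Then we define the HTTP client, a small wrapper that adapts a plain function to Chroma's embedding-function interface, and a helper that gets or creates a collection: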


In [1]:
# import chromadb
from chromadb import HttpClient, Documents, EmbeddingFunction, Embeddings
import uuid
import time
import numpy as np

from utils.embedder import embedd_sequences

# Shared client used by every helper and benchmark cell below; it targets a
# Chroma server on localhost:8000 (the Docker container started beforehand).
chroma_client = HttpClient(host="localhost", port=8000)
# Call the server's heartbeat endpoint right away so connection problems show
# up here rather than in a later cell.
# NOTE(review): assumed to raise when the server is unreachable - confirm.
chroma_client.heartbeat()


class MyEmbeddingFunction(EmbeddingFunction):
    """Adapter that exposes a plain callable through the ``EmbeddingFunction`` interface.

    The callable is stored on the instance and invoked with the documents
    whenever the instance itself is called.
    """

    # Wrapped callable; None at class level until ``__init__`` sets it.
    emb_func = None

    def __init__(self, func):
        """Store ``func``, the callable that computes embeddings for documents."""
        self.emb_func = func

    def __call__(self, input: Documents) -> Embeddings:
        """Pass ``input`` to the wrapped callable and return what it produces."""
        embedder = self.emb_func
        return embedder(input)


def get_collection(collection_name, embedding_func=embedd_sequences, metadata=None):
    """Fetch the named collection from ``chroma_client``, creating it if needed.

    Args:
        collection_name: Name of the collection to get or create.
        embedding_func: Callable used to embed documents. When truthy it is
            wrapped in ``MyEmbeddingFunction`` and handed to the client; when
            falsy no ``embedding_function`` argument is passed at all, so the
            client's own default applies.
        metadata: Optional collection metadata forwarded unchanged.

    Returns:
        The collection object returned by ``get_or_create_collection``.
    """
    request_kwargs = {"name": collection_name, "metadata": metadata}

    # Only mention embedding_function when a callable was actually supplied.
    if embedding_func:
        request_kwargs["embedding_function"] = MyEmbeddingFunction(embedding_func)

    return chroma_client.get_or_create_collection(**request_kwargs)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _generate_uuid_ids(n):
    """Return a list of ``n`` random UUID4 values rendered as strings."""
    random_uuids = (uuid.uuid4() for _ in range(n))
    return [str(one_uuid) for one_uuid in random_uuids]


def insert_documents_chroma(collection, documents=None, embeddings=None):
    """Upsert documents or precomputed embeddings under freshly generated IDs.

    Exactly one of ``documents`` and ``embeddings`` must be given.

    Args:
        collection: Collection object exposing ``upsert``.
        documents: Sequence of documents to store, or None.
        embeddings: Sequence of embedding vectors to store, or None.

    Raises:
        ValueError: If both or neither of ``documents`` / ``embeddings`` are
            provided.
    """
    # Identity checks instead of ``== None`` / truthiness: they stay correct
    # for numpy arrays (where ``==`` is element-wise) and for empty sequences.
    has_documents = documents is not None
    has_embeddings = embeddings is not None

    if has_documents == has_embeddings:
        # Raising a bare string is itself a TypeError in Python 3, so use a
        # real exception type to get the message to the caller.
        raise ValueError("Either documents or embeddings must be provided, but not both.")

    item_count = len(documents) if has_documents else len(embeddings)
    ids = _generate_uuid_ids(item_count)

    collection.upsert(documents=documents, embeddings=embeddings, ids=ids)

In [3]:
def get_documents_chroma(collection, query_texts=None, query_embeddings=None, top_k=1):
    """Query a collection by text or by precomputed embedding.

    Exactly one of ``query_texts`` and ``query_embeddings`` must be given.

    Args:
        collection: Collection object exposing ``query``.
        query_texts: Text(s) to search with, or None.
        query_embeddings: Embedding vector(s) to search with, or None.
        top_k: Number of results requested per query (passed as ``n_results``).

    Returns:
        Whatever ``collection.query`` returns.

    Raises:
        ValueError: If both or neither of ``query_texts`` /
            ``query_embeddings`` are provided.
    """
    # ``is None`` rather than ``== None`` so numpy arrays are handled correctly.
    if (query_texts is None) == (query_embeddings is None):
        # Raising a bare string is itself a TypeError in Python 3, so use a
        # real exception type to get the message to the caller.
        raise ValueError("Either query_texts or query_embeddings must be provided, but not both.")

    return collection.query(
        query_texts=query_texts,
        query_embeddings=query_embeddings,
        n_results=top_k,
    )

In [12]:
# Benchmark: HNSW settings tuned for better accuracy
collection = get_collection(
    "accurate_search",
    metadata={
        "hnsw:space": "cosine",  # or "l2", "ip" depending on your needs
        "hnsw:construction_ef": 200,  # More accurate but slower to build
        "hnsw:search_ef": 100,  # More accurate search but slower
    },
)

try:
    # Create some random vectors
    num_vectors = 99999
    vectors = np.random.random((num_vectors, 128)).astype(np.float32)  # Ensure float32 type
    # np.split requires num_vectors to be divisible by 3.
    # NOTE(review): batching presumably keeps each upsert under the server's
    # maximum batch size - confirm.
    arr1, arr2, arr3 = np.split(vectors, 3)

    # Insert vectors
    t1 = time.perf_counter()
    for arr in [arr1, arr2, arr3]:
        insert_documents_chroma(collection, embeddings=arr.tolist())
    print(f"Inserted {num_vectors} vectors in {time.perf_counter() - t1:.2f}s")

    # Perform a search; the query is built before the timer starts so only
    # the ten round trips are measured.
    query = np.random.random(128).astype(np.float32).tolist()  # Ensure float32 type
    t1 = time.perf_counter()
    for _ in range(10):
        results = get_documents_chroma(collection, query_embeddings=query, top_k=5)
    print(f"Search time: {time.perf_counter() - t1:.2f}s")

    print("\nSearch results:")
    # One entry per query: the list of matched ids and the list of distances.
    for ids, distances in zip(results["ids"], results["distances"]):
        print(f"ID: {ids}, Distance: {distances}")
finally:
    # Always drop the benchmark collection so a failed run cannot leave data
    # behind that a later run would silently upsert into.
    chroma_client.delete_collection(name="accurate_search")


Inserted 99999 vectors in 147.54s
Search time: 8.69s

Search results:
ID: ['54dbc9b7-0f98-44eb-9e29-3be522f8d506', 'b04c54b9-69cf-45f5-9c95-dfbf5769b039', 'f4ab6953-c865-478c-83a8-39095c3fb49d', '662e3482-853b-47d0-8f2a-b758d2751cb8', '3187c38c-2c2c-4c4d-b30c-30ee76669897'], Distance: [0.15586870908737183, 0.1572704315185547, 0.15884286165237427, 0.16087007522583008, 0.1609046459197998]


In [14]:
# Benchmark: HNSW settings tuned for faster build and search (less accurate)
collection = get_collection(
    "fast_search",
    metadata={
        "hnsw:space": "cosine",
        "hnsw:search_ef": 20,  # Faster search but less accurate
        "hnsw:construction_ef": 40,  # Faster to build but less accurate
    },
)

try:
    # Create some random vectors
    num_vectors = 99999
    vectors = np.random.random((num_vectors, 128)).astype(np.float32)  # Ensure float32 type
    # np.split requires num_vectors to be divisible by 3.
    # NOTE(review): batching presumably keeps each upsert under the server's
    # maximum batch size - confirm.
    arr1, arr2, arr3 = np.split(vectors, 3)

    # Insert vectors
    t1 = time.perf_counter()
    for arr in [arr1, arr2, arr3]:
        insert_documents_chroma(collection, embeddings=arr.tolist())
    print(f"Inserted {num_vectors} vectors in {time.perf_counter() - t1:.2f}s")

    # Perform a search; the query is built before the timer starts so only
    # the ten round trips are measured.
    query = np.random.random(128).astype(np.float32).tolist()  # Ensure float32 type
    t1 = time.perf_counter()
    for _ in range(10):
        results = get_documents_chroma(collection, query_embeddings=query, top_k=5)
    print(f"Search time: {time.perf_counter() - t1:.2f}s")

    print("\nSearch results:")
    # One entry per query: the list of matched ids and the list of distances.
    for ids, distances in zip(results["ids"], results["distances"]):
        print(f"ID: {ids}, Distance: {distances}")
finally:
    # Always drop the benchmark collection so a failed run cannot leave data
    # behind that a later run would silently upsert into.
    chroma_client.delete_collection(name="fast_search")

Inserted 99999 vectors in 113.03s
Search time: 7.71s

Search results:
ID: ['9e10b4cf-6c86-45dd-ae87-b7a4be0143b8', 'b17781b6-4111-4aea-b40e-c0fbc9187c82', 'aa70b542-00fb-48f8-ad7a-07b7f27144ad', '79d4f744-c3f3-42e7-8319-62b24d4834af', 'e3176842-48ca-40f9-bede-0c233cdbb239'], Distance: [0.162758469581604, 0.1653081178665161, 0.1695802423607744, 0.17163366079330444, 0.17198646068572998]
