In [1]:
import chromadb

In [6]:
# Client used by every cell below. Swap in the commented-out PersistentClient
# line to keep data on disk under the "chroma_db" path instead.
chroma_client = chromadb.Client() # ephemeral client
# chroma_client = chromadb.PersistentClient(path="chroma_db") # uses sqlite3 with persistent storage

In [None]:
# Open the "pp_blog" collection, creating it on first use.
# Embedding functions: https://docs.trychroma.com/docs/embeddings/embedding-functions
# Collection configuration: https://docs.trychroma.com/docs/collections/configure
#
# Accepted values for "hnsw:space":
#   cosine (for cosine similarity)
#   l2 or euclidean (for Euclidean distance)
#   ip or inner_product (for inner product)
pp_blog_metadata = {
    "description": "A collection of blog posts from the PocketPandit blog",
    "source": "https://blog.pocketpandit.com",
    "hnsw:space": "cosine",
    "hnsw:construction_ef": 128,  # default is 100
    "hnsw:M": 32,  # default is 16
    "hnsw:search_ef": 128,
}
collection = chroma_client.get_or_create_collection(
    name="pp_blog",
    metadata=pp_blog_metadata,
)

# Reference: other collection-level calls.
# Custom embedding function https://docs.trychroma.com/docs/embeddings/embedding-functions
# collection = chroma_client.create_collection(name="pp_blog", embedding_function=emb_fn)
# collection = chroma_client.get_collection(name="pp_blog", embedding_function=emb_fn)
# chroma_client.delete_collection(name="my_collection")
# collection.peek() # returns the first 5 items in the collection
# collection.count() # returns the number of items in the collection



In [8]:
# Number of items currently stored in the collection.
# NOTE: `collection.database` is the database *name* (a plain string), so
# calling `.count('chroma_db')` on it only counts substring occurrences in that
# name and yields 0 no matter how many items exist. `collection.count()` is
# the call that reports the item count.
collection.count()

0

In [None]:
# add data
# Each record is (id, title, document text); the parallel lists that
# `collection.add` expects are derived from it so the three stay aligned.
fruit_records = [
    ("id1", "Pineapple", "This is a document about pineapple"),
    ("id2", "Oranges", "This is a document about oranges"),
]
collection.add(
    ids=[record_id for record_id, _, _ in fruit_records],
    documents=[text for _, _, text in fruit_records],
    # embeddings=[]
    metadatas=[
        {"id": record_id, "title": title}
        for record_id, title, _ in fruit_records
    ],
)


In [6]:
# update data
# The three lists are positional: entry i of each list describes the same item.
updated_ids = ["id1", "id2"]
updated_documents = [
    "This is a document about pineapple",
    "This is a document about oranges",
]
updated_metadatas = [
    {"id": "id1", "title": "Pineapple"},
    {"id": "id2", "title": "Oranges"},
]
collection.update(
    ids=updated_ids,
    documents=updated_documents,
    # embeddings=[]
    metadatas=updated_metadatas,
)

In [7]:
# upsert data (update if exists, else insert)
# Keyword payload for `collection.upsert`; an "embeddings" entry could be
# supplied here as well.
upsert_payload = {
    "ids": ["id1", "id2"],
    "documents": [
        "This is a document about pineapple",
        "This is a document about oranges",
    ],
    # "embeddings": [],
    "metadatas": [
        {"id": "id1", "title": "Pineapple"},
        {"id": "id2", "title": "Oranges"},
    ],
}
collection.upsert(**upsert_payload)

In [8]:
# delete data
# `ids` and `where` are applied together: of the listed ids, only the item whose
# metadata title is "Pineapple" is removed (the later query output shows "id2"
# still present).
collection.delete(
    where={"title": "Pineapple"},
    ids=["id1", "id2"],
)

In [9]:
# query
results = collection.query(
    n_results=2,  # how many results to return
    query_texts=["document of huwai"],  # Chroma will embed this for you
    # Other inputs and filters that can be supplied:
    # query_embeddings=[]
    # query_uris=[],
    # where={"title": "Oranges"}, # case sensitive
    # where_document={"$contains": "orange"}, # ["$contains", "$not_contains", "$and", "$or"]
)
print(results)


Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


{'ids': [['id2']], 'embeddings': None, 'documents': [['This is a document about oranges']], 'uris': None, 'data': None, 'metadatas': [[{'id': 'id2', 'title': 'Oranges'}]], 'distances': [[0.6894368746509796]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [10]:
# get
# Metadata filter. Alternative forms:
#   {"title": {"$in": ["Oranges", "Pineapple"]}}
#   {"title": {"$eq": "Oranges"}}  # eq,ne,gt,gte,lt,lte # can combine multiple filters
# https://docs.trychroma.com/docs/querying-collections/metadata-filtering
title_filter = {"title": "Oranges"}
# Filter on the document text itself.
text_filter = {"$contains": "orange"}

result = collection.get(
    ids=["id1", "id2"],
    where=title_filter,
    where_document=text_filter,
    limit=2,
    # offset=1,
    include=["metadatas", "documents"],
)
print(result)

{'ids': ['id2'], 'embeddings': None, 'documents': ['This is a document about oranges'], 'uris': None, 'data': None, 'metadatas': [{'id': 'id2', 'title': 'Oranges'}], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [13]:
# full text search
# Combine the similarity query with a metadata filter and a document-text filter.
metadata_filter = {"title": "Oranges"}
document_filter = {"$contains": "is a"}  # to search in the document

results = collection.query(
    query_texts=["document of huwai"],
    where=metadata_filter,
    where_document=document_filter,
    n_results=2,
)
print(results)

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


{'ids': [['id2']], 'embeddings': None, 'documents': [['This is a document about oranges']], 'uris': None, 'data': None, 'metadatas': [[{'id': 'id2', 'title': 'Oranges'}]], 'distances': [[0.6894368746509796]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
