In [1]:
import os
from dotenv import load_dotenv
import chromadb
chroma_client = chromadb.Client()

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

if OPENAI_API_KEY is None:
    raise Exception("API key not found in environment variables")

In [2]:
similar1="The food was delicious and the waiter..."
similar2="I liked the food but the..."
different="Last night I read a book"
between="I liked the book I read"

In [3]:
from chromadb.utils import embedding_functions

embedder = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-ada-002",
)

emb_sim1 = embedder(similar1)[0]
emb_sim2= embedder(similar2)[0]

print(f" [{emb_sim1[0]},{emb_sim1[1]}...{emb_sim1[-1]}] {len(emb_sim1)}")

 [-0.003936287481337786,-0.014514213427901268...-0.015028230845928192] 1536


In [4]:
collection = chroma_client.get_or_create_collection(name="test", embedding_function=embedder)

In [5]:
collection.add( # embedding external
    embeddings=[emb_sim1],
    documents=[similar1],
    metadatas=[{"sentiment": "positive", "type": "similar", "topic": "food"}],
    ids=["id1"]
) 
collection.add( # store documents elsewhere
    embeddings=[emb_sim2],
    metadatas=[{"sentiment": "mixed", "type": "similar","topic": "food"}],
    ids=["id2"]
) 
collection.add( # embedding by defined function
    documents=[different, between],
    metadatas=[{"sentiment": "neutral", "type": "different","topic": "read"}, {"sentiment": "positive", "type": "between","topic": "read"}],
    ids=["id3","id4"]
) 

In [6]:
# print(collection.peek()) returns a list of the first 10 items in the collection
print(collection.count()) # returns the number of items in the collection
# collection.modify(name="new_name") Rename the collection

4


In [7]:
positive="I really like going there the other day"
negative="Man it just wasn't doing it for me it all"

In [8]:
collection.query(
    query_texts=[positive],
    where={"sentiment": "positive"},
    n_results=10,
)

Number of requested results 10 is greater than number of elements in index 4, updating n_results = 4


{'ids': [['id4', 'id1']],
 'embeddings': None,
 'documents': [['I liked the book I read',
   'The food was delicious and the waiter...']],
 'metadatas': [[{'sentiment': 'positive', 'type': 'between', 'topic': 'read'},
   {'sentiment': 'positive', 'type': 'similar', 'topic': 'food'}]],
 'distances': [[0.34555989503860474, 0.5193383693695068]]}

In [9]:
collection.get(
    ids=["id1", "id2", "id3"],
    where={"sentiment": "positive"}
)

{'ids': ['id1'],
 'embeddings': None,
 'documents': ['The food was delicious and the waiter...'],
 'metadatas': [{'sentiment': 'positive', 'type': 'similar', 'topic': 'food'}]}

In [10]:
collection.query(
    query_embeddings=[embedder(negative)[0]],
    where_document={"$contains":"I"}
)

Number of requested results 10 is greater than number of elements in index 4, updating n_results = 4


{'ids': [['id4', 'id3']],
 'embeddings': None,
 'documents': [['I liked the book I read', 'Last night I read a book']],
 'metadatas': [[{'sentiment': 'positive', 'type': 'between', 'topic': 'read'},
   {'sentiment': 'neutral', 'type': 'different', 'topic': 'read'}]],
 'distances': [[0.49588629603385925, 0.5017977356910706]]}

In [11]:
collection.upsert( # updating or adding based on id
    ids=["id1", "id5"],
    metadatas=[{"sentiment": "negative", "type": "similar", "topic": "food"}, {"sentiment": "negative", "type": "different", "topic": "sky"}],
    documents=["Man that food was awful", "The sky is quite blue today isn't it"],
)
collection.peek()['ids']

['id2', 'id3', 'id4', 'id1', 'id5']

In [12]:
collection.delete(
    ids=["id1", "id2", "id3", "id4"],
    where={"sentiment": "positive"}
)
collection.peek()['ids']

['id2', 'id3', 'id1', 'id5']

In [13]:
chroma_client.delete_collection(name="test") # Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible