In [11]:
import os
from dotenv import load_dotenv
import chromadb
chroma_client = chromadb.Client()

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

if OPENAI_API_KEY is None:
    raise Exception("API key not found in environment variables")

In [8]:
similar1="The food was delicious and the waiter..."
similar2="I liked the food but the..."
different="Last night I read a book"
between="I liked the book I read"

In [20]:
from chromadb.utils import embedding_functions

embedder = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name="text-embedding-ada-002",
)

emb_sim1 = embedder(similar1)[0]
emb_sim2= embedder(similar2)[0]

emb_sim1

[-0.004019643645733595,
 -0.014482887461781502,
 0.013468679040670395,
 -0.005199505016207695,
 -0.028451908379793167,
 0.008221844211220741,
 -0.02050052024424076,
 -0.016646530479192734,
 -0.005787745583802462,
 -0.01807994395494461,
 0.014415273442864418,
 0.029425548389554024,
 0.022988710552453995,
 -0.009283381514251232,
 -0.015767550095915794,
 -0.0016945721581578255,
 0.048465609550476074,
 0.006048059090971947,
 0.017714830115437508,
 -0.031264644116163254,
 -0.009236052632331848,
 0.002461144234985113,
 -0.004347570706158876,
 -0.007904059253633022,
 0.001272830762900412,
 0.002829639706760645,
 0.013651236891746521,
 -0.03277919441461563,
 0.01729562319815159,
 -0.016173234209418297,
 0.015645844861865044,
 -0.013272599317133427,
 -0.025328148156404495,
 -0.03510511294007301,
 -0.02923622913658619,
 -0.022583026438951492,
 -0.01542948093265295,
 -0.0071805911138653755,
 0.002865137066692114,
 0.0016649911412969232,
 0.01749846525490284,
 0.032508738338947296,
 -0.00061570864

In [24]:
collection = chroma_client.get_or_create_collection(name="test", embedding_function=embedder)

In [25]:
collection.add( # embedding external
    embeddings=[emb_sim1],
    documents=[similar1],
    metadatas=[{"sentiment": "positive", "type": "similar", "topic": "food"}],
    ids=["id1"]
) 
collection.add( # store documents elsewhere
    embeddings=[emb_sim2],
    metadatas=[{"sentiment": "mixed", "type": "similar","topic": "food"}],
    ids=["id2"]
) 
collection.add( # embedding by defined function
    documents=[different, between],
    metadatas=[{"sentiment": "neutral", "type": "different","topic": "read"}, {"sentiment": "positive", "type": "between","topic": "read"}],
    ids=["id3","id4"]
) 

In [26]:
print(collection.peek()) # returns a list of the first 10 items in the collection
print(collection.count()) # returns the number of items in the collection
# collection.modify(name="new_name") Rename the collection

{'ids': ['id1', 'id2', 'id3', 'id4'], 'embeddings': [[-0.004019643645733595, -0.014482887461781502, 0.013468679040670395, -0.005199505016207695, -0.028451908379793167, 0.008221844211220741, -0.02050052024424076, -0.016646530479192734, -0.005787745583802462, -0.01807994395494461, 0.014415273442864418, 0.029425548389554024, 0.022988710552453995, -0.009283381514251232, -0.015767550095915794, -0.0016945721581578255, 0.048465609550476074, 0.006048059090971947, 0.017714830115437508, -0.031264644116163254, -0.009236052632331848, 0.002461144234985113, -0.004347570706158876, -0.007904059253633022, 0.001272830762900412, 0.002829639706760645, 0.013651236891746521, -0.03277919441461563, 0.01729562319815159, -0.016173234209418297, 0.015645844861865044, -0.013272599317133427, -0.025328148156404495, -0.03510511294007301, -0.02923622913658619, -0.022583026438951492, -0.01542948093265295, -0.0071805911138653755, 0.002865137066692114, 0.0016649911412969232, 0.01749846525490284, 0.032508738338947296, -0.

In [29]:
positive="I really like going there the other day"
negative="Man it just wasn't doing it for me it all"

In [33]:
collection.query(
    query_texts=[positive],
    where={"sentiment": "positive"},
    n_results=10,
)

Number of requested results 10 is greater than number of elements in index 4, updating n_results = 4


{'ids': [['id4', 'id1']],
 'embeddings': None,
 'documents': [['I liked the book I read',
   'The food was delicious and the waiter...']],
 'metadatas': [[{'sentiment': 'positive', 'type': 'between', 'topic': 'read'},
   {'sentiment': 'positive', 'type': 'similar', 'topic': 'food'}]],
 'distances': [[0.34555989503860474, 0.5198448896408081]]}

In [35]:
collection.get(
    ids=["id1", "id2", "id3"],
    where={"sentiment": "positive"}
)

{'ids': ['id1'],
 'embeddings': None,
 'documents': ['The food was delicious and the waiter...'],
 'metadatas': [{'sentiment': 'positive', 'type': 'similar', 'topic': 'food'}]}

In [34]:
collection.query(
    query_embeddings=[embedder(negative)[0]],
    where_document={"$contains":"I"}
)

Number of requested results 10 is greater than number of elements in index 4, updating n_results = 4


{'ids': [['id4', 'id3']],
 'embeddings': None,
 'documents': [['I liked the book I read', 'Last night I read a book']],
 'metadatas': [[{'sentiment': 'positive', 'type': 'between', 'topic': 'read'},
   {'sentiment': 'neutral', 'type': 'different', 'topic': 'read'}]],
 'distances': [[0.49588629603385925, 0.5017977356910706]]}

In [37]:
collection.upsert( # updating or adding based on id
    ids=["id1", "id5"],
    metadatas=[{"sentiment": "negative", "type": "similar", "topic": "food"}, {"sentiment": "negative", "type": "different", "topic": "sky"}],
    documents=["Man that food was awful", "The sky is quite blue today isn't it"],
)
collection.peek()

{'ids': ['id2', 'id3', 'id4', 'id1', 'id5'],
 'embeddings': [[-0.013171016238629818,
   -0.033772554248571396,
   0.007744781207293272,
   -0.014609632082283497,
   -0.020825011655688286,
   0.019386395812034607,
   -0.014372190460562706,
   -0.021160222589969635,
   0.012968492694199085,
   -0.005653201136738062,
   0.01719355396926403,
   0.01769637130200863,
   0.0158387403935194,
   0.005244662519544363,
   -0.010573127306997776,
   -0.000684390019159764,
   0.02891198731958866,
   0.010824535973370075,
   0.012758985161781311,
   -0.012940558604896069,
   -0.012521544471383095,
   0.0015800331020727754,
   -0.011201648972928524,
   -0.01596444472670555,
   -0.009553526528179646,
   0.00933703500777483,
   0.018785808235406876,
   -0.014888974837958813,
   0.009728115051984787,
   -0.0319288894534111,
   0.042124904692173004,
   -0.015196251682937145,
   -0.008820250630378723,
   -0.021020550280809402,
   -0.019749540835618973,
   -0.011041026562452316,
   -0.009399887174367905,
  

In [38]:
collection.delete(
    ids=["id1", "id2", "id3", "id4"],
    where={"sentiment": "positive"}
)
collection.peek()

{'ids': ['id2', 'id3', 'id1', 'id5'],
 'embeddings': [[-0.013171016238629818,
   -0.033772554248571396,
   0.007744781207293272,
   -0.014609632082283497,
   -0.020825011655688286,
   0.019386395812034607,
   -0.014372190460562706,
   -0.021160222589969635,
   0.012968492694199085,
   -0.005653201136738062,
   0.01719355396926403,
   0.01769637130200863,
   0.0158387403935194,
   0.005244662519544363,
   -0.010573127306997776,
   -0.000684390019159764,
   0.02891198731958866,
   0.010824535973370075,
   0.012758985161781311,
   -0.012940558604896069,
   -0.012521544471383095,
   0.0015800331020727754,
   -0.011201648972928524,
   -0.01596444472670555,
   -0.009553526528179646,
   0.00933703500777483,
   0.018785808235406876,
   -0.014888974837958813,
   0.009728115051984787,
   -0.0319288894534111,
   0.042124904692173004,
   -0.015196251682937145,
   -0.008820250630378723,
   -0.021020550280809402,
   -0.019749540835618973,
   -0.011041026562452316,
   -0.009399887174367905,
   -0.021

In [23]:
chroma_client.delete_collection(name="test") # Delete a collection and all associated embeddings, documents, and metadata. ⚠️ This is destructive and not reversible