In [None]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_chroma import Chroma

In [None]:
# Embedding model shared by every Chroma vector store created in this notebook.
# NOTE(review): the variable name is misspelt ("embeddigs") but is referenced
# under this exact spelling in the cells below - rename all uses together.
embeddigs = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Variant 1: a LangChain Chroma store for the "example" collection.
# Nothing here points the store at a location on disk (compare the
# persist_directory variant in the next cell).
vector_store = Chroma(
    embedding_function=embeddigs,
    collection_name="example",
)

In [15]:
# Variant 2: the same "example" collection, now bound to the ./db directory
# through persist_directory. This rebinds `vector_store`, replacing the
# instance created in the previous cell.
vector_store = Chroma(
    collection_name="example",
    embedding_function=embeddigs,
    persist_directory="./db",
)

In [None]:
# Chroma clients: talk to chromadb directly instead of going through the
# LangChain wrapper.
# This client is created without any storage path (compare PersistentClient in
# the next cell).
# NOTE(review): `client` is reassigned in the next cell, so when the notebook
# runs top to bottom this instance is discarded right away.
import chromadb
client = chromadb.Client()

In [None]:
# Rebind `client` to a client backed by the ./db directory - the same path the
# LangChain-managed stores in this notebook receive as persist_directory.
client = chromadb.PersistentClient(path="./db")

In [None]:
# Obtain the "example" collection from the client (created when missing, as
# the method name indicates) and show its repr as the cell output.
collection = client.get_or_create_collection('example')
collection

Collection(name=example)

In [None]:
# Insert three toy documents through the raw chromadb collection API.
# No embeddings are passed here; the recorded output of this cell shows chromadb
# fetching its ONNX all-MiniLM-L6-v2 model during the call, i.e. these texts are
# embedded by Chroma's default model rather than by `embeddigs`.
# NOTE(review): a later cell wraps this same "example" collection with the
# OpenAI embedding function. Vectors from two different models generally do not
# share a dimension - confirm that mixing them in one collection is intended.
collection.add(ids=['1', '2', '3'], documents=['a', 'b', 'c'])

C:\Users\malik\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:31<00:00, 2.68MiB/s]


In [None]:
# Wrap the existing chromadb client in a LangChain vector store so the
# "example" collection can be used through the LangChain API.
# The storage location is determined by `client` itself (a PersistentClient on
# ./db), so no persist_directory is passed: supplying both suggests two sources
# of truth for where the data lives.
vector_store_from_client = Chroma(
    client=client,
    collection_name="example",
    embedding_function=embeddigs,
)

In [18]:
# Manage Vector Store.
from uuid import uuid4
from langchain.schema.document import Document


def _make_document(doc_id: int, text: str, source: str) -> Document:
    """Build a Document with the given id, page content and a ``source`` metadata tag."""
    return Document(page_content=text, metadata={"source": source}, id=doc_id)


# Store for the "new" collection under ./db; rebinds `vector_store`.
vector_store = Chroma(
    collection_name="new",
    embedding_function=embeddigs,
    persist_directory="./db",
)

# Sample corpus: tweets, news items and website snippets.
document_1 = _make_document(
    1, "I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"
)
document_2 = _make_document(
    2, "The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"
)
document_3 = _make_document(
    3, "Building an exciting new project with LangChain - come check it out!", "tweet"
)
document_4 = _make_document(
    4, "Robbers broke into the city bank and stole $1 million in cash.", "news"
)
document_5 = _make_document(
    5, "Wow! That was an amazing movie. I can't wait to see it again.", "tweet"
)
document_6 = _make_document(
    6, "Is the new iPhone worth the price? Read this review to find out.", "website"
)
document_7 = _make_document(
    7, "The top 10 soccer players in the world right now.", "website"
)
document_8 = _make_document(
    8, "LangGraph is the best framework for building stateful, agentic applications!", "tweet"
)
document_9 = _make_document(
    9, "The stock market is down 500 points today due to fears of a recession.", "news"
)
document_10 = _make_document(
    10, "I have a bad feeling I am going to get deleted :(", "tweet"
)

documents = [
    document_1, document_2, document_3, document_4, document_5,
    document_6, document_7, document_8, document_9, document_10,
]

# One fresh random UUID per document; these are the ids handed to the store
# and the ones the later update/delete cells index into.
# NOTE(review): the recorded query outputs show the UUIDs (not 1..10) as the
# stored Document ids, and every re-run of this cell generates new UUIDs -
# confirm that re-adding the corpus to the persisted collection is acceptable.
uuids = [str(uuid4()) for _ in documents]
vector_store.add_documents(documents, ids=uuids)

['bc6d7c57-e088-4ac1-8998-8339ab95b170',
 '47459fe8-bac5-40b5-aded-8365affa4c0b',
 'c36014de-61fd-4b29-a8c1-f020f3be85c3',
 '0131f263-7b69-40e6-a20c-aecbdc940d21',
 'a3948250-6bf9-4fa2-8a20-87284f61c63a',
 '5895976e-d4ed-49c7-88cb-d8272e9d6edd',
 'a2939ad4-1109-4fa7-996a-eaaf9012bb64',
 '1158a33c-34e8-4113-9e10-e7a299313673',
 '5211f5c9-ca6e-4a0c-abe6-9ce2c67e63f6',
 '1cf4f613-eb7b-4991-bd3e-358a3821c70d']

In [20]:
# Replacement content for the first two stored documents
# (scrambled -> fried eggs, cloudy/62 -> sunny/82).
updated_document_1 = Document(
    id=1,
    metadata={"source": "tweet"},
    page_content="I had chocolate chip pancakes and fried eggs for breakfast this morning.",
)

updated_document_2 = Document(
    id=2,
    metadata={"source": "news"},
    page_content="The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees.",
)

# Single-document API: the target is addressed by the UUID used at insert time.
vector_store.update_document(
    document=updated_document_1,
    document_id=uuids[0],
)

# Batch API: ids and documents are matched by position. The first document is
# written a second time here with the same content.
vector_store.update_documents(
    documents=[updated_document_1, updated_document_2],
    ids=uuids[:2],
)

In [21]:
# Remove the last inserted document. `ids` is passed as a one-element list
# rather than a bare string, matching the list-of-ids form used by the other
# id-taking calls in this notebook (add_documents, update_documents).
vector_store.delete(ids=[uuids[-1]])

In [24]:
# Query: the two closest matches, restricted to documents whose metadata
# has source == "tweet".
results = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    filter={"source": "tweet"},
    k=2,
)
results

[Document(id='c36014de-61fd-4b29-a8c1-f020f3be85c3', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='1158a33c-34e8-4113-9e10-e7a299313673', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [30]:
# Same kind of query, but each hit comes back as a (document, score) pair;
# here only "news" documents are considered.
results = vector_store.similarity_search_with_score(
    "Will it be hot tomorrow?",
    k=2,
    filter={"source": "news"},
)

# In the recorded output the on-topic weather document scores 0.81 and the
# unrelated stock-market one 1.73, so a smaller score is the better match
# despite the "SIM" label.
# NOTE(review): `:3f` means "minimum width 3, default 6 decimals"; use `.3f`
# if three decimal places were intended.
for doc, score in results:
    print(f"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]")

* [SIM=0.807253] The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees. [{'source': 'news'}]
* [SIM=1.726037] The stock market is down 500 points today due to fears of a recession. [{'source': 'news'}]


In [31]:
# Search by vector: embed the query text explicitly, then look up the single
# nearest stored document using that embedding.
query_embedding = embeddigs.embed_query("I love green eggs and ham!")
results = vector_store.similarity_search_by_vector(embedding=query_embedding, k=1)

for hit in results:
    print(f"** {hit.page_content} [{hit.metadata}]")

** I had chocolate chip pancakes and fried eggs for breakfast this morning. [{'source': 'tweet'}]


In [37]:
# Query by turning the vector store into a retriever.
# MMR search: consider the 5 nearest candidates (fetch_k) and return 1 (k).
# The metadata filter is part of search_kwargs so it is always applied by the
# retriever; passing it as an extra keyword to invoke() is only honoured by
# langchain-core versions that forward invoke() kwargs to the search call.
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 1, "fetch_k": 5, "filter": {"source": "news"}},
)

result = retriever.invoke("Stealing from the bank is a crime")
for doc in result:
    print(f"** {doc.page_content} [{doc.metadata}]")

** Robbers broke into the city bank and stole $1 million in cash. [{'source': 'news'}]
