In [1]:
pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.7 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-macosx_13_0_universal2.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 k

In [1]:
import chromadb
client = chromadb.PersistentClient(path="./chroma_db")

In [2]:
collection = client.create_collection(name="collection1")

# Insert sample data (ID, embeddings, metadata)
collection.add(
    ids=["1", "2", "3"],
    embeddings=[[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.3, 0.4, 0.5]],
    metadatas= [{"name": "Twinkle"}, {"name": "Ajay"}, {"name":"Educosys"}]
)

In [3]:
print("Available Collections:", client.list_collections())

Available Collections: [Collection(name=collection1)]


In [4]:
print("Fetching data with ID 2:", collection.get(ids=["2"]))

Fetching data with ID 1: {'ids': ['2'], 'embeddings': None, 'documents': [None], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'name': 'Ajay'}]}


In [6]:
print("fetching data with ID 2:", collection.get(ids=["2"], include=["embeddings", "metadatas"]))

fetching data with ID 2: {'ids': ['2'], 'embeddings': array([[0.2       , 0.30000001, 0.40000001]]), 'documents': None, 'uris': None, 'included': ['embeddings', 'metadatas'], 'data': None, 'metadatas': [{'name': 'Ajay'}]}


In [8]:
collection.add(
    ids=["4"],
    embeddings=[[0.1, 0.2, 0.3]],
    documents=["Someone is a software engineer with 5 years of experience."]
)

In [9]:
collection.query(
    query_embeddings=[[0.3, 0.4, 0.5]],
    n_results=2,
    include=["documents"]
)

{'ids': [['3', '2']],
 'embeddings': None,
 'documents': [[None, None]],
 'uris': None,
 'included': ['documents'],
 'data': None,
 'metadatas': None,
 'distances': None}

In [11]:
collection.update(
    ids=["1"],
    embeddings=[[0.5, 0.5, 0.5]],
    metadatas= [{"name": "Vinnie"}]
)
print("Updated Entry:", collection.get(ids=["1"]))

Updated Entry: {'ids': ['1'], 'embeddings': None, 'documents': [None], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'name': 'Vinnie'}]}


In [13]:
print("Fetching data with ID 1:", collection.get(ids=["1"]))

Fetching data with ID 1: {'ids': ['1'], 'embeddings': None, 'documents': [None], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'name': 'Vinnie'}]}


In [14]:
collection.delete(ids=["3"])
print("After Deletion:", collection.get(ids=["3"]))

After Deletion: {'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


In [17]:
for collection in client.list_collections():
    client.delete_collection(collection.name)

In [18]:
print("Available Collections:", client.list_collections())

Available Collections: []


In [19]:
pip install python-dotenv openai

Note: you may need to restart the kernel to use updated packages.


In [21]:
client = chromadb.PersistentClient(path="./chroma_db") # This persists data
collection = client.create_collection(name="collection2")

In [30]:
from dotenv import load_dotenv

load_dotenv()     # Load environment variables from .env file
import os
api_key = os.getenv("OPENAI_API_KEY")  # Retrieve the key


In [10]:
import openai

def get_openai_embedding(text):
    response = openai.embeddings.create(input=[text], model="text-embedding-3-small") # OpenAI's embedding model
    return response.data[0].embedding

import numpy as np

def get_mock_embedding(text, dim=1536):
    # Generate a random embedding (same dimension as text-embedding-3-small)
    return np.random.rand(dim).tolist()

In [33]:
documents = [
    "The Eiffel Tower is located in Paris.",
    "The Colosseum is in Rome, Italy.",
    "The Taj Mahal is a famous monument in India.",
    "Mount Everest is the highest mountain in the world.",
    "Python is a popular programming language."
]

# Convert documents to embeddings
embeddings = [get_mock_embedding(doc) for doc in documents]

# Insert into chromaDB
collection.add(
    ids=[str(i) for i in range(len(documents))],  # Unique IDS
    documents=documents,
    embeddings=embeddings
)
print("Data added successfully!")

Data added successfully!


In [37]:
query_text = "Where is the Eiffel Tower?"
query_embedding = get_mock_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2, # get top 3 matches
    include=["documents", "distances"]
)

print("Query:", query_text)
print("Most Similar Results:", results["documents"][0])
print("Distance:", results["distances"][0])

Query: Where is the Eiffel Tower?
Most Similar Results: ['Python is a popular programming language.', 'The Eiffel Tower is located in Paris.']
Distance: [246.53121948242188, 247.85276794433594]


In [38]:
updated_text = "The Eiffel Tower is one of the most visited landmarks in the world."
updated_embedding = get_mock_embedding(updated_text)

collection.update(
    ids=["0"], # ID of the document to update
    documents=[updated_text],
    embeddings=[updated_embedding]
)

print("Data updated successfully!")


Data updated successfully!


In [39]:
query_text = "Where is the Eiffel Tower?"
query_embedding = get_mock_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2, # get top 3 matches
    include=["documents", "distances"]
)

print("Query:", query_text)
print("Most Similar Results:", results["documents"][0])
print("Distance:", results["distances"][0])

Query: Where is the Eiffel Tower?
Most Similar Results: ['Python is a popular programming language.', 'The Taj Mahal is a famous monument in India.']
Distance: [246.24765014648438, 249.3199920654297]


In [40]:
tower_ht_text = "Eiffel Tower is 300 tall."
collection.add(
    ids=["6"],
    embeddings=get_mock_embedding(tower_ht_text),
    documents=tower_ht_text
)

In [41]:
query_text = "Where is the Eiffel Tower?"
query_embedding = get_mock_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2, # get top 3 matches
    include=["documents", "distances"]
)

print("Query:", query_text)
print("Most Similar Results:", results["documents"][0])
print("Distance:", results["distances"][0])

Query: Where is the Eiffel Tower?
Most Similar Results: ['Mount Everest is the highest mountain in the world.', 'The Taj Mahal is a famous monument in India.']
Distance: [248.51759338378906, 253.14785766601562]


In [42]:
collection.delete(ids=["0"])
print("Data deleted successfully!")


Data deleted successfully!


In [44]:
query_text = "Where is the Eiffel Tower?"
query_embedding = get_mock_embedding(query_text)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2,  # Get top 2 matches
    include=["documents", "distances"]
)

print("Query:", query_text)
print("Most Similar Result:", results["documents"][0])
print("Distance:", results["distances"][0])

Query: Where is the Eiffel Tower?
Most Similar Result: ['Mount Everest is the highest mountain in the world.', 'Python is a popular programming language.']
Distance: [247.52993774414062, 249.34596252441406]


In [12]:
from dotenv import load_dotenv

load_dotenv() # Load environment variables from .env file
import os
open_ai= os.getenv('OPENAI_API_KEY')
long_cahin = os.getenv('langchainKey')
groq_key = os.getenv('GROQ_API_KEY')
hugging_key = os.getenv('hugging_face')
os.environ["USER_AGENT"] = "my-crawler-bot/1.0"

In [47]:
pip install langchain_community langchain-openai langchainhub chromadb langchain

Collecting langchain-openai
  Downloading langchain_openai-0.3.32-py3-none-any.whl.metadata (2.4 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.11.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting packaging>=23.2 (from langchain-core<2.0.0,>=0.3.75->langchain_community)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub)
  Downloading types_requests-2.32.4.20250809-py3-none-any.whl.metadata (2.0 kB)
Downloading langchain_openai-0.3.32-py3-none-any.whl (74 kB)
Downloading tiktoken-0.11.0-cp312-cp312-macosx_11_0_arm64.whl (996 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.7/996.7 kB[0m [31m447.1 kB/s[0m  [33m0:00:02[0meta [36m0:00:03[0m
[?25hDownloading langchainhub-0.1.21-py3-none-any.whl (5.2 kB)
Using cached packaging-24.2-py3-none-an

In [11]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader(
    web_paths = ["https://www.educosys.com/course/genai"]
)
docs = loader.load()
print(docs)

[Document(metadata={'source': 'https://www.educosys.com/course/genai', 'title': 'Hands-on Generative AI Course', 'description': 'Hands-on Generative AI Course', 'language': 'en'}, page_content="Hands-on Generative AI CourseCoursesBundle CoursesStudent DiscountFree ContentTestimonialsFAQLogin Signup Starts on 16th September 2025Hands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 weeks · 3 classes/week · 2 hrs/class + Post-class Doubt SupportClasses on Tue, Wed, Thurs - 9PM ISTAccess all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project - Build a Simple Neural Network Using TensorFlow Mini Project - Train an Autoencoder on the MNIST Dataset2Week 

In [2]:
pip install beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.5-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.8-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.13.5-py3-none-any.whl (105 kB)
Downloading soupsieve-2.8-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [beautifulsoup4]
[1A[2KSuccessfully installed beautifulsoup4-4.13.5 soupsieve-2.8
Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split
# Overlap of 200 characters to maintain context across chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [3]:
print(splits[0])
print(splits[1])
print(splits[2])

page_content='Hands-on Generative AI CourseCoursesBundle CoursesStudent DiscountFree ContentTestimonialsFAQLogin Signup Starts on 16th September 2025Hands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 weeks · 3 classes/week · 2 hrs/class + Post-class Doubt SupportClasses on Tue, Wed, Thurs - 9PM ISTAccess all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project - Build a Simple Neural Network Using TensorFlow Mini Project - Train an Autoencoder on the MNIST Dataset2Week 2Deep Generative Models Discriminative and Generative models Generative Adversarial Networks (GANs) Variational Autoencoders (VAEs) Probabilistic Data Generation Using VAEs Four Mi

In [4]:
# from langchain_community.vectorstores import Chroma
# from langchain_openai import OpenAIEmbeddings
# from langchain_community.embeddings import HuggingFaceEmbeddings
# embedding = HuggingFaceEmbeddings(
#     model_name="sentence-transformers/paraphrase-MiniLM-L3-v2",
#     model_kwargs={'device': 'cuda'},
#     encode_kwargs={'batch_size': 128}
# )

# vectorstore = Chroma.from_documents(
#     documents=splits,
#     embedding=embedding
# )

from langchain_community.retrievers import BM25Retriever

retriever = BM25Retriever.from_documents(splits)

In [7]:
print((retriever.docs)) # Check total stored chunks

[Document(metadata={'source': 'https://www.educosys.com/course/genai', 'title': 'Hands-on Generative AI Course', 'description': 'Hands-on Generative AI Course', 'language': 'en'}, page_content='Hands-on Generative AI CourseCoursesBundle CoursesStudent DiscountFree ContentTestimonialsFAQLogin Signup Starts on 16th September 2025Hands-on Generative AI CourseLearn, Build, Deploy and Apply Generative AI7 weeks · 3 classes/week · 2 hrs/class + Post-class Doubt SupportClasses on Tue, Wed, Thurs - 9PM ISTAccess all Live BatchesLifetime access of RecordingsAccess Discord CommunityCode availableBuild ProjectsLearn Future-Ready TechEnroll 1Week 1Foundations of Generative AI Introduction to AI Mathematical Foundations for AI Probability, Statistics, and Linear Algebra Basics of Neural Networks Gradient Descent and Optimization Basics Architectures: Feedforward, RNN, and CNN Mini Project - Build a Simple Neural Network Using TensorFlow Mini Project - Train an Autoencoder on the MNIST Dataset2Week 

In [8]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [8]:
from langchain import hub
# Prompt
prompt = hub.pull("rlm/rag-prompt") # pulls a predefined RAG prompt template from LangChain Hub

In [19]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-4o-mini")

In [20]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [21]:
rag_chain = ({"context" : retriever | format_docs, "question": RunnablePassthrough()}
             | prompt
             | llm
             | StrOutputParser())


In [22]:
rag_chain.invoke("What are the timings of the genai course?")

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [23]:
rag_chain.invoke("Give me the curriculum for week 1 for genai course")

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [24]:
rag_chain.invoke("Are the recordings for the course available?")

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [25]:
from langchain_core.runnables import RunnableLambda

In [26]:
def print_prompt(prompt_text):
    print("\nPrompt - ", prompt_text)
    return prompt_text

rag_chain = ({"context" : retriever | format_docs, "question": RunnablePassthrough()}
             | prompt
             | RunnableLambda(print_prompt)
             | llm
             | StrOutputParser())

In [27]:
rag_chain.invoke("What are the timings of the genai course?")


Prompt -  messages=[HumanMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: What are the timings of the genai course? \nContext: knowledge of Machine Learning. Keerti’s passion for teaching made complex topics easy to grasp. I highly recommend this course to anyone interested in AI and ML!Read moreManika KaushikSenior Software EngineerOptum-United HealthGroupKeerti explains everything in such simple and creative manner, even difficult and huge topics became easy to understand.Frequently asked questionsIs this a Live or Recorded Course?When will the next Live batch be launched?What if I am interested in learning Live only?What are the prerequisites for the course?Is Machine Learning pre-requisite for the course?How many projects will we work on? Can I add these to resume?Is 

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}