In [None]:
!pip install langchain chromadb openai tiktoken -q
!pip install chromadb  sentence-transformers -qU


In [None]:
from sentence_transformers import SentenceTransformer

# You can use any free model here
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_hf_embedding(text):
    return model.encode(text).tolist()


In [None]:
import chromadb
from chromadb.config import Settings

client = chromadb.Client(Settings(
    persist_directory="./chroma_db"  # saves to local folder
))

collection = client.get_or_create_collection(name="knowledge_base")

# Example texts to store
texts = [
    "Python is a versatile programming language.",
    "LangChain is used to build LLM-based applications.",
    "ChromaDB is a lightweight open-source vector database."
]

# Choose one embedding method (Gemini or HuggingFace)
embeddings = [get_hf_embedding(t) for t in texts]  # or get_gemini_embedding(t)

collection.add(
    documents=texts,
    ids=[f"id{i}" for i in range(len(texts))],
    embeddings=embeddings
)

print("‚úÖ Data inserted successfully!")


‚úÖ Data inserted successfully!


In [None]:
query = "What is ChromaDB used for?"
query_embedding = get_hf_embedding(query)  # or get_gemini_embedding(query)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=2
)

for doc in results['documents'][0]:
    print("üîπ", doc)


üîπ ChromaDB is used to store and retrieve text embeddings efficiently.
üîπ ChromaDB is a lightweight open-source vector database.


#lanchain

In [1]:
!pip install langchain chromadb sentence-transformers google-generativeai langchain-core langchain-community -qU


In [2]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_text(texts):
    return embedding_model.encode(texts).tolist()


KeyboardInterrupt: 

In [None]:
import chromadb
from chromadb.config import Settings

client = chromadb.Client(Settings(persist_directory="./chroma_rag"))
collection = client.get_or_create_collection("rag_knowledge")

# Add some data (knowledge base)
documents = [
    "LangChain is a framework to build applications powered by large language models.",
    "ChromaDB is used to store and retrieve text embeddings efficiently.",
    "Retrieval-Augmented Generation combines document retrieval with text generation.",
    "HuggingFace models can generate embeddings locally without using an API key."
]

collection.add(
    documents=documents,
    ids=[f"doc_{i}" for i in range(len(documents))],
    embeddings=embed_text(documents)
)

print("‚úÖ Knowledge base created successfully!")


In [None]:
!pip install -U langchain-chroma -q

In [None]:
# from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain.embeddings.base import Embeddings

class HFEmbedding(Embeddings):
    def embed_documents(self, texts):
        return embed_text(texts)

    def embed_query(self, text):
        return embed_text([text])[0]

# Create retriever
vectorstore = Chroma(
    client=client,
    collection_name="rag_knowledge",
    embedding_function=HFEmbedding()
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})


In [None]:
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyD9caRAI6oChsXLC7uDWB8FcSPcEP_oixg"

In [None]:
import google.generativeai as genai
import os

genai.configure(api_key="AIzaSyD9caRAI6oChsXLC7uDWB8FcSPcEP_oixg")

def generate_answer(context, question):
    prompt = f"""Answer the question based only on the following context:
    {context}
    Question: {question}"""

    response = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
    return response.text


In [None]:
generate_answer("age i just 13",'waht is my age')

In [None]:
def ask_question(question):
    # Step 1: Retrieve relevant docs
    docs = retriever.invoke(question)
    context = "\n".join([d.page_content for d in docs])
    # print(context)
    # Step 2: Generate answer using Gemini
    answer = generate_answer(context, question)
    return answer

# Example Query
response = ask_question("What is the purpose of ChromaDB?")
print("ü§ñ Answer:", response)

In [None]:
print(collection.count())


7


#pdf and file with langchain

In [1]:
!pip install langchain chromadb sentence-transformers pypdf langchain-community google-generativeai -qU


In [2]:
import os
import chromadb
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
from langchain.embeddings.base import Embeddings

In [3]:
# Choose your file
pdf_path = "/content/major_proj_report.pdf"

# Load PDF pages
loader = PyPDFLoader(pdf_path)
documents = loader.load()

print(f"‚úÖ Loaded {len(documents)} pages from {pdf_path}")


‚úÖ Loaded 66 pages from /content/major_proj_report.pdf


In [4]:
(documents)

[Document(metadata={'producer': 'Microsoft¬Æ Word 2021', 'creator': 'Microsoft¬Æ Word 2021', 'creationdate': '2025-11-05T11:45:45+05:30', 'title': 'Prepared by Dr. S. B. Warkad, M.Tech. (IDC) Coordinator for Students', 'author': 'p-4', 'moddate': '2025-11-05T11:45:45+05:30', 'source': '/content/major_proj_report.pdf', 'total_pages': 66, 'page': 0, 'page_label': '1'}, page_content='AI VISION TRACKER \n \nA \nMinor Project Report \nSubmitted in partial fulfillment of the requirement for the award of degree of \n \nBachelor of Technology \nIn \nComputer Science & Engineering \n \nSubmitted to \nRAJIV GANDHI PROUDYOGIKI VISHWAVIDYALAYA,  \nBHOPAL (M.P.) \n \n \nGuided by                Submitted By  \nMr. Paras Bhanopiya                                                           Pooja  Parmar (0832CS221150)  \nAssistant professor               Ritesh Parmar (0832CSS221165) \n                                        Ritika Mahajan(0832CS221166)  \n         \n  \n \n     \nDEPARTMENT OF COMPUT

In [5]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,       # characters per chunk
    chunk_overlap=100,    # overlap for context continuity
    length_function=len
)

chunks = splitter.split_documents(documents)

print(f"‚úÖ Split into {len(chunks)} text chunks")


‚úÖ Split into 191 text chunks


In [6]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_text(texts):
    return embedding_model.encode(texts).tolist()

from langchain.embeddings.base import Embeddings

class HFEmbedding(Embeddings):
    def embed_documents(self, texts):
        return embed_text(texts)
    def embed_query(self, text):
        return embed_text([text])[0]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
from chromadb.config import Settings

client = chromadb.Client(Settings(persist_directory="./rag_db"))
collection_name = "pdf_knowledge"

vectorstore = Chroma(
    client=client,
    collection_name=collection_name,
    embedding_function=HFEmbedding()
)

# Add chunks to vectorstore
texts = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]

vectorstore.add_texts(texts=texts, metadatas=metadatas)

print("‚úÖ ChromaDB populated successfully with PDF chunks!")


‚úÖ ChromaDB populated successfully with PDF chunks!


In [8]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


In [12]:
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyD9caRAI6oChsXLC7uDWB8FcSPcEP_oixg"

In [16]:
import google.generativeai as genai

genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

def generate_answer(context, question):
    prompt = f"""Answer the question based only on the following context:
    {context}
    Question: {question}"""

    response = genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt)
    return response.text


In [17]:
def ask_question(question):
    docs = retriever.invoke(question)
    context = "\n\n".join([d.page_content for d in docs])
    answer = generate_answer(context, question)
    return answer

question = "What does the document say about neural networks?"
response = ask_question(question)
print("ü§ñ Answer:", response)


ü§ñ Answer: The document states that the model was trained using:
*   **Deep learning frameworks** such as TensorFlow and PyTorch.
*   **Object detection architectures** like YOLOv8 or MobileNet SSD.

It also mentions using **loss functions** (like cross-entropy and IoU loss) for optimization and running multiple **epochs** with gradually adjusted **learning rates** for stability. The experimentation phase involved testing different configurations of **batch size**, **learning rate**, and dataset splits.

One of the referenced publications is titled "Deep Learning-Based Surveillance Systems for Smart Cities."


In [18]:
print("Total chunks:", vectorstore._collection.count())
docs = vectorstore._collection.get(limit=2)
print(docs["documents"][0][:200])  # first 200 chars of first chunk



Total chunks: 191
AI VISION TRACKER 
 
A 
Minor Project Report 
Submitted in partial fulfillment of the requirement for the award of degree of 
 
Bachelor of Technology 
In 
Computer Science & Engineering 
 
Submitted 
