<a href="https://colab.research.google.com/github/Rahulchunduruu/Machine-Learning/blob/main/Ragg_basic_appliaction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install necessary libraries for RAG (Retrieval Augmented Generation) capabilities.
# sentence-transformers: For embedding models.
# langchain, langchain-community, langchain-core, langchain_google_genai, langchain_chroma: Core LangChain components for building LLM applications, including integrations for Google Generative AI and ChromaDB.
# faiss-cpu: A library for efficient similarity search and clustering of dense vectors (though FAISS is later replaced by ChromaDB).
# pypdf: For loading PDF documents.
!pip install -q sentence-transformers langchain faiss-cpu langchain-community pypdf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.5/329.5 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[3

In [None]:
# Install additional LangChain-related libraries, ensuring specific components are available.
# langchain: The main LangChain library.
# langchain-community: Community contributed LangChain integrations.
# langchain-core: Core LangChain functionalities.
# langchain_google_genai: Integration for Google's Generative AI models.
# langchain_chroma: Integration for Chroma vector database.
!pip install langchain langchain-community langchain-core langchain_google_genai langchain_chroma

In [49]:
# Import necessary modules from LangChain and other libraries.
# PyPDFLoader: To load PDF documents.
# os: For operating system functionalities (though not strictly used in this cell, it's good practice for API keys).
# ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings: For interacting with Google's Generative AI models (chat and embeddings).
# Chroma: The vector database for storing and retrieving document embeddings.
# Document: A class to represent a document with content and metadata.
# ChatPromptTemplate: For defining structured prompts for chat models.
# RunnablePassthrough: A LangChain component to pass inputs through a chain.
# StrOutputParser: To parse the string output from the LLM.
from langchain_community.document_loaders import PyPDFLoader
import os
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [50]:
# Load the PDF document using PyPDFLoader.
# The loader takes the path to the PDF file.
# loader.load() reads the content of the PDF and returns a list of Document objects, where each page is a separate document.
loader = PyPDFLoader("/content/OceanofPDF.com_The_Genius_Myth_-_Helen_Lewis.pdf")
documents = loader.load()
print(f"loaded {len(documents)} pages from ")

loaded 287 pages from 


In [51]:
# Print the type of the 'documents' variable to confirm it's a list.
print(type(documents))

<class 'list'>


In [52]:
# Print the content and metadata of the first document (page) to inspect its structure.
print(documents[0])

page_content='' metadata={'producer': 'calibre 7.4.0', 'creator': 'calibre 7.4.0', 'creationdate': '2025-06-19T23:32:49+00:00', 'author': 'Helen Lewis', 'moddate': '2025-06-19T23:32:49+00:00', 'title': 'The Genius Myth: A Curious History of a Dangerous Idea', 'source': '/content/OceanofPDF.com_The_Genius_Myth_-_Helen_Lewis.pdf', 'total_pages': 287, 'page': 0, 'page_label': '1'}


In [53]:
# Split the loaded documents into smaller, manageable chunks.
# CharacterTextSplitter: A text splitter that splits text based on characters.
# chunk_size = 500: Each chunk will have a maximum of 500 characters.
# chunk_overlap = 50: There will be an overlap of 50 characters between consecutive chunks to maintain context.
# The 'chunks' variable will store the list of these smaller document parts.
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(chunk_size = 500,chunk_overlap = 50)
chunks = splitter.split_documents(documents)
print(f"split into {len(chunks)} chunks")

split into 285 chunks


In [54]:
# Initialize a HuggingFace embedding model.
# HuggingFaceEmbeddings: A class to use models from HuggingFace for generating embeddings.
# model_name = "all-MiniLM-L6-v2": Specifies the pre-trained model to use for embeddings, known for its efficiency and good performance.
from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")

In [55]:
# Create a FAISS vector database from the document chunks.
# FAISS: A library for efficient similarity search and clustering of dense vectors.
# from_documents: A method to create a FAISS index directly from a list of documents and an embedding model.
# The 'chunks' (smaller parts of the PDF) are converted into vector embeddings using 'embedding_model' and stored in 'vector_db'.
from langchain_community.vectorstores import FAISS

vector_db = FAISS.from_documents(chunks, embedding=embedding_model)

In [64]:
# Perform a similarity search in the FAISS vector database.
# query: The search query (this line is commented out, assuming 'query' is defined elsewhere or will be from user input).
# relevant_docs: Stores the top 'k' most similar documents found in the vector database.
# k = 3: Specifies to retrieve the 3 most relevant document chunks.
# The loop then prints the content of each retrieved chunk.
#query = input("enter your question")

relevant_docs = vector_db.similarity_search(query,k = 4)

for i,doc in enumerate(relevant_docs,1):
  print(f" chunk{i} ----\n{doc.page_content}\n")

 chunk1 ----
ALSO  BY  HELEN  LEWIS
Difficult Women
The Spark
OceanofPDF.com

 chunk2 ----
Finally, to the one person who makes everything possible: Jonathan. I
promise I won’t write another book. Unless I have a really, really good idea.
No! I promise. Probably.
OceanofPDF.com

 chunk3 ----
4. James Baldwin, radio interview with Studs Terkel, 1961. Available at
https://studsterkel.wfmt.com/programs/james-baldwin-discusses-his-
book-nobody-knows-my-name-more-notes-native-son
BACK TO NOTE REFERENCE 4
OceanofPDF.com

 chunk4 ----
outstanding book Control deals with the history of eugenics, was a cheering
presence; who else could I talk to about Francis Galton? Stuart Ritchie
kindly helped me understand some of the key bits of intelligence research.
Chris Morris explained Prince’s musical magic and told me about Galton’s
cake trick. Cordelia Fine sent me useful links on sex differences and
intelligence. Chris Kavanagh was a sounding board on the subject of
charismatic charlatans. Craig Br

In [65]:
# Print the FAISS vector database object to confirm its creation and type.
print(vector_db)

<langchain_community.vectorstores.faiss.FAISS object at 0x7964b537fc20>


In [66]:
# Import the 'os' module, which provides a way of using operating system dependent functionality.
# This is often used for managing environment variables or file paths.
import os

In [67]:
import os
# Set the Google API key as an environment variable.
# This is crucial for authenticating with Google services like Generative AI.
os.environ["GOOGLE_API_KEY"] = "AIzaSyD5mrY_jD_LLWf4P3uLjLgSmFqTWHK3jU4"

# --- 1. SETUP MODELS ---
# We need two models:
# A. Embedding Model: Converts text into vector numbers.
# GoogleGenerativeAIEmbeddings: Uses Google's model to convert text into numerical vector representations.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# B. LLM: The chat model that will answer the question.
# ChatGoogleGenerativeAI: Initializes Google's Gemini 2.0 Flash chat model.
# temperature=0: Sets the model's creativity level to minimum, making responses more deterministic.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0
)

print("--- Initializing Models and Embeddings ---")

--- Initializing Models and Embeddings ---


In [68]:
# --- 3. BUILD VECTOR STORE (The 'Link') ---
# This step:
# 1. Takes the document chunks.
# 2. Uses the 'embeddings' model to turn them into vectors.
# 3. Stores them in ChromaDB (in memory for this example).
# Chroma.from_documents: Creates a Chroma vector store from the 'chunks' (processed document parts).
# documents=chunks: The list of document chunks to be embedded and stored.
# embedding=embeddings: The embedding model to use for converting text to vectors.
# collection_name="gemini_knowledge_base": A name for the collection of documents within ChromaDB.
vectorstore = Chroma.from_documents(
    documents=chunks, # Use the 'chunks' created earlier
    embedding=embeddings,
    collection_name="gemini_knowledge_base"
)

In [69]:
# Create a "Retriever" interface from the Chroma vector store.
# vectorstore.as_retriever: Configures the vector store to act as a retriever.
# search_kwargs={"k": 2}: Specifies that the retriever should return the top 2 most relevant document chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2}) # Retrieve top 2 results

# --- 4. DEFINE THE RAG CHAIN ---
# This is the "brain" that connects the DB to the LLM.

# A simple prompt template for the LLM.
# {context}: Placeholder for the retrieved relevant document chunks.
# {question}: Placeholder for the user's query.
template = """
Answer the question based ONLY on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# A helper function to format the retrieved documents into a single string.
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

# The RAG Chain definition:
# This chain orchestrates the RAG process:
# 1. "context": retriever | format_docs: The retriever fetches relevant documents, which are then formatted.
# 2. "question": RunnablePassthrough(): The original question is passed through as is.
# 3. | prompt: The formatted context and question are passed to the prompt template.
# 4. | llm: The prompt is sent to the Large Language Model for generation.
# 5. | StrOutputParser(): The LLM's output is parsed as a simple string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# --- 5. RUN THE PIPELINE ---
# Define the user's query.
query = "summary about this book"
print(f"\nUser Question: {query}")
print("Thinking...")

# Invoke the RAG chain with the query to get a response.
response = rag_chain.invoke(query)

print(f"\nGemini Answer:\n{response}")

# Cleanup (optional)
# vectorstore.delete_collection() # Commented out to keep the vector store for further queries in the session.


User Question: summary about this book
Thinking...

Gemini Answer:
The provided text is an advertisement for a service that helps you find your next book. It doesn't describe a specific book, but rather offers personalized book recommendations and news about authors.
