In [None]:
# --- Professional RAG, SAG, and CAG Demo Notebook ---
# This notebook demonstrates advanced Retrieval-Augmented Generation (RAG), Sequential Answer Generation (SAG), and Chunked Answer Generation (CAG) pipelines using LangChain, ChromaDB, and Google Gemini (Gemini 1.5 Flash).
#
# Author: [Your Name]
# Date: 21 September 2025
#
# ---
#
# RAG (Retrieval-Augmented Generation):
#   - Use Case: Enhance LLMs with external knowledge from documents, PDFs, or databases.
#   - Example: Answering questions about a research paper by retrieving relevant sections and generating a context-aware response.
#
# SAG (Sequential Answer Generation):
#   - Use Case: Decompose complex queries into sub-questions, retrieve evidence for each, and synthesize a comprehensive answer.
#   - Example: Multi-step reasoning, such as breaking down 'How does attention work in transformers?' into smaller, answerable parts.
#
# CAG (Chunked Answer Generation):
#   - Use Case: Summarize multiple retrieved chunks individually, then aggregate summaries to answer the main question.
#   - Example: Efficient for long documents or when concise, multi-perspective answers are needed.
#
# ---
#
# The commands below install every library this notebook needs. If running locally, make sure you have permission to install packages into the active environment.

!pip install langchain langchain-community langchain-core
!pip install chromadb
!pip install sentence-transformers
!pip install pypdf2
!pip install google-generativeai
!pip install tiktoken



In [None]:
# --- Import Required Libraries ---
# PyPDF2: For extracting text from PDF documents
# sentence_transformers: For generating embeddings from text
# LangChain modules: For text splitting, embeddings, vector storage, LLM integration, and retrieval QA
# chromadb: For efficient vector database storage and retrieval
#
# These imports enable the core RAG, SAG, and CAG workflows.
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import GooglePalm
from langchain.chains import RetrievalQA
import chromadb

In [None]:
# --- PDF Content Extraction Utility ---
# This function extracts all text from a PDF file, enabling downstream processing for RAG/SAG/CAG pipelines.
# Use Case: Ingest academic papers, reports, or any PDF-based knowledge source.
def extract_pdf_content(pdf_path):
  """Return the text of every page of the PDF at ``pdf_path`` as one string.

  Args:
    pdf_path: Path of the PDF file; it is opened in binary mode.

  Returns:
    str: The per-page texts concatenated in page order with no separator.
    A page whose ``extract_text()`` result is ``None`` or empty contributes
    nothing instead of aborting the whole extraction.
  """
  with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    # Join inside the `with` block: pages are read from the still-open file.
    # `or ""` keeps a None result from breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
# --- Text Chunking Configuration ---
# SentenceTransformersTokenTextSplitter splits long documents into fixed-size, overlapping chunks whose length is measured in tokens of the chosen sentence-transformers model.
# Use Case: Keeps each chunk small enough to embed as a whole and to pack several chunks into one LLM prompt, improving retrieval and answer quality.
# Shared splitter used by chunk_text: 100-token windows with a 10-token overlap
# between neighbouring chunks.
sent_text_splitter = SentenceTransformersTokenTextSplitter(
    model_name='sentence-transformers/all-MiniLM-L6-v2',  # model whose tokenizer measures chunk length
    tokens_per_chunk=100,  # window size in tokens; tune for your documents
    chunk_overlap=10,  # tokens repeated across adjacent chunks for continuity
)

In [None]:
# --- Chunking Text with Metadata ---
# This function splits extracted text into chunks and attaches metadata (e.g., filename) to each chunk.
# Use Case: Enables traceability and source attribution in RAG/SAG/CAG answers.
def chunk_text(text,file_name):
  """Split ``text`` with the notebook-level ``sent_text_splitter`` and tag each piece.

  Returns:
    list[dict]: One ``{"content": <chunk>, "metadata": {"filename": file_name}}``
    entry per chunk, in the order the splitter produced them.
  """
  return [
      {"content": piece, "metadata": {"filename": file_name}}
      for piece in sent_text_splitter.split_text(text)
  ]

In [None]:
# --- Embedding Model Initialization ---
# HuggingFaceEmbeddings generates dense vector representations for semantic search and retrieval.
# Use Case: Powers similarity search in ChromaDB for RAG, SAG, and CAG pipelines.
# Notebook-level embedder handed to Chroma in store_in_chroma; it names the same
# all-MiniLM-L6-v2 checkpoint that the text splitter is configured with.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# --- Store Chunks in ChromaDB ---
# This function persists text chunks and their metadata in a local Chroma vector store for fast similarity search.
# Use Case: Enables scalable, persistent retrieval for RAG, SAG, and CAG workflows.
def store_in_chroma(chunks,persist_directory="./chroma_store"):
  """Embed ``chunks`` with ``embedding_model`` and store them in a Chroma store.

  Args:
    chunks: Iterable of dicts, each with a ``"content"`` string and a
      ``"metadata"`` entry (the shape produced by ``chunk_text``).
    persist_directory: Directory passed to Chroma for on-disk persistence.

  Returns:
    The vector store object returned by ``Chroma.from_texts``.

  Every chunk gets a deterministic id derived from its position, its source
  filename and its text. Re-running ingestion on the same document therefore
  reuses the same ids rather than generating fresh ones for each run.
  """
  import hashlib

  # Materialise once so a generator argument is not exhausted by the first pass.
  chunks = list(chunks)
  texts = [c["content"] for c in chunks]
  metadatas = [c["metadata"] for c in chunks]

  ids = []
  for position, (body, meta) in enumerate(zip(texts, metadatas)):
    source = meta.get("filename", "") if isinstance(meta, dict) else ""
    # The position is part of the key so identical text appearing twice in one
    # document still yields two distinct ids.
    key = f"{source}\x1f{position}\x1f{body}"
    ids.append(hashlib.sha256(key.encode("utf-8")).hexdigest())

  db = Chroma.from_texts(texts, embedding_model, metadatas=metadatas, ids=ids, persist_directory=persist_directory)
  return db

In [None]:
# --- Ingest PDF and Prepare Chunks ---
# Example: Ingest the 'attention_is_all.pdf' paper, extract its text, split into chunks, and store in ChromaDB.
# This step is foundational for all downstream RAG, SAG, and CAG operations.
import os

pdf_path = "attention_is_all.pdf"
# basename honours the host OS's path separators instead of assuming "/".
filename = os.path.basename(pdf_path)
# Extract -> chunk -> index; `db` is the vector store the later cells query.
text = extract_pdf_content(pdf_path)
chunks = chunk_text(text,filename)
db = store_in_chroma(chunks)

In [None]:
# --- Semantic Search Utility ---
# This function performs a similarity search in ChromaDB, retrieving the most relevant chunks for a given query.
# Use Case: Core retrieval step for RAG, SAG, and CAG pipelines.
def search_chroma(query,db,top_k=5):
  """Run a similarity search for ``query`` on ``db`` and return plain dicts.

  Args:
    query: Text to search for.
    db: Vector store exposing ``similarity_search(query, k=...)``.
    top_k: Number of results requested from the store.

  Returns:
    list[dict]: ``{"content": page_content, "metadata": metadata}`` for each
    hit, in the order the store returned them.
  """
  return [
      {"content": hit.page_content, "metadata": hit.metadata}
      for hit in db.similarity_search(query, k=top_k)
  ]

In [None]:
# --- Google Generative AI Setup ---
# Import Google Gemini (Generative AI) and Colab userdata for API key management.
# Use Case: Gemini LLM powers answer generation in RAG, SAG, and CAG pipelines.
import google.generativeai as genai
from google.colab import userdata

In [None]:
# --- Configure Gemini API Key ---
# Securely load your Google Gemini API key from Colab userdata.
# Use Case: Keeps credentials safe and enables authenticated LLM access.
# Look up the secret stored under the name 'G_API' in Colab's userdata store;
# the key itself never appears in the notebook source.
api_key = userdata.get('G_API')
# Hand the key to the google.generativeai module imported as `genai`.
genai.configure(api_key=api_key)

In [None]:
# --- RAG: Retrieval-Augmented Generation Pipeline ---
# This function implements the classic RAG workflow: retrieve relevant chunks, build a context, and generate an answer using Gemini.
# Use Case: Direct Q&A over documents, with source attribution for transparency.
import google.generativeai as genai
from google.colab import userdata

def rag_answer(query,db,top_k=5,model_name='gemini-1.5-flash-latest'):
  """Answer ``query`` with a single retrieve-then-generate pass.

  Args:
    query: The user's question.
    db: Vector store passed through to ``search_chroma``.
    top_k: Number of chunks to retrieve as context.
    model_name: Gemini model id passed to ``genai.GenerativeModel``. The
      default is the id this notebook was written against; pass another id
      to switch models without editing the function.

  Returns:
    tuple[str, set]: The generated answer text and the set of source
    filenames found in the retrieved chunks' metadata.
  """
  # Step 1: Retrieve relevant chunks from ChromaDB
  chunks = search_chroma(query,db,top_k)
  context = "\n\n".join(c["content"] for c in chunks)
  prompt = f"""Use the following context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

  {context}

  Question: {query}
  Answer:"""

  # Step 2: Generate answer using Gemini LLM
  model = genai.GenerativeModel(model_name)
  response = model.generate_content(prompt)
  answer = response.text
  # Step 3: Collect source filenames; chunks without a metadata dict or
  # without a "filename" key are skipped.
  filenames = {
      c["metadata"]["filename"]
      for c in chunks
      if isinstance(c.get("metadata"), dict) and "filename" in c["metadata"]
  }
  return answer, filenames

In [None]:
# --- Example: RAG Pipeline in Action ---
# Use Case: Ask a direct question about the ingested document and receive a context-aware answer with source attribution.
# The cell's displayed value is the (answer_text, source_filenames) tuple returned by rag_answer.
rag_answer("What is attention",db)

('Based on the provided text, attention is a function that maps a query and a set of key-value pairs to an output vector.  The output is a weighted sum of the values, where the weight for each value is determined by a compatibility function comparing the query and the corresponding key.  The text also mentions scaled dot-product attention as a specific type of attention.\n',
 {'attention_is_all.pdf'})

In [None]:
# --- SAG: Sequential Answer Generation Pipeline ---
# This function decomposes complex queries into sub-questions, retrieves evidence for each, and synthesizes a comprehensive answer.
# Use Case: Multi-step reasoning, research assistance, and answering layered questions.
import google.generativeai as genai
from google.colab import userdata

def _parse_subquestions(text):
    """Return the numbered items ("1. ...", "2) ...") found in ``text``, in order."""
    import re

    items = []
    for line in text.splitlines():
        # Only lines that start with a number and "." or ")" count; intro
        # sentences, blank lines and bare "3." markers are ignored.
        hit = re.match(r"\s*\d+\s*[.)]\s*(.*\S)", line)
        if not hit:
            continue
        item = hit.group(1).strip("* ")  # drop markdown bold markers around the item
        if item:
            items.append(item)
    return items


def sag_answer(query, db, top_k=5, model_name='gemini-1.5-flash-latest'):
    """Answer ``query`` by decomposing it, answering each part, then merging.

    Args:
        query: The user's question.
        db: Vector store passed through to ``search_chroma``.
        top_k: Number of chunks retrieved for each sub-question.
        model_name: Gemini model id passed to ``genai.GenerativeModel``;
            defaults to the id this notebook was written against.

    Returns:
        tuple[str, list]: The synthesized answer text and the sorted list of
        source filenames seen across all sub-question retrievals.
    """
    # Step 1: Decompose the main question into sub-questions using Gemini
    decomp_prompt = f"""
Break the following question into 3-5 concise sub-questions (numbered):
{query}
"""
    model = genai.GenerativeModel(model_name)
    decomp_resp = model.generate_content(decomp_prompt)
    # If the reply contains no numbered items, answer the original question
    # directly rather than aggregating over nothing.
    subqs = _parse_subquestions(decomp_resp.text) or [query]

    # Step 2: Retrieve and answer each sub-question
    sub_answers = []
    used_files = set()
    for sq in subqs:
        chunks = search_chroma(sq, db, top_k)
        # Track source files for transparency
        used_files.update(
            c["metadata"]["filename"]
            for c in chunks
            if isinstance(c.get("metadata"), dict) and "filename" in c["metadata"]
        )

        context = "\n\n".join(c["content"] for c in chunks)
        prompt = f"""
Using ONLY the context below, answer concisely the sub-question.
Context:
{context}

Sub-question: {sq}

Answer:
"""
        resp = model.generate_content(prompt)
        sub_answers.append({"subq": sq, "answer": resp.text})

    # Step 3: Aggregate sub-answers into a final response
    agg_context = "\n".join(f"Sub-question: {s['subq']}\nAnswer: {s['answer']}" for s in sub_answers)
    final_prompt = f"""
Combine the following sub-answers into a coherent, concise answer to the original question:
Original Question: {query}

Sub-answers:
{agg_context}

Answer:
"""
    final_resp = model.generate_content(final_prompt)

    # Sorted so the source list is the same on every run.
    return final_resp.text, sorted(used_files)

In [None]:
# --- Example: SAG Pipeline in Action ---
# Use Case: Decompose and answer a complex question using multi-step reasoning and evidence aggregation.
# The cell's displayed value is the (answer_text, list_of_source_filenames) tuple returned by sag_answer.
sag_answer("What is attention",db)

('Based on the provided text, attention is described as a computational mechanism, specifically within the context of machine learning.  This description focuses on implementations like encoder-decoder and scaled dot-product attention, involving queries, keys, values, and weighted sums.  However, the text does not address the cognitive aspects of attention, such as its relationship to consciousness, different types beyond those used in machine learning (e.g., selective, divided, sustained), its neural and cognitive control mechanisms, or its effects on perception and performance.\n',
 ['attention_is_all.pdf'])

In [None]:
# --- CAG: Chunked Answer Generation Pipeline ---
# This function summarizes each retrieved chunk individually, then aggregates the summaries to answer the main question.
# Use Case: Efficient for long documents, multi-perspective answers, or when concise summaries are needed.
import google.generativeai as genai
from google.colab import userdata

def cag_answer(query, db, top_k=5, model_name='gemini-1.5-flash-latest'):
    """Answer ``query`` from per-chunk summaries of the retrieved context.

    Args:
        query: The user's question.
        db: Vector store passed through to ``search_chroma``.
        top_k: Number of chunks to retrieve and summarize.
        model_name: Gemini model id passed to ``genai.GenerativeModel``;
            defaults to the id this notebook was written against.

    Returns:
        tuple[str, list]: The final answer text and the sorted list of source
        filenames found in the retrieved chunks' metadata.
    """
    # Step 1: Retrieve relevant chunks
    chunks = search_chroma(query, db, top_k)
    model = genai.GenerativeModel(model_name)

    summarized_chunks = []
    used_files = set()
    for c in chunks:
        metadata = c.get("metadata")
        has_filename = isinstance(metadata, dict) and "filename" in metadata
        # Track source files for transparency
        if has_filename:
            used_files.add(metadata["filename"])

        # The prompt asks the model to mention the filename, so the filename
        # has to be part of the prompt; otherwise the model can only guess it.
        source_name = metadata["filename"] if has_filename else "unknown"
        prompt = f"""
Summarize the following context in 2 short sentences. Mention filename at the top.
Filename: {source_name}
Context:
{c['content']}

Summary:
"""
        resp = model.generate_content(prompt)
        summarized_chunks.append(resp.text)

    # Step 2: Aggregate summaries and answer the main question
    final_context = "\n\n".join(summarized_chunks)
    final_prompt = f"""
Use the following summaries to answer the question concisely:
{final_context}

Question: {query}

Answer:
"""
    final_resp = model.generate_content(final_prompt)

    # Sorted so the source list is the same on every run.
    return final_resp.text, sorted(used_files)

In [None]:
# --- Example: CAG Pipeline in Action ---
# Use Case: Summarize and answer a question using concise, multi-perspective evidence from the document.
# The cell's displayed value is the (answer_text, list_of_source_filenames) tuple returned by cag_answer.
cag_answer("What is attention",db)

('Attention is a mechanism that maps queries and key-value pairs to an output vector, a weighted sum of values where weights are determined by a query-key compatibility function.  In neural machine translation, it allows a decoder to access all input sequence information. Multi-head attention, used in Transformers, enhances this by having individual heads perform distinct tasks, often reflecting syntactic and semantic sentence structure.  The optimal number of attention heads is crucial for performance.\n',
 ['attention_is_all.pdf'])

In [None]:
# # Conclusion & LinkedIn Post Guidance


# This notebook demonstrates state-of-the-art Retrieval-Augmented Generation (RAG), Sequential Answer Generation (SAG), and Chunked Answer Generation (CAG) pipelines for advanced document Q&A and knowledge extraction. Each approach is tailored for specific real-world use cases, from direct Q&A to multi-step reasoning and efficient summarization.


# **Key Takeaways:**


# - **RAG**: Best for direct, context-rich answers from large document collections.
# - **SAG**: Ideal for complex, multi-part questions requiring stepwise reasoning.
# - **CAG**: Perfect for summarizing and synthesizing information from multiple sources or long documents.


# Feel free to connect with me on LinkedIn for more insights on LLMs, RAG, and enterprise AI solutions! 🚀