<a href="https://colab.research.google.com/github/Saurabh1222/WebRAG/blob/main/WebRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# run in a notebook cell
!pip uninstall -y opentelemetry-api opentelemetry-sdk opentelemetry-proto \
  opentelemetry-exporter-otlp-proto-common opentelemetry-exporter-otlp-proto-http

!pip install opentelemetry-api==1.37.0 opentelemetry-sdk==1.37.0 \
  opentelemetry-proto==1.37.0 opentelemetry-exporter-otlp-proto-common==1.37.0 \
  opentelemetry-exporter-otlp-proto-http==1.37.0
# now install our stack
!pip install -q chromadb langchain sentence-transformers transformers huggingface-hub beautifulsoup4
# verify
!pip check

Found existing installation: opentelemetry-api 1.37.0
Uninstalling opentelemetry-api-1.37.0:
  Successfully uninstalled opentelemetry-api-1.37.0
Found existing installation: opentelemetry-sdk 1.37.0
Uninstalling opentelemetry-sdk-1.37.0:
  Successfully uninstalled opentelemetry-sdk-1.37.0
Found existing installation: opentelemetry-proto 1.37.0
Uninstalling opentelemetry-proto-1.37.0:
  Successfully uninstalled opentelemetry-proto-1.37.0
Found existing installation: opentelemetry-exporter-otlp-proto-common 1.37.0
Uninstalling opentelemetry-exporter-otlp-proto-common-1.37.0:
  Successfully uninstalled opentelemetry-exporter-otlp-proto-common-1.37.0
Found existing installation: opentelemetry-exporter-otlp-proto-http 1.37.0
Uninstalling opentelemetry-exporter-otlp-proto-http-1.37.0:
  Successfully uninstalled opentelemetry-exporter-otlp-proto-http-1.37.0
Collecting opentelemetry-api==1.37.0
  Using cached opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-

In [2]:
!pip install -q chromadb langchain sentence-transformers transformers huggingface-hub beautifulsoup4 langchain_community
# verify
!pip check

ipython 7.34.0 requires jedi, which is not installed.
opentelemetry-exporter-otlp-proto-http 1.37.0 has requirement opentelemetry-exporter-otlp-proto-common==1.37.0, but you have opentelemetry-exporter-otlp-proto-common 1.38.0.
opentelemetry-exporter-otlp-proto-http 1.37.0 has requirement opentelemetry-proto==1.37.0, but you have opentelemetry-proto 1.38.0.
opentelemetry-exporter-otlp-proto-http 1.37.0 has requirement opentelemetry-sdk~=1.37.0, but you have opentelemetry-sdk 1.38.0.
google-colab 1.0.0 has requirement requests==2.32.4, but you have requests 2.32.5.
google-adk 1.17.0 has requirement opentelemetry-api<=1.37.0,>=1.37.0, but you have opentelemetry-api 1.38.0.
google-adk 1.17.0 has requirement opentelemetry-sdk<=1.37.0,>=1.37.0, but you have opentelemetry-sdk 1.38.0.


In [3]:
# 1) Imports + env check
import os, requests, math
from bs4 import BeautifulSoup
from huggingface_hub import login as hf_login
import torch

In [4]:
# Set HUGGINGFACE_HUB_TOKEN in env before running (recommended)
from google.colab import userdata
token = userdata.get('HF_TOKEN')
if token:
    hf_login(token=token)
else:
    raise RuntimeError("Set HUGGINGFACE_HUB_TOKEN in environment before running.")

In [5]:
# 2) Scraper (safe)
def scrape_website(url, max_chars=200_000):
    r = requests.get(url, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    text = " ".join(p.get_text(separator=" ", strip=True) for p in soup.find_all("p"))
    return text[:max_chars]

url = "https://www.geeksforgeeks.org/artificial-intelligence/what-is-generative-ai/"
web_text = scrape_website(url)

In [6]:
# 3) Chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

docs = [Document(page_content=web_text, metadata={"source": url})]
splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
documents = splitter.split_documents(docs)

In [7]:
# 4) Embeddings (batched) - use all-MiniLM-L6-v2
from langchain.embeddings import HuggingFaceEmbeddings
embedding_model_name = "all-MiniLM-L6-v2"
embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)

# batch embed document texts (faster & safer)
texts = [d.page_content for d in documents]
batch_size = 32
embeddings = []
for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    batch_emb = embedder.embed_documents(batch)   # returns list of vectors
    embeddings.extend(batch_emb)

  embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)


In [8]:
# 5) Persist Chroma vectorstore (use Drive in Colab for persistence)
from langchain.vectorstores import Chroma
persist_dir = "./chromadb_persist"  # change to '/content/drive/MyDrive/... ' for Drive persistence
vectorstore = Chroma.from_documents(documents, embedder, persist_directory=persist_dir)
vectorstore.persist()

  vectorstore.persist()


In [9]:
# 6) Choose free HF API LLM (google/flan-t5-base)
#    We use LangChain's HuggingFacePipeline wrapper which uses the transformers pipeline.
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

hf_model = "google/flan-t5-base"   # or "google/flan-t5-small" for cheaper & faster dev

tokenizer = AutoTokenizer.from_pretrained(hf_model)
model = AutoModelForSeq2SeqLM.from_pretrained(hf_model)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.0,
)

llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  llm = HuggingFacePipeline(pipeline=pipe)


In [10]:
# 7) RetrievalQA chain using the LLM + Chroma retriever
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

retriever = vectorstore.as_retriever(search_kwargs={"k": 4})  # top-k retrieval

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""Answer the question using only the context below. Be short and accurate.

Context:
{context}

Question:
{question}

Answer:"""
)

rag = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)

In [11]:
# 8) Query example
query = "What is Gen AI?"
out = rag({"query": query})

print("=== ANSWER ===")
print(out["result"].strip())
print("\n=== SOURCES (first chunk) ===")
if out.get("source_documents"):
    print(out["source_documents"][0].page_content[:600].strip())


  out = rag({"query": query})


=== ANSWER ===
Generative AI is trained on large datasets like text, images, audio or video using deep learning networks. During training, the model learns parameters (millions or billions of them) that help them predict or generate content. Here models generate output based on learned patterns and prompts provided Modern systems often uses agents which are autonomous components that interact with the environment, obtain information and execute chains of tasks. These agents uses LLMs to reason, plan and act enabling workflows like querying databases, outputs by retrieving relevant documents at query time to ground the generation in accurate, up-to-date

=== SOURCES (first chunk) ===
audio or video that resembles real-world examples. Generative AI is trained on large datasets like text, images, audio or video using deep learning networks. During training, the model learns parameters (millions or billions of them) that help them predict or generate content. Here models generate output ba