In [None]:
from dotenv import load_dotenv
import os 

load_dotenv()

In [None]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
os.environ['OPENAI_KEY']

# Load the webpage

In [None]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# only keep post title, headers, and content from the full HTML

bs4_strainer = bs4.SoupStrainer()

loader = WebBaseLoader(
    web_path=("https://en.wikipedia.org/wiki/2024_Summer_Olympics"),
    bs_kwargs={"parse_only": bs4_strainer},
)

docs = loader.load()
len(docs)

In [None]:
print(docs)

In [None]:
len(docs[0].page_content)

# Load arxiv webpage

In [None]:
# %pip install arxiv
# %pip install pymupdf
# %pip install fitz

In [None]:
%pip uninstall fitz -y
%pip uninstall pymupdf -y
%pip install --upgrade pymupdf langchain


In [None]:
%pip install langchain

In [None]:
from langchain.document_loaders.arxiv import ArxivLoader

loader = ArxivLoader(query="large language models")
docs = loader.load()
print(len(docs))



In [None]:
docs

------- Loaded the documents

# Indexing or Chunking

## Splitting

In [47]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100, add_start_index=True) # add_start_index is basically prints the index starting number of the each chunk in the metadta
all_splits = text_splitter.split_documents(docs)
print(len(all_splits))

# for chunk in chunks[5]:
#     print(chunk.page_content)
#     print(chunk.metadata)

326


In [48]:
all_splits

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/2024_Summer_Olympics', 'title': '2024 Summer Olympics - Wikipedia', 'language': 'en', 'start_index': 3}, page_content='2024 Summer Olympics - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in'),
 Document(metadata={'source': 'https://en.wikipedia.org/wiki/2024_Summer_Olympics', 'title': '2024 Summer Olympics - Wikipedia', 'language': 'en', 'start_index

In [49]:
len(all_splits)

326

In [50]:
len(all_splits[0].page_content) # chunk size 500, overlap 100 but it's not exactly 600

496

-------------- Chunking is done-------------------------

next storing into vector db

# Storing into Vector db

In [None]:
# from google import genai
# import getpass
# import os

# GOOGLE_API_KEY = "AIzaSyDG4x2gW9p0CMhOEL0CoLudYBvuNAiLJnc"
# if "GOOGLE_API_KEY" not in os.environ:
#     os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# client = genai.Client()

# def get_embeddings()
#     response = client.models.embed_content(
#         model="gemini-embedding-001",
#         contents=[
#             "What is the meaning of life?",
#             "How do I bake a cake?"
#         ]
#     )

#     # Print the embeddings
#     for embedding in response.embeddings:
#         print(embedding)

In [37]:
# all_splits = all_splits[:5]
# len(all_splits)

5

In [None]:
# %pip install langchain-huggingface

In [51]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# openai embeddings

# vectorstore = Chroma.from_documents(
#     documents=all_splits,
#     embedding=OpenAIEmbeddings()
# )

# Google gemini model embeddings

# GOOGLE_API_KEY = "AIzaSyDjqeeXDhK1icV15tqR24hZJlrZVRfujoE"
# if "GOOGLE_API_KEY" not in os.environ:
#     os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
# gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# print(gemini_embeddings)

# vectorstore = Chroma.from_documents(
#     documents=all_splits,
#     embedding=gemini_embeddings
# )

# Create Huggingface embeddings

hf_embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    # Optional: specify device for faster processing if you have a GPU
    # model_kwargs={'device': 'cuda'} 
)


In [52]:
# store huggingface allsplitts embeddings generated by all-MiniLm-L6-V2 model

vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=hf_embeddings
)

In [53]:
vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x225be1e2000>

In [54]:
vectorstore.embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

-----chunks and embeddings are stored in chroma db ----------------------

# Retrieval

In [55]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5} # score threshold: 5
    )

In [57]:
retrieved_docs = retriever.invoke("Where is the summer olympics 2024 happening?")

In [58]:
len(retrieved_docs)

5

In [59]:
print(retrieved_docs[4].page_content)

Winter← Beijing 2022Milano Cortina 2026 →

2024 Summer Paralympics
Part of a series on2024 Summer Olympics
Bid process (bid details)
Development (venues, torch relay)
Marketing (mascots) (Olympics Go! Paris 2024)
Broadcasters
Opening ceremony (flag bearers)
Event calendar
Chronological summary
Medal table (medalists)
Controversies (Women's boxing)
World and Olympic records
Closing ceremony (flag bearers)
Paralympics
Transportation

IOC
CNOSF
COJOP2024


In [None]:
print(retrieved_docs[0].page_content)

-------------------------- Retrival is done---------------------

# Generation

In [66]:
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate


# llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

# Google api key
GOOGLE_API_KEY = "AIzaSyDG4x2gW9p0CMhOEL0CoLudYBvuNAiLJnc"
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY


llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0
)


In [65]:
llm

ChatGoogleGenerativeAI(model='models/gemini-2.5-flash', google_api_key=SecretStr('**********'), temperature=0.0, client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x00000225BE322B10>, default_metadata=())

In [67]:
llm.invoke("who is the pm of india?")

AIMessage(content='The current Prime Minister of India is **Narendra Modi**.\n\nHe has held the office since 2014 and is a leader of the Bharatiya Janata Party (BJP).', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': []}, id='lc_run--36b7f4e0-1c06-4672-99c0-217e7b0c0dd2-0', usage_metadata={'input_tokens': 8, 'output_tokens': 36, 'total_tokens': 206, 'input_token_details': {'cache_read': 0}})

In [69]:
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, come up with an answer that sounds super realistic. Provide some evidences to make it sound real.
Use three sentences maximum and keep the answer as concise as possible.
Always say "Thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(template) # here context is retrieved docs and question is user_query

In [68]:
custom_rag_prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Use the following pieces of context to answer the question at the end.\nIf you don\'t know the answer, come up with an answer that sounds super realistic. Provide some evidences to make it sound real.\nUse three sentences maximum and keep the answer as concise as possible.\nAlways say "Thanks for asking!" at the end of the answer.\n\n{context}\n\nQuestion: {question}\n\nHelpful Answer:')

In [73]:
# format docs
"\n\n".join(doc.page_content for doc in retrieved_docs)

'2024 Summer Olympics - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n2024 Summer Olympics - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommuni

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# format documents
def format_docs(retrieved_docs):
    return "\n\n".join(doc.page_content for doc in retrieved_docs)

# build a rag chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
) # this | pipeline symbol asically called as a expression language, taking the retrier, doc, prompt, llm and then it is generate the output



In [None]:
# let's simplyfy the rag_chain
final_retrived_docs  = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [72]:
for chunk in rag_chain.stream("Where is the Summer Olympics happening in 2022"):
    print(chunk, flush=True)

There was no Summer Olympics held in 2022. The Olympic Games typically follow a four-year cycle, with the most recent Summer Olympics taking place in 2020 (held in 2021) and the next scheduled
 for 2024 in Paris, as indicated by the provided context. The context only refers to "Beijing 2022" as the Winter Olympics, confirming no Summer event that year. Thanks for asking!



-----------making it simple------------------------

In [77]:
# format docs
final_retrived_docs = "\n\n".join(doc.page_content for doc in retrieved_docs)

In [78]:
# prompt refinement

template = """Use the following pieces of context to answer the question at the end.
Ensure you only answer basis the information that is available.
Don't make up any answer. If you don't know just say that you don't know.
Always say "Thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(template) # here context is retrieved docs and question is user_query

custom_rag_prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Use the following pieces of context to answer the question at the end.\nEnsure you only answer basis the information that is available.\nDon\'t make up any answer. If you don\'t know just say that you don\'t know.\nAlways say "Thanks for asking!" at the end of the answer.\n\n{context}\n\nQuestion: {question}\n\nHelpful Answer:')

In [79]:
example_messages = custom_rag_prompt.invoke(
    {
        "context": final_retrived_docs, "question": "where is the summer olympics happening"
    }
).to_messages()

In [80]:
example_messages

[HumanMessage(content='Use the following pieces of context to answer the question at the end.\nEnsure you only answer basis the information that is available.\nDon\'t make up any answer. If you don\'t know just say that you don\'t know.\nAlways say "Thanks for asking!" at the end of the answer.\n\n2024 Summer Olympics - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n2024 Summer Olympics - Wikipedia\n\n\n\n

In [84]:
response = llm.invoke(example_messages)

In [85]:
response.content

'The Summer Olympics are happening in Paris, France.\n\nThanks for asking!'