In [1]:
# Run this cell and paste the API key in the prompt
import os
import getpass

os.environ['GOOGLE_API_KEY'] = getpass.getpass('Gemini API Key:')

Gemini API Key: ········


In [2]:
from langchain import PromptTemplate
from langchain import hub
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import Chroma

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
import re
from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader
from langchain.docstore.document import Document

In [4]:
# 5. Load the article from web
loader = WebBaseLoader("https://economictimes.indiatimes.com/markets/stocks/news/rs-26-lakh-crore-richer-in-a-week-what-should-stock-market-investors-do-now/articleshow/121244746.cms")
docs = loader.load()

In [5]:
# Before Preprocessing 
docs[0].page_content[:1000]

"Sensex Nifty Outlook: Rs 26 lakh crore richer in a week! What should stock market investors do now? - The Economic Times  Benchmarks Nifty25,019.80-42.3Precious Metal Gold (MCX) (Rs/10g.)92,480.00-689.0Enter search text:English EditionEnglish Editionहिन्दीગુજરાતીमराठीবাংলাಕನ್ನಡമലയാളംதமிழ்తెలుగు | Today's ePaper\n            \t\t\t        My Watchlist\n                            SubscribeSign InHomeETPrimeMarketsMarket DataAI Masterclass NewsIndustrySMEPoliticsWealthMFTechAICareersOpinionNRIPanacheMore MenuStocksNewsLive BlogStock Live BlogEarningsPodcastMarket ClassroomDons of Dalal StreetRecosStock Reports PlusNewMy ScreenerCandlestick ScreenerStock ScreenerStock WatchMarket CalendarStock Price QuotesOptionsIPOs/FPOsExpert ViewsInvestment IdeasCommoditiesViewsNewsOthersMentha OilPrecious MetalsGold MGoldSilverGold PetalSilver MicroSilver MGold GuineaOil & EnergyNatural GasCrude OilCrude Oil MiniBase MetalsAluminiumZinc MiniLead MiniCopperZincNickelAluminium MiniLeadPlantationKapasCo

In [None]:
# List of non-English language labels to remove1154
NON_ENGLISH_LABELS = [
    'हिन्दी', 'ગુજરાતી', 'मराठी', 'বাংলা',
    'ಕನ್ನಡ', 'മലയാളം', 'தமிழ்', 'తెలుగు',
    'اردو', 'ଓଡ଼ିଆ', 'ਪੰਜਾਬੀ'
]

# 1. Remove non-English language menu items
def remove_non_english_language_menu(text):
    pattern = r'(' + '|'.join(re.escape(lang) for lang in NON_ENGLISH_LABELS) + r')(\s*\|\s*)?'
    return re.sub(pattern, '', text)

# 2. Remove UI-style date junk like "PM | 16 May"
def remove_ui_date_patterns(text):
    return re.sub(r'\b(AM|PM)\s*\|\s*\d{1,2}\s+\w+\b', '', text)

# 3. Fix spacing issues like "PlusNewMy" → "Plus New My"
def fix_missing_spaces(text):
    # Add space between lowercase-uppercase and digit-uppercase
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    text = re.sub(r'(?<=[0-9])(?=[A-Z])', ' ', text)
    # Normalize extra spaces
    return re.sub(r'\s{2,}', ' ', text).strip()

# 4. Main preprocessing function
def preprocess_text(raw_html):
    # a. Remove HTML tags
    soup = BeautifulSoup(raw_html, "html.parser")
    text = soup.get_text(separator="\n")

    # b. Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # c. Remove unwanted menu and patterns
    text = remove_non_english_language_menu(text)
    text = remove_ui_date_patterns(text)
    text = fix_missing_spaces(text)

    return text.strip()

# 5. Extract raw text from the first document
raw_text = docs[0].page_content

# 6. Preprocess the text
final_text = preprocess_text(raw_text)

# 7. Convert to LangChain Document format
final_docs = [Document(page_content=final_text, metadata={"source": "local"})]


BeautifulSoup in This Project?
|  Purpose                         |  How It Helps                                             |
| ---------------------------------- | ----------------------------------------------------------- |
| Extract clean data from websites   | Turns messy HTML into usable context for your QA system     |
| Feed real-world documents into RAG | Enables dynamic context generation (live news, blogs, etc.) |
| Automate content ingestion         | Ideal for scraping multiple pages as context for answering  |


In [7]:
# 8. After Preprocessing Preview result
print(final_docs[0].page_content[:1000])  # print only first 1000 characters

Sensex Nifty Outlook: Rs 26 lakh crore richer in a week! What should stock market investors do now? - The Economic Times Benchmarks Nifty25,019.80-42.3 Precious Metal Gold (MCX) (Rs/10g.)92,480.00-689.0 Enter search text:English Edition English Edition Today's e Paper My Watchlist Subscribe Sign In Home ETPrime Markets Market Data AI Masterclass News Industry SMEPolitics Wealth MFTech AICareers Opinion NRIPanache More Menu Stocks News Live Blog Stock Live Blog Earnings Podcast Market Classroom Dons of Dalal Street Recos Stock Reports Plus New My Screener Candlestick Screener Stock Screener Stock Watch Market Calendar Stock Price Quotes Options IPOs/FPOs Expert Views Investment Ideas Commodities Views News Others Mentha Oil Precious Metals Gold MGold Silver Gold Petal Silver Micro Silver MGold Guinea Oil & Energy Natural Gas Crude Oil Crude Oil Mini Base Metals Aluminium Zinc Mini Lead Mini Copper Zinc Nickel Aluminium Mini Lead Plantation Kapas Cotton Candy Forex Forex News Currency Co

In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [9]:
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [10]:
# Save to disk
vectorstore = Chroma.from_documents(
                     documents=final_docs,           # Data
                     embedding=gemini_embeddings,    # Embedding model
                     persist_directory="./chroma_db" # Directory to save data
                     )

In [15]:
# Load from disk
vectorstore_disk = Chroma(
                        persist_directory="./chroma_db",       # Directory of db
                        embedding_function=gemini_embeddings)   # Embedding model


In [16]:
retriever = vectorstore_disk.as_retriever(search_kwargs={"k": 1})

In [17]:
print(len(retriever.get_relevant_documents("MMLU")))

  print(len(retriever.get_relevant_documents("MMLU")))


1


In [13]:
from langchain_google_genai import ChatGoogleGenerativeAI

# If there is no environment variable set for the API key, you can pass the API
# key to the parameter `google_api_key` of the `ChatGoogleGenerativeAI` function:
# `google_api_key="key"`.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash",
                 temperature=0.7, top_p=0.85)
# Allowing moderate creativity (temperature=0.7)
# Letting the model sample from top 85% likely words, ignoring the rest (top_p=0.85)

In [23]:
# Prompt template to query Gemini
llm_prompt_template = """You are an assistant for question-answering tasks.
Use the following context to answer the question.
If you don't know the answer, just say that you don't know.
Use five sentences maximum and keep the answer concise.\n
Question: {question} \nContext: {context} \nAnswer:"""

llm_prompt = PromptTemplate.from_template(llm_prompt_template)

print(llm_prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} template="You are an assistant for question-answering tasks.\nUse the following context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse five sentences maximum and keep the answer concise.\n\nQuestion: {question} \nContext: {context} \nAnswer:"


In [24]:
# Combine data from documents to readable string format.
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Create stuff documents chain using LCEL.
#
# This is called a chain because you are chaining together different elements
# with the LLM. In the following example, to create the stuff chain, you will
# combine the relevant context from the website data matching the question, the
# LLM model, and the output parser together like a chain using LCEL.
#
# The chain implements the following pipeline:
# 1. Extract the website data relevant to the question from the Chroma
#    vector store and save it to the variable `context`.
# 2. `RunnablePassthrough` option to provide `question` when invoking
#    the chain.
# 3. The `context` and `question` are then passed to the prompt where they
#    are populated in the respective variables.
# 4. This prompt is then passed to the LLM (`gemini-pro`).
# 5. Output from the LLM is passed through an output parser
#    to structure the model's response.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | llm_prompt
    | llm
    | StrOutputParser()
)

Model Testing Using Query

In [25]:
rag_chain.invoke("What is Gemini?")

'I am sorry, but this document does not contain any information about Gemini.  Therefore, I cannot answer your question.'

In [27]:
rag_chain.invoke("How much investor wealth was added in the Indian stock market in the past week?")

'Approximately Rs 26.5 lakh crore of investor wealth was added to the Indian stock market in the past week.  This surge was fueled by a ceasefire between India and Pakistan, low inflation, and strong foreign institutional investor (FII) flows.  The Nifty index is now close to its all-time high.  However, experts caution about valuation risks, particularly in mid- and small-cap stocks.'

In [28]:
rag_chain.invoke("What were the key reasons behind the recent stock market rally mentioned in the article?")

'The stock market rally was driven by a ceasefire between India and Pakistan, significantly reducing geopolitical tensions.  Low inflation, at a nearly six-year low, fueled hopes of a dovish RBI policy.  Strong foreign institutional investor (FII) inflows, totaling ₹15,925 crore, also contributed.  Broader gains across mid-cap and infrastructure sectors further boosted optimism. However, experts cautioned about valuation risks, particularly in mid- and small-cap stocks.'

In [29]:
rag_chain.invoke("What geopolitical event triggered optimism in the markets according to the article?")

"A surprise ceasefire between India and Pakistan significantly reduced geopolitical tensions, triggering optimism.  This was followed by positive economic data, including lower inflation and increased foreign investment.  Stronger-than-expected Q4 earnings also contributed to the market's positive response. The combination of these factors fueled a sharp rise in the Sensex and Nifty.  Experts, however, cautioned about valuation risks, particularly in mid- and small-cap stocks."

In [30]:
rag_chain.invoke("What was the inflation rate in April, and why is it significant?")

'In April, the inflation rate was 3.16%, the lowest in almost six years.  This is significant because it suggests a dovish stance from the Reserve Bank of India (RBI) in upcoming policy meetings.  Lower inflation generally boosts market optimism and economic growth. The low inflation rate contributed to a significant rise in the Sensex and Nifty.  This positive economic indicator fueled investor confidence.'

In [31]:
rag_chain.invoke("How much did the Sensex and Nifty rise over the week?")

'The Sensex rose 2,876 points, while the Nifty surged 4.2% over the week.  This resulted in a nearly Rs 26.5 lakh crore increase in investor wealth.  The article does not provide specific starting and ending values for either index.'

In [34]:
rag_chain.invoke("What is the RBI expected to do in its next policy decision, and why?")

'The RBI is expected to maintain an accommodative stance in its next policy decision due to low inflation (3.16%, the lowest in nearly six years).  This low inflation provides the RBI with room to keep interest rates low.  Experts believe this, combined with improving earnings and foreign investment, supports continued market momentum.  However, some caution valuation risks, particularly in mid- and small-cap stocks.'

RAGAS Evaluation

In [2]:
import ragas.metrics as metrics
print(dir(metrics))


['AgentGoalAccuracyWithReference', 'AgentGoalAccuracyWithoutReference', 'AnswerAccuracy', 'AnswerCorrectness', 'AnswerRelevancy', 'AnswerSimilarity', 'AspectCritic', 'BleuScore', 'ContextEntityRecall', 'ContextPrecision', 'ContextRecall', 'ContextRelevance', 'ContextUtilization', 'DataCompyScore', 'DistanceMeasure', 'ExactMatch', 'FactualCorrectness', 'Faithfulness', 'FaithfulnesswithHHEM', 'InstanceRubrics', 'LLMContextPrecisionWithReference', 'LLMContextPrecisionWithoutReference', 'LLMContextRecall', 'LLMSQLEquivalence', 'Metric', 'MetricOutputType', 'MetricType', 'MetricWithEmbeddings', 'MetricWithLLM', 'MultiModalFaithfulness', 'MultiModalRelevance', 'MultiTurnMetric', 'NoiseSensitivity', 'NonLLMContextPrecisionWithReference', 'NonLLMContextRecall', 'NonLLMStringSimilarity', 'ResponseGroundedness', 'ResponseRelevancy', 'RougeScore', 'RubricsScore', 'SemanticSimilarity', 'SimpleCriteriaScore', 'SingleTurnMetric', 'StringPresence', 'SummarizationScore', 'ToolCallAccuracy', 'TopicAdhe

In [6]:
from ragas.metrics import faithfulness, answer_correctness, context_precision, context_recall
from ragas import evaluate
from datasets import Dataset

In [7]:
# Step 1: Prepare Data
data = {
    "question": [
        "What were the primary drivers behind the ₹26 lakh crore surge in investor wealth over the past week?",
        "How did geopolitical developments, such as the ceasefire between India and Pakistan, influence the market rally?",
        "What role did the drop in April inflation to 3.16% play in boosting investor confidence?",
        "Which sectors showed the most growth during this rally, and why were midcap and small-cap stocks particularly affected?",
        "What levels should investors watch for Nifty's support and resistance, according to analysts?"
    ],
    
    "answer": [
        "The surge was driven by geopolitical relief (ceasefire), low inflation, strong FII inflows, and a breakout from a three-week range. This led to a ₹26.5 lakh crore increase in investor wealth as Sensex and Nifty soared.",
        "The surprise ceasefire between India and Pakistan eased geopolitical tensions, which significantly boosted investor sentiment and contributed to the strong market rally.",
        "The drop in April inflation to 3.16%, the lowest in six years, raised hopes of a dovish RBI stance, increasing investor confidence and fueling the rally.",
        "Defence, metals, NBFCs, and auto sectors led the rally. Midcap and small-cap stocks gained 7–9% due to broad market optimism, but concerns remain due to their high valuations.",
        "Analysts suggest resistance for Nifty at 25,200–25,600 levels and support at 24,800 and stronger at 24,400. A buy-on-dips strategy is recommended with sector rotation in mind."
    ],
    
    "contexts": [
        ["Markets soared as geopolitical relief, soft inflation, and strong FII flows lifted Sensex and Nifty sharply. Nearly ₹26.5 lakh crore in investor wealth was created in just one week."],
        ["A surprise ceasefire between India and Pakistan slashed geopolitical tension overnight, boosting market sentiment."],
        ["April inflation dropped to 3.16%, the lowest in nearly six years, giving RBI room to remain accommodative and improving investor outlook."],
        ["Midcap and small-cap indices jumped 7% and 9% respectively. Defence and infrastructure sectors led gains, although valuations are now stretched."],
        ["The Nifty broke out of a 3-week range. Analysts see resistance at 25,200–25,600, and support at 24,800 and 24,400. A buy-on-dips strategy is advised."]
    ],
    
    "ground_truth": [
        "Investor wealth surged ₹26.5 lakh crore due to factors like geopolitical relief (ceasefire), low inflation (3.16%), strong foreign inflows (₹15,925 crore), and broader sector participation including midcaps and small-caps.",
        "The ceasefire announcement between India and Pakistan significantly reduced geopolitical risks, which contributed to improved market sentiment and was one of the main triggers for the rally.",
        "The inflation drop to 3.16% raised expectations of a dovish RBI policy, boosting market optimism and allowing continued foreign investments.",
        "Sectors such as defence, metals, NBFCs, and auto performed strongly. Midcap and small-cap stocks rose due to broader market participation and FOMO, although valuation risks remain high.",
        "Nifty's resistance levels are around 25,200–25,600, and support levels are 24,800 and 24,400. Analysts recommend a buy-on-dips strategy and caution on overvalued segments."
    ]
}

# Step 2: Create HuggingFace Dataset
dataset = Dataset.from_dict(data)

# Step 3: Evaluate with RAGAS
results = evaluate(
    dataset,
    metrics=[faithfulness, answer_correctness, context_precision, context_recall]
)

# Step 4: Display Results
print(results)

Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}})
Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}})
Exception raised in Job[6]: RateLimitError(Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'in

{'faithfulness': nan, 'answer_correctness': nan, 'context_precision': nan, 'context_recall': nan}


In [None]:
# Step 1: Prepare Data
data = {
    "question": [
        "What were the primary drivers behind the ₹26 lakh crore surge in investor wealth over the past week?",
        "How did geopolitical developments, such as the ceasefire between India and Pakistan, influence the market rally?",
        "What role did the drop in April inflation to 3.16% play in boosting investor confidence?",
        "Which sectors showed the most growth during this rally, and why were midcap and small-cap stocks particularly affected?",
        "What levels should investors watch for Nifty's support and resistance, according to analysts?"
    ],
    
    "answer": [
        "The surge was driven by geopolitical relief (ceasefire), low inflation, strong FII inflows, and a breakout from a three-week range. This led to a ₹26.5 lakh crore increase in investor wealth as Sensex and Nifty soared.",
        "The surprise ceasefire between India and Pakistan eased geopolitical tensions, which significantly boosted investor sentiment and contributed to the strong market rally.",
        "The drop in April inflation to 3.16%, the lowest in six years, raised hopes of a dovish RBI stance, increasing investor confidence and fueling the rally.",
        "Defence, metals, NBFCs, and auto sectors led the rally. Midcap and small-cap stocks gained 7–9% due to broad market optimism, but concerns remain due to their high valuations.",
        "Analysts suggest resistance for Nifty at 25,200–25,600 levels and support at 24,800 and stronger at 24,400. A buy-on-dips strategy is recommended with sector rotation in mind."
    ],
    
    "contexts": [
        ["Markets soared as geopolitical relief, soft inflation, and strong FII flows lifted Sensex and Nifty sharply. Nearly ₹26.5 lakh crore in investor wealth was created in just one week."],
        ["A surprise ceasefire between India and Pakistan slashed geopolitical tension overnight, boosting market sentiment."],
        ["April inflation dropped to 3.16%, the lowest in nearly six years, giving RBI room to remain accommodative and improving investor outlook."],
        ["Midcap and small-cap indices jumped 7% and 9% respectively. Defence and infrastructure sectors led gains, although valuations are now stretched."],
        ["The Nifty broke out of a 3-week range. Analysts see resistance at 25,200–25,600, and support at 24,800 and 24,400. A buy-on-dips strategy is advised."]
    ],
    
    "ground_truth": [
        "Investor wealth surged ₹26.5 lakh crore due to factors like geopolitical relief (ceasefire), low inflation (3.16%), strong foreign inflows (₹15,925 crore), and broader sector participation including midcaps and small-caps.",
        "The ceasefire announcement between India and Pakistan significantly reduced geopolitical risks, which contributed to improved market sentiment and was one of the main triggers for the rally.",
        "The inflation drop to 3.16% raised expectations of a dovish RBI policy, boosting market optimism and allowing continued foreign investments.",
        "Sectors such as defence, metals, NBFCs, and auto performed strongly. Midcap and small-cap stocks rose due to broader market participation and FOMO, although valuation risks remain high.",
        "Nifty's resistance levels are around 25,200–25,600, and support levels are 24,800 and 24,400. Analysts recommend a buy-on-dips strategy and caution on overvalued segments."
    ]
}

 RAG Evaluation Using Sentence Transformers |  RAG Evaluation Without OpenAI or Ragas

In [10]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from datasets import Dataset

In [11]:


# Load your dataset (use the same data as above)
df = pd.DataFrame(data)

# Load a local embedding model (free!)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Helper to get average cosine similarity
def compute_similarity(a, b):
    emb1 = model.encode(a, convert_to_tensor=True)
    emb2 = model.encode(b, convert_to_tensor=True)
    return float(util.cos_sim(emb1, emb2)[0][0])

# Evaluation functions
def evaluate_row(row):
    result = {}
    
    # Faithfulness ≈ answer vs context
    result["faithfulness"] = compute_similarity(row["answer"], " ".join(row["contexts"]))

    # Relevance ≈ answer vs question
    result["relevance"] = compute_similarity(row["answer"], row["question"])

    # Ground truth correctness ≈ answer vs ground truth
    result["answer_correctness"] = compute_similarity(row["answer"], row["ground_truth"])
    
    return pd.Series(result)

# Apply to each row
df_scores = df.apply(evaluate_row, axis=1)

# Combine with original
final_df = pd.concat([df, df_scores], axis=1)

# Show results
print(final_df[["question", "faithfulness", "relevance", "answer_correctness"]])


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


                                            question  faithfulness  relevance  \
0  What were the primary drivers behind the ₹26 l...      0.842021   0.742444   
1  How did geopolitical developments, such as the...      0.820882   0.751923   
2  What role did the drop in April inflation to 3...      0.832039   0.789569   
3  Which sectors showed the most growth during th...      0.657232   0.790945   
4  What levels should investors watch for Nifty's...      0.846471   0.707772   

   answer_correctness  
0            0.768280  
1            0.826204  
2            0.814497  
3            0.724177  
4            0.874823  
