In [12]:
import os
os.environ['GEMINI_API_KEY'] = 'AIzaSyBYMNWbXATdrV7BAZSbjIVmrT7H4iNS0qQ'

In [13]:
# ----------------- API KEY SAFETY CHECK -----------------
# IMPORTANT: rotate the key you pasted earlier and stop hardcoding keys.
import sys
_gemini_key = os.environ.get("GEMINI_API_KEY")
if not _gemini_key:
    raise RuntimeError("GEMINI_API_KEY not set - set it as an environment variable instead of hardcoding.")
# If you suspect the key was leaked, stop now and rotate it at the provider console.
# Note: Do NOT paste the key into public channels.
# ------------------------------------------------------


In [14]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from google import genai
from google.genai import types
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [15]:
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
# Replace prints with logger.info / logger.warning later (I point those out below).


In [16]:
client = genai.Client()

resp = client.models.generate_content(
    model="gemini-2.5-flash",
    contents="Explain how AI works in a few words",
    config=types.GenerateContentConfig(
        temperature=0.0,
        max_output_tokens=500,
    ),
)

2025-08-26 00:05:39,682 INFO: AFC is enabled with max remote calls: 10.
2025-08-26 00:05:44,197 INFO: HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent "HTTP/1.1 200 OK"
2025-08-26 00:05:44,199 INFO: AFC remote call 1 is done.


In [17]:
loaders = UnstructuredURLLoader(urls=[
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html"
])
data = loaders.load() 
len(data)

2

In [18]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# As data is of type documents we can directly use split_documents over split_text in order to get the chunks.
docs = text_splitter.split_documents(data)

In [19]:
len(docs)

18

In [20]:
docs[0]

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹50 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nNetwork 18\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nMoneycontrol\n\nGo PRO NowPRO\n\nMoneycontrol PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsWall Street rises as Tesla soars on AI optimism\n\nTrending Topic

In [21]:
if 'GOOGLE_API_KEY' not in os.environ:
    os.environ['GOOGLE_API_KEY'] = os.environ.get('GEMINI_API_KEY', '')

from langchain_google_genai import GoogleGenerativeAIEmbeddings, GoogleGenerativeAI


In [22]:
embeddings_gemini = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

# Build FAISS index from your already-created docs
vectorindex_gemini = FAISS.from_documents(docs, embeddings_gemini)

2025-08-26 00:05:49,374 INFO: Loading faiss with AVX2 support.
2025-08-26 00:05:49,440 INFO: Successfully loaded faiss with AVX2 support.


In [23]:
index_dir = "faiss_gemini_index"
vectorindex_gemini.save_local(index_dir) 

In [24]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, GoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain

# directory where we save the faiss index
index_dir = "faiss_gemini_index"

# create embeddings (must be the same type used to build the index)
embeddings_gemini = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

# If index folder doesn't exist, build and save it (one-time)
if not os.path.exists(index_dir) or not os.listdir(index_dir):
    logger.info("Index not found on disk. Building FAISS index from docs and saving to disk...")
    vectorindex_gemini = FAISS.from_documents(docs, embeddings_gemini)
    vectorindex_gemini.save_local(index_dir)
    print("Index built and saved to", index_dir)
else:
    print("Index folder found. Skipping index build.")

# -- SAFE LOADING: only enable this if you trust the files in index_dir --
try:
    vectorIndex_gemini = FAISS.load_local(
        index_dir,
        embeddings_gemini,
        allow_dangerous_deserialization=True  # set True because we created the files locally
    )
    print("FAISS index loaded OK.")
except ValueError as e:
    print("Error loading FAISS index:", e)
    raise

# create Gemini LLM wrapper and chain
llm_gemini = GoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.0)
retriever = vectorIndex_gemini.as_retriever(search_kwargs={"k": 4})
chain_gemini = RetrievalQAWithSourcesChain.from_llm(llm=llm_gemini, retriever=retriever)

query = "what is the price of Tiago iCNG?"
langchain.debug = True
result = chain_gemini.invoke({"question": query})

# ---------- ADD THIS PART (normalizing sources) ----------
def normalize_sources(sources_str: str):
    """Split messy sources into clean list of unique links"""
    if not sources_str:
        return []
    # split by spaces or commas, remove duplicates, keep only non-empty parts
    parts = [s.strip() for s in sources_str.replace("\n", " ").split(" ") if s.strip()]
    unique_links = []
    for p in parts:
        if p not in unique_links:
            unique_links.append(p)
    return unique_links

# clean sources
clean_sources = normalize_sources(result.get("sources", ""))

# final nicely formatted output
print("Answer:", result.get("answer", ""))
print("Sources:")
for src in clean_sources:
    print(" -", src)
# --------------------------------------------------------



Index folder found. Skipping index build.
FAISS index loaded OK.
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Set Alert\n\nlive\n\nbselive\n\nnselive\n\nVolume\n\nTodays L/H\n\nMore\n\nTata Motors on Friday launched the CNG variant of its micro SUV Punch priced between Rs 7.1 lakh and Rs 9.68 lakh (ex-showroom, Delhi).\n\nThe Punch iCNG is equipped with the company's proprietary twin-cylinder technology with enhanced safety features like a micro-switch to keep the car switched off at the time of refuelling and thermal incident protect