### Retrieval and Streamlit app


In [1]:
# pip install langchain sentence-transformers faiss-cpu unstructured python-dotenv beautifulsoup4


In [2]:
import os
import pickle
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WebBaseLoader
from dotenv import load_dotenv

# 🔹 Read settings from a .env file into the environment, then pick up the HF token
load_dotenv()
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")  # None when the variable is unset

# 1️⃣ Fetch the source articles from the web
urls = [
    # Earlier sample articles, kept for reference:
    # "https://finance.yahoo.com/video/why-deepseek-ai-evolution-not-194500651.html",
    # "https://finance.yahoo.com/personal-finance/banking/article/typical-budget-for-100000-salary-225021694.html"
    "https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html",
    "https://www.moneycontrol.com/news/business/markets/market-corrects-post-rbi-ups-inflation-forecast-icrr-bet-on-these-top-10-rate-sensitive-stocks-ideas-11142611.html",
]
loader = WebBaseLoader(urls)
docs = loader.load()

# 2️⃣ Break the pages into small overlapping chunks (smaller chunks retrieve better)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=60,
)
split_docs = splitter.split_documents(docs)

# Last expression of the cell: show how many chunks were produced
len(split_docs)


43

In [10]:
# Spot-check a single chunk (index 4) to see what the splitter produced.
# NOTE(review): in the recorded run this chunk is site-navigation text rather than
# article body, so the scraped pages likely carry boilerplate worth filtering.
split_docs[4]

Document(page_content='BuzzMC FeaturesMega Options FestivalMC LearnTECHNOLOGYPersonal TechAutoFintechMEDIAPodcastPhotosVideosWeb StoriesCRYPTOCURRENCYOTHERSEntertainmentSportsLifestyleHealth and FitnessEducationJobsScienceAstroReligionTravelBudgetBudget Top StoriesMarket ActionTax CalculatorTop GainersPersonal FinanceOpinionSectors in SpotlightPitch ReportRenewable EnergyEnergy Infra Markets Real estateInsurance Banking \xa0Consumer Durables InfrastructureDefenceAgricultureAuto Jewellery Market NewsBudget Analysis ToolsTeam India BudgetHistory Of BudgetFAQBudget Breakdown: Earnings & Expenses (*BE 2024-25)State Of The', metadata={'source': 'https://www.moneycontrol.com/news/business/banks/hdfc-bank-re-appoints-sanmoy-chakrabarti-as-chief-risk-officer-11259771.html', 'title': 'HDFC Bank re-appoints Sanmoy Chakrabarti as Chief Risk Officer', 'description': 'Chakrabarti has been appointed for a period of five years from December 14, 2023 to December 13, 2028.', 'language': 'en'})

In [4]:
# 3️⃣ Embeddings come from an open-source sentence-transformers model (no OpenAI)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# 4️⃣ Embed every chunk and store the vectors in a FAISS index (local, no API calls)
vector_index = FAISS.from_documents(
    documents=split_docs,
    embedding=embedding_model,
)


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# 5️⃣ Persist the vector index to disk so later runs can skip re-embedding.
# NOTE(review): LangChain's FAISS store also offers save_local()/load_local();
# consider switching if pickling the store ever fails — confirm against the docs.
with open("vector_index.pkl", "wb") as index_file:
    pickle.dump(vector_index, index_file)

In [8]:

# 6️⃣ To reuse a previously saved index instead of rebuilding it, uncomment:
# with open("vector_index.pkl", "rb") as f:
#     vector_index = pickle.load(f)
# (Only unpickle files you created yourself: pickle.load can run arbitrary code.)

# 7️⃣ Configure the language model (LLM) that answers questions.
# NOTE(review): HuggingFaceHub appears to call a model hosted on the Hugging Face Hub
# (authenticated via HUGGINGFACEHUB_API_TOKEN) rather than run one locally — confirm.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-small",
    model_kwargs=dict(temperature=0.5),
)

# 8️⃣ Combine the LLM with the FAISS-backed retriever into a QA-with-sources chain
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=vector_index.as_retriever(),
)





In [9]:
# 9️⃣ Run a sample question through the chain
query = "what is the price of Tiago iCNG?"
response = chain(
    {"question": query},
    return_only_outputs=True,  # response carries only the chain's outputs, not the inputs
)

# Header and result in one call; sep="\n" puts the response on its own line
print("\n🔍 Answer:", response, sep="\n")

Token indices sequence length is longer than the specified maximum sequence length for this model (1782 > 1024). Running this sequence through the model will result in indexing errors



🔍 Answer:
{'answer': 'iCNG', 'sources': ''}
