In [1]:
import os
import asyncio
import nest_asyncio
from dotenv import load_dotenv  # <-- ADD THIS LINE

# LangChain Community for advanced loaders and vector stores
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.vectorstores import FAISS
from langchain_community.document_transformers import Html2TextTransformer

# LangChain OpenAI for Azure/OpenAI specific integrations
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

# Core LangChain components
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

USER_AGENT environment variable not set, consider setting it to identify your requests.


Initialize LLM and Embeddings

In [2]:
# Load Azure OpenAI credentials/settings from the project's custom dotenv file.
# load_dotenv() returns False when the file is missing or contains no
# variables; report that here instead of letting it surface later as an
# opaque authentication error. It is only a warning because the variables may
# already be exported in the surrounding environment.
ENV_FILE = "OpenAI_APIkey.env"
if not load_dotenv(dotenv_path=ENV_FILE):
    print(
        f"WARNING: no variables were loaded from '{ENV_FILE}'; "
        "relying on variables already present in the environment."
    )

# --- Initialize the LLM ---
# No key or endpoint is passed explicitly: the Azure clients pick their
# connection settings up from environment variables.
# Temperature is 0 because this model answers factual questions from
# retrieved text; a high temperature makes the answers vary between runs.
LLM_TEMPERATURE = 0.0
llm = AzureChatOpenAI(
    deployment_name="gpt-4.1-mini-2",
    # NOTE(review): the deployment name suggests a gpt-4.1-mini model while
    # model_name says "gpt-4" -- confirm which model the deployment serves.
    model_name="gpt-4",
    temperature=LLM_TEMPERATURE,
    max_tokens=500
)

# --- Initialize the Embeddings ---
# chunk_size=1 puts at most one text in each embeddings request.
# NOTE(review): presumably a workaround for an Azure batch-size limit --
# confirm whether a larger batch is allowed; it would cut the request count.
embeddings = AzureOpenAIEmbeddings(
    deployment="text-embedding-ada-002",
    chunk_size=1
)

print("LLM and Embeddings initialized successfully.")

LLM and Embeddings initialized successfully.


Load, Transform, and Split Data

In [3]:
# Source articles that make up the knowledge base.
urls = [
    "https://www.moneycontrol.com/news/business/economy/net-direct-tax-revenue-jumps-6-3-to-rs-11-89-lakh-crore-till-oct-12-13613684.html",
    "https://www.moneycontrol.com/news/economy/policy/inflation-hits-over-eight-year-low-of-1-54-in-september-but-non-vegetarians-and-gold-buyers-feel-the-pinch-13613394.html"
]

# Patch asyncio so that asyncio.run() below can be called while another
# event loop (the notebook's) is already running.
nest_asyncio.apply()

# Loader that fetches the pages through an async API.
loader = AsyncHtmlLoader(urls)


async def load_docs():
    """Fetch every URL held by ``loader`` and return the loaded documents."""
    fetched = await loader.aload()
    return fetched


# Stage 1: documents exactly as returned by the loader (HTML content).
html_documents = asyncio.run(load_docs())

# Stage 2: HTML -> plain text.
html2text = Html2TextTransformer()
all_documents = list(html2text.transform_documents(html_documents))

# Stage 3: split into chunks of up to 1000 characters, overlapping by 200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(all_documents)

# Stop here rather than building an empty vector index later on.
if len(docs) == 0:
    raise ValueError("No documents were loaded from the URLs.")

print(f"Loaded and split into {len(docs)} chunks from {len(urls)} URLs.")

Fetching pages: 100%|##########| 2/2 [00:00<00:00,  3.16it/s]


Loaded and split into 40 chunks from 2 URLs.


Create and Save FAISS Vector Index

In [4]:
# Directory the index is persisted to; the loading cell reads it back from here.
folder_path = "faiss_index"

# Embed every chunk and build a FAISS index over the resulting vectors,
# then write the index to disk.
vector_index = FAISS.from_documents(documents=docs, embedding=embeddings)
vector_index.save_local(folder_path=folder_path)
print(f"FAISS index saved to '{folder_path}'.")


FAISS index saved to 'faiss_index'.


Load Index, Create Chain

In [5]:
# --- Reload the persisted FAISS index ---
# SECURITY: load_local() deserializes pickled data, which can execute
# arbitrary code. allow_dangerous_deserialization=True is acceptable here only
# because this notebook wrote `folder_path` itself; never enable it for index
# files obtained from an untrusted source.
vector_index_loaded = FAISS.load_local(
    folder_path,
    embeddings,
    allow_dangerous_deserialization=True,
)

# --- Build the question-answering chain ---
# The "stuff" chain type places all retrieved chunks into a single prompt.
# return_source_documents=True makes the chain return the retrieved documents
# next to the answer, so the notebook can list where the answer came from.
# NOTE(review): RetrievalQA is marked deprecated in recent LangChain releases;
# consider migrating once the result keys used downstream can change with it.
retriever = vector_index_loaded.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

print("Chain is ready.")

Chain is ready.


Ask a Question 

In [6]:
# --- Execute the query ---
query = "Net direct tax revenue jumped ?"
print(f"\nExecuting query: '{query}'")

# The chain is called with the question under the "query" key; the answer is
# read from "result" and the retrieved documents from "source_documents".
result = chain.invoke({"query": query})

# --- Print the final answer and sources ---
print("\n" + "="*20 + " RESULT " + "="*20)
print("\n--- Answer ---")
# `or` also covers a None/empty answer, which would otherwise crash .strip()
# or print a blank line.
print((result.get('result') or 'No answer found.').strip())

print("\n--- Sources ---")
# De-duplicate while keeping first-seen order: dict.fromkeys() preserves the
# order in which the chain returned the documents, whereas a set would print
# the URLs in arbitrary order. Documents without a 'source' entry are skipped
# instead of raising KeyError.
source_documents = result.get('source_documents') or []
unique_sources = list(dict.fromkeys(
    doc.metadata.get('source')
    for doc in source_documents
    if doc.metadata.get('source')
))

if unique_sources:
    for source in unique_sources:
        print(source)
else:
    print("No sources found.")

print("\n" + "="*48)


Executing query: 'Net direct tax revenue jumped ?'


--- Answer ---
Net direct tax revenue jumped 6.33% to over Rs 11.89 lakh crore till October 12 in the current fiscal year.

--- Sources ---
https://www.moneycontrol.com/news/business/economy/net-direct-tax-revenue-jumps-6-3-to-rs-11-89-lakh-crore-till-oct-12-13613684.html

