In [1]:
import os                     
import time                  
from datetime import datetime, timezone, timedelta 
import hashlib               
import warnings            
import re                     

import feedparser            
import requests              
from bs4 import BeautifulSoup   

from langchain_core.runnables import RunnablePassthrough 
from langchain_core.output_parsers import StrOutputParser 
from langchain.prompts import PromptTemplate           
from langchain_community.docstore.document import Document 

from langchain_chroma import Chroma               
from langchain_huggingface import HuggingFaceEmbeddings

from langchain_google_genai import ChatGoogleGenerativeAI 

import chromadb            
from chromadb.utils import embedding_functions

from dotenv import load_dotenv

# load_dotenv() returns False when no .env file (or an empty one) was found,
# so only claim success when something was actually loaded.
if load_dotenv():
    print(".env file with LLM API Key loaded")
else:
    print("WARNING: no .env file found or it was empty; relying on existing environment variables.")

.env file with LLM API Key loaded


In [2]:
# --- Configuration -----------------------------------------------------------
# Every tunable used by later cells lives here; each value is echoed so the
# notebook output records the settings of this run.

# RSS feeds that the ingestion cycle iterates over.
RSS_FEEDS = [
    "https://timesofindia.indiatimes.com/rssfeedstopstories.cms", # Top Stories
    "https://timesofindia.indiatimes.com/rssfeedmostrecent.cms",  # Most Recent Stories
    "https://timesofindia.indiatimes.com/rssfeeds/-2128936835.cms",   # India
    "https://timesofindia.indiatimes.com/rssfeeds/296589292.cms",    # World
    "https://timesofindia.indiatimes.com/rssfeeds/1898055.cms",    # Business
    "https://timesofindia.indiatimes.com/rssfeeds/66949542.cms",    # Tech
    "https://timesofindia.indiatimes.com/rssfeeds/4719148.cms",    # Sports
]

print(f"Monitoring {len(RSS_FEEDS)} Times of India RSS feeds.")

# Model name shared by both embedding objects created in the setup cell.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
print(f"Using embedding model: {EMBEDDING_MODEL_NAME}")

# On-disk location of the persistent ChromaDB store and the collection inside it.
VECTOR_DB_PATH = "./chroma_db_toi_scraped" 
COLLECTION_NAME = "toi_scraped_articles"
print(f"Vector database path: {VECTOR_DB_PATH}")
print(f"Vector database collection name: {COLLECTION_NAME}")

# LLM (Gemini): the API key is read from the environment; fail fast if it is
# missing so later cells do not break with a less obvious error.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY not found in .env file. Please create a .env file with your key.")
else:
    print("Google API Key loaded successfully.") 

LLM_MODEL_NAME = "gemini-2.0-flash"
# Alternative model, kept for quick switching:
#LLM_MODEL_NAME = "gemini-1.5-pro"
print(f"Using LLM model: {LLM_MODEL_NAME}") 

# Scraping limits.
ARTICLE_FETCH_LIMIT = 10  # max entries taken from each feed per cycle
SCRAPE_TIMEOUT = 15       # passed as `timeout` to requests.get (seconds)
SCRAPE_DELAY = 2          # passed to time.sleep between article requests (seconds)

# Number of documents the retriever returns per query.
SIMILARITY_TOP_K = 4
print(f"Retrieving top {SIMILARITY_TOP_K} documents for queries.") 

# Headers sent with every article request; a browser-like User-Agent.
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
print(f"Using User-Agent for scraping: {REQUEST_HEADERS['User-Agent']}")

Monitoring 7 Times of India RSS feeds.
Using embedding model: all-MiniLM-L6-v2
Vector database path: ./chroma_db_toi_scraped
Vector database collection name: toi_scraped_articles
Google API Key loaded successfully.
Using LLM model: gemini-2.0-flash
Retrieving top 4 documents for queries.
Using User-Agent for scraping: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36


In [3]:
# Derives the stable document id stored in ChromaDB for an article.
def get_article_hash(article_link):
    """Return the hex SHA-256 digest of ``article_link`` (encoded with the default UTF-8)."""
    digest = hashlib.sha256()
    digest.update(article_link.encode())
    return digest.hexdigest()

# Turns the retrieved documents into the single context string for the LLM prompt.
def format_docs(docs):
    """Render retrieved documents as numbered snippets separated by blank lines.

    Each snippet holds the document text (cut to 500 characters plus "..." when
    longer) followed by a source line built from the ``title`` (first 60
    characters), ``published_utc`` and ``link`` metadata keys; missing keys are
    shown as "N/A".

    Args:
        docs: iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict).

    Returns:
        str: the joined snippets, or an empty string when ``docs`` is empty.
    """
    def _render(position, doc):
        meta = doc.metadata
        title = meta.get('title', 'N/A')[:60]
        published = meta.get('published_utc', 'N/A')
        link = meta.get('link', 'N/A')

        body = doc.page_content
        if len(body) > 500:
            body = body[:500] + "..."

        return f"Snippet {position}:\n{body}\n[Source: {title}... ({published}) Link: {link}]"

    return "\n\n".join(_render(number, doc) for number, doc in enumerate(docs, start=1))

# Module-level set of article ids (SHA-256 hashes of links) already handled in
# this kernel session. fetch_process_scrape_feed checks it before querying the
# collection, so known articles are skipped without a DB lookup or re-scrape.
# Re-running this cell resets it to empty.
processed_article_ids_session = set()

In [4]:
# --- Embeddings --------------------------------------------------------------
# LangChain-side embedding object, used by the `vectorstore` wrapper below.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print("Embedding model initialized.")

# --- ChromaDB (native client) --------------------------------------------------
# The native collection is what the ingestion functions write to. It gets its
# own embedding function built from the same EMBEDDING_MODEL_NAME.
# NOTE(review): this presumably loads the same model a second time - confirm
# whether one of the two embedding objects could be reused.
chroma_client = chromadb.PersistentClient(path=VECTOR_DB_PATH)
chroma_embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)
toi_collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=chroma_embedding_function,
    metadata={"hnsw:space": "cosine"}  # cosine distance for the HNSW index
)
print(f"ChromaDB collection '{COLLECTION_NAME}' loaded/created. Initial count: {toi_collection.count()}")

# --- LangChain vector store ----------------------------------------------------
# Wraps the same client and collection name so the retriever reads what the
# ingestion cell wrote through `toi_collection`.
# NOTE(review): `persist_directory` is passed together with an explicit
# `client`; it looks redundant - confirm against the langchain_chroma docs.
vectorstore = Chroma(
    client=chroma_client,
    collection_name=COLLECTION_NAME,
    embedding_function=embedding_model,
    persist_directory=VECTOR_DB_PATH
)
print("LangChain vector store interface initialized.")

# --- LLM -----------------------------------------------------------------------
llm = ChatGoogleGenerativeAI(
    model=LLM_MODEL_NAME,
    google_api_key=GOOGLE_API_KEY,
    temperature=0.3
)
print("Google Gemini LLM initialized.")

# --- Retriever -----------------------------------------------------------------
# Returns the SIMILARITY_TOP_K documents for each query.
retriever = vectorstore.as_retriever(search_kwargs={'k': SIMILARITY_TOP_K})
print(f"Retriever initialized to fetch top {SIMILARITY_TOP_K} documents.")

# --- Prompt --------------------------------------------------------------------
# Instructs the model to answer only from the supplied context; the {context}
# and {question} placeholders are filled by the chain below.
template = """
You are an AI assistant answering questions about recent news events based ONLY on the provided article content.
Analyze the detailed article content provided in the 'Context' section below.
Answer the 'Question' using only information found in the 'Context'.
Do not add information that is not present in the provided text.
If the answer cannot be found in the context, state clearly "Based on the provided articles, I cannot answer this question."
Be factual and summarize relevant points concisely.

Context:
{context}

Question: {question}

Answer:
"""
prompt = PromptTemplate.from_template(template)
print("Prompt template created.")

# --- RAG chain -----------------------------------------------------------------
# The input query is passed through unchanged as "question"; "context" is the
# retriever output rendered by format_docs. The result goes through the prompt,
# the LLM, and a string output parser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
print("RAG chain constructed.")

Embedding model initialized.
ChromaDB collection 'toi_scraped_articles' loaded/created. Initial count: 56
LangChain vector store interface initialized.
Google Gemini LLM initialized.
Retriever initialized to fetch top 4 documents.
Prompt template created.
RAG chain constructed.


In [5]:
def scrape_toi_article(url):
    """Download a Times of India article page and return its body text.

    The page is fetched with ``REQUEST_HEADERS`` and ``SCRAPE_TIMEOUT``. The
    article text is taken from the first ``<div>`` whose class attribute
    contains ``_s30J``, with runs of whitespace collapsed to single spaces.

    A pause of ``SCRAPE_DELAY`` seconds follows every request attempt,
    including failed ones, so a failing or rate-limiting server is not hit
    again immediately by the next article.

    Args:
        url: absolute URL of the article page.

    Returns:
        str | None: the extracted text, or ``None`` when the request fails,
        the container div is not found, or it holds no text. Errors are
        printed, never raised.
    """
    try:
        print(f"  Scraping URL: {url}")
        try:
            response = requests.get(url, headers=REQUEST_HEADERS, timeout=SCRAPE_TIMEOUT)
            response.raise_for_status()
        finally:
            # Throttle after every attempt; previously the delay was skipped
            # when the request timed out or returned an HTTP error.
            time.sleep(SCRAPE_DELAY)

        soup = BeautifulSoup(response.text, 'html.parser')

        # Articles are usually found in a div with class '_s30J clearfix  ' on TOI.
        article_div = soup.find('div', class_=lambda x: x and '_s30J' in x)

        if not article_div:
            print(f"  WARNING: Could not find main article content container for {url}.")
            return None

        article_text = article_div.get_text(separator=" ", strip=True)
        # Collapse whitespace runs left over from joining nested elements.
        article_text = re.sub(r'\s{2,}', ' ', article_text).strip()

        if not article_text:
            print(f"  WARNING: Found article div for {url}, but no text extracted.")
            return None

        print(f"  Successfully scraped ~{len(article_text)} characters.")
        return article_text

    except requests.exceptions.Timeout:
        print(f"  ERROR: Timeout occurred while scraping {url}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"  ERROR: Failed to fetch/scrape {url}: {e}")
        return None
    except Exception as e:
        print(f"  ERROR: An unexpected error occurred during scraping {url}: {e}")
        return None


def fetch_process_scrape_feed(feed_url, collection):
    """Parse one RSS feed, scrape each new article and add it to ``collection``.

    At most ``ARTICLE_FETCH_LIMIT`` entries are considered. An entry is skipped
    when it has no link, when its id (SHA-256 of the link) is already in
    ``processed_article_ids_session`` or in the collection, or when scraping
    returns no text.

    Stored metadata: ``source_feed``, ``title``, ``link``, ``published_utc``
    and ``accessed_utc`` (both ISO-8601 strings in UTC). ``published_utc``
    falls back to the access time when the feed gives no usable date.

    Args:
        feed_url: URL of the RSS feed.
        collection: ChromaDB collection exposing ``get`` and ``add``.

    Returns:
        int: number of articles added; 0 if the feed could not be parsed.
    """
    articles_added_count = 0
    print(f"\nProcessing feed: {feed_url}")
    try:
        feed = feedparser.parse(feed_url)
        if feed.bozo:
            # bozo flags a malformed feed; entries may still be usable, so continue.
            print(f"WARNING: Feed potentially malformed: {feed_url} - Error: {feed.bozo_exception}")
    except Exception as e:
        print(f"ERROR: Error fetching or parsing feed {feed_url}: {e}")
        return 0

    for entry in feed.entries[:ARTICLE_FETCH_LIMIT]:
        try:
            article_link = entry.get("link")
            if not article_link:
                print(f"  Skipping entry without link: {entry.get('title')}")
                continue

            entry_hash = get_article_hash(article_link)

            # Duplicate already seen during this kernel session.
            if entry_hash in processed_article_ids_session:
                continue
            # Duplicate already stored in the database by an earlier run.
            existing = collection.get(ids=[entry_hash], limit=1)
            if existing and len(existing.get('ids', [])) > 0:
                processed_article_ids_session.add(entry_hash)
                continue

            scraped_content = scrape_toi_article(article_link)

            if not scraped_content:
                print(f"  Skipping article due to scraping failure: {entry.get('title')}")
                continue

            title = entry.get("title", "No Title Provided")
            now_utc_iso = datetime.now(timezone.utc).isoformat()

            # feedparser exposes the parsed date as a UTC struct_time under
            # `published_parsed` (or `updated_parsed`); there is no `pubDate`
            # key, so the old lookup always fell back to the access time.
            published_time_struct = entry.get("published_parsed") or entry.get("updated_parsed")
            published_ts_iso = now_utc_iso
            if published_time_struct:
                try:
                    # Build the datetime from the UTC fields directly;
                    # time.mktime would read them as local time.
                    published_ts_iso = datetime(
                        *published_time_struct[:6], tzinfo=timezone.utc
                    ).isoformat()
                except (TypeError, ValueError):
                    published_ts_iso = now_utc_iso

            metadata = {
                "source_feed": feed_url, "title": title, "link": article_link,
                "published_utc": published_ts_iso,
                "accessed_utc": now_utc_iso
            }

            collection.add(
                documents=[scraped_content],
                metadatas=[metadata],
                ids=[entry_hash]
            )
            processed_article_ids_session.add(entry_hash)
            articles_added_count += 1
            print(f"  Successfully Added (Scraped): {title[:60]}... (ID: {entry_hash[:8]})")

        except Exception as e:
            # One bad entry must not abort the rest of the feed.
            print(f"ERROR processing entry '{entry.get('title', 'N/A')}': {e}")
            continue

    return articles_added_count


def run_ingestion_cycle(feeds_to_process, collection):
    """Run one ingestion pass over every feed and print a summary.

    Args:
        feeds_to_process: iterable of RSS feed URLs.
        collection: ChromaDB collection handed to ``fetch_process_scrape_feed``
            and queried for its final document count.

    Returns:
        None. The totals and the cycle duration are printed.
    """
    started_at = time.time()
    print("\n--- Starting Ingestion Cycle (with Scraping) ---")

    # One count per feed, processed in the order given.
    added_per_feed = [
        fetch_process_scrape_feed(url, collection) for url in feeds_to_process
    ]
    new_articles = sum(added_per_feed)

    finished_at = time.time()
    print("\n--- Ingestion Cycle Complete ---")
    print(f"Total new articles scraped & added: {new_articles}")
    print(f"Current document count in collection: {collection.count()}")
    print(f"Cycle duration: {finished_at - started_at:.2f} seconds")

# Run one ingestion pass over all configured feeds, writing into the native
# ChromaDB collection created in the setup cell.
print("Running a single ingestion cycle to scrape and update the database...")
run_ingestion_cycle(RSS_FEEDS, toi_collection)
print("Single ingestion cycle finished.")

Running a single ingestion cycle to scrape and update the database...

--- Starting Ingestion Cycle (with Scraping) ---

Processing feed: https://timesofindia.indiatimes.com/rssfeedstopstories.cms
  Scraping URL: https://timesofindia.indiatimes.com/india/rajnath-singh-speaks-to-us-counterpart-pete-hegseth-amid-rising-tensions-with-pakistan-after-pahalgam-attack/articleshow/120794248.cms
  Successfully scraped ~3226 characters.
  Successfully Added (Scraped): 'US backs India's right to defend itself': Defence minister ... (ID: 7826b24a)
  Scraping URL: https://timesofindia.indiatimes.com/india/chun-chun-ke-jawab-milega-amit-shah-vows-retaliation-after-pahalgam-terror-attack/articleshow/120795407.cms
  Successfully scraped ~2047 characters.
  Successfully Added (Scraped): 'Chun chun ke jawab milega': Shah vows to avenge Pahalgam at... (ID: a91eb32b)
  Scraping URL: https://timesofindia.indiatimes.com/sports/cricket/ipl/top-stories/you-didnt-pay-rs-10-crore-for-r-ashwin-to-harbhajan-singh

In [7]:
def ask_question(query):
    """Send ``query`` through ``rag_chain`` and print the answer with its latency.

    Empty, whitespace-only or ``None`` queries are rejected with a message.
    Any exception raised while answering is caught and printed.

    Args:
        query: the user's question.

    Returns:
        None. All results are printed.
    """
    has_text = bool(query and query.strip())
    if not has_text:
        print("Please enter a valid question.")
        return

    print(f"\nQuery: {query}")
    print("Thinking...")
    try:
        started = time.time()
        answer = rag_chain.invoke(query)
        elapsed = time.time() - started

        print("\nAnswer:")
        print(answer)
        print(f"\n(Response generated in {elapsed:.2f} seconds)")

    except Exception as e:
        print(f"\nERROR: An error occurred while processing your query: {e}")

# Sample questions that exercise the RAG chain, asked in this order.
DEMO_QUESTIONS = [
    "What are the latest updates on the Indian economy?",
    "Summarize recent developments in technology reported by Times of India.",
    "Any major political news from Delhi reported recently?",
    "What happened in the latest cricket match covered by TOI?",
]
for demo_question in DEMO_QUESTIONS:
    ask_question(demo_question)


Query: What are the latest updates on the Indian economy?
Thinking...

Answer:
Here are the latest updates on the Indian economy based on the provided articles:

*   Foreign investors invested Rs 17,425 crore into India’s equity markets between April 21 and April 25, following a net investment of Rs 8,500 crore in the previous week.
*   The BSE Sensex increased nearly 4 per cent in April, driven by foreign investment inflows, monsoon rainfall forecasts, and positive prospects of an India-US trade agreement.
*   The combined market valuation of six of the top-10 most valued Indian companies surged by Rs 1,18,626.24 crore last week, with Tata Consultancy Services (TCS) leading the gains.
*   India is preparing to request relaxed export controls and enhanced access to advanced technologies from the United States under the proposed bilateral trade agreement (BTA).

(Response generated in 3.45 seconds)

Query: Summarize recent developments in technology reported by Times of India.
Thinking

In [11]:
# Interactive prompt: keeps asking for questions until the user types 'quit'.
print("\n--- Interactive Query Mode ---")
print("Type 'quit' to exit.")
while True:
    try:
        user_query = input("\nEnter your question: ")
    except (EOFError, KeyboardInterrupt):
        # Stdin was closed or the user interrupted the prompt: leave the loop
        # cleanly instead of surfacing a traceback.
        print()
        break
    # Ignore surrounding whitespace so " quit" or "quit " also exits rather
    # than being sent to the LLM as a question.
    if user_query.strip().lower() == 'quit':
        break
    ask_question(user_query)
print("Exiting interactive mode.")


--- Interactive Query Mode ---
Type 'quit' to exit.



Enter your question:  quit


Exiting interactive mode.
