In [3]:
# Multilingual RSS News Processor with Advanced User Query Support
# For Google Colab

# ======== SETUP AND DEPENDENCIES ========
!pip install feedparser transformers datasets torch sentence-transformers nltk scikit-learn pandas sacremoses easynmt

import feedparser
import torch
import nltk
import json
import os
import numpy as np
import pandas as pd
import re
import requests
import time
import socket
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Tuple
from transformers import (
    AutoModelForSeq2SeqLM,
    T5ForConditionalGeneration,
    T5Tokenizer,
    pipeline
)
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from google.colab import drive
from tqdm.notebook import tqdm
from easynmt import EasyNMT

# Mount Google Drive so files written under DATA_DIR survive the Colab session
drive.mount('/content/drive')

# Directory on the mounted drive used as the default data_dir of
# MultilingualNewsProcessor (user profiles and fetch statistics are written there)
DATA_DIR = '/content/drive/MyDrive/news_processor_data'
os.makedirs(DATA_DIR, exist_ok=True)

# Download the NLTK resources this notebook relies on:
# tokenizer data, stop-word lists, and the VADER lexicon used by
# SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

# ======== RSS FEED CONFIGURATION ========
# Main list of news feed URLs. fetch_rss_data() guesses each feed's language
# from substrings of its URL (falling back to script detection on the text),
# so no language is recorded here.
RSS_FEEDS = [
    "http://feeds.bbci.co.uk/news/world/asia/india/rss.xml",
    "https://www.theguardian.com/world/india/rss",
    "https://timesofindia.indiatimes.com/rssfeedstopstories.cms",
    "https://www.thehindu.com/feeder/default.rss",
    "https://feeds.feedburner.com/ndtvnews-top-stories",
    "https://www.indiatoday.in/rss/home",
    "http://indianexpress.com/print/front-page/feed/",
    "https://www.news18.com/rss/world.xml",
    "https://www.dnaindia.com/feeds/india.xml",
    "https://www.firstpost.com/rss/india.xml",
    "https://www.business-standard.com/rss/home_page_top_stories.rss",
    "https://www.outlookindia.com/rss/main/magazine",
    "https://www.freepressjournal.in/stories.rss",
    "https://www.deccanchronicle.com/rss_feed/",
    "http://www.moneycontrol.com/rss/latestnews.xml",
    "https://economictimes.indiatimes.com/rssfeedsdefault.cms",
    "https://www.oneindia.com/rss/news-fb.xml",
    "http://feeds.feedburner.com/ScrollinArticles.rss",
    "https://www.financialexpress.com/feed/",
    "https://www.thehindubusinessline.com/feeder/default.rss",
    "http://feeds.feedburner.com/techgenyz",
    "https://www.gujaratsamachar.com/rss/top-stories",
    "https://maharashtratimes.com/rssfeedsdefault.cms",
    "https://www.loksatta.com/desh-videsh/feed/",
    "https://lokmat.news18.com/rss/program.xml",
    "https://feeds.feedburner.com/opindia",
    "https://theprint.in/feed/",
    "https://prod-qt-images.s3.amazonaws.com/production/swarajya/feed.xml",
    "https://www.amarujala.com/rss/breaking-news.xml",
    "https://navbharattimes.indiatimes.com/rssfeedsdefault.cms",
    "http://api.patrika.com/rss/india-news",
    "https://www.jansatta.com/feed/",
    "https://feed.livehindustan.com/rss/3127",
    "https://www.bhaskar.com/rss-feed/1061/",
    "https://www.divyabhaskar.co.in/rss-feed/1037/"
]

# Additional Tamil and Telugu feeds, kept in separate lists and merged into
# ALL_FEEDS below
TAMIL_FEEDS = [
    "https://tamil.oneindia.com/rss/tamil-news.xml",
    "https://tamil.samayam.com/rssfeedstopstories.cms",
    "https://www.vikatan.com/feeds/news",
    "https://www.dinamani.com/rss/latest-news.xml"
]

TELUGU_FEEDS = [
    "https://telugu.oneindia.com/rss/telugu-news.xml",
    "https://telugu.samayam.com/rssfeedsdefault.cms",
    "https://www.sakshi.com/rss.xml",
    "https://www.andhrajyothy.com/rss/andhrapradesh.xml"
]

# News API fallback (free tier)
# NOTE(review): this key is a placeholder and is not referenced anywhere in the
# visible part of the file - confirm whether the fallback is implemented elsewhere.
NEWS_API_KEY = "YOUR_NEWS_API_KEY"  # Replace with your News API key if you have one

# Combine all feeds; this is the list generate_response() passes to fetch_rss_data()
ALL_FEEDS = RSS_FEEDS + TAMIL_FEEDS + TELUGU_FEEDS

# ======== MULTILINGUAL NEWS PROCESSOR CLASS ========
class MultilingualNewsProcessor:
    def __init__(self, data_dir=DATA_DIR):
        """Prepare storage, in-memory state, and the NLP models.

        Args:
            data_dir: Directory (created if missing) where user profiles and
                fetch statistics are stored as JSON.
        """
        summarizer_name = "google/flan-t5-small"

        # --- storage -------------------------------------------------------
        self.data_dir = data_dir
        os.makedirs(data_dir, exist_ok=True)

        # --- models (loaded in the same order as they are announced) -------
        print("Loading sentence embedding model...")
        self.embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

        print("Loading summarization model...")
        self.summarizer = pipeline(
            "summarization",
            model=summarizer_name,
            tokenizer=summarizer_name,
        )

        print("Loading translation model...")
        self.translator = EasyNMT('opus-mt')

        # VADER-based sentiment scorer (English text only)
        self.sia = SentimentIntensityAnalyzer()

        # --- language tables -----------------------------------------------
        # Language code -> display name
        self.supported_languages = dict(
            en="English",
            hi="Hindi",
            ta="Tamil",
            te="Telugu",
        )

        # Language code -> regex matching one character of that script's
        # Unicode block; detect_language() tries them in this order
        self.language_patterns = dict(
            hi=r'[\u0900-\u097F]',   # Devanagari block
            ta=r'[\u0B80-\u0BFF]',   # Tamil block
            te=r'[\u0C00-\u0C7F]',   # Telugu block
        )

        # --- user profiles (restored from disk when a file exists) ----------
        self.user_profiles = {}
        self.load_user_profiles()

        # --- article cache used by fetch_rss_data() --------------------------
        self.news_cache = None
        self.last_fetch_time = None
        self.cache_expiry = 3600  # seconds (one hour)

        # Verbose per-entry error reporting while fetching
        self.debug = True

    def load_user_profiles(self):
        """Load user profiles from disk if they exist"""
        profile_path = os.path.join(self.data_dir, "user_profiles.json")
        if os.path.exists(profile_path):
            try:
                with open(profile_path, 'r', encoding='utf-8') as f:
                    self.user_profiles = json.load(f)
            except Exception as e:
                print(f"Error loading user profiles: {e}")
                self.user_profiles = {}

    def save_user_profiles(self):
        """Save user profiles to disk"""
        profile_path = os.path.join(self.data_dir, "user_profiles.json")
        try:
            with open(profile_path, 'w', encoding='utf-8') as f:
                json.dump(self.user_profiles, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"Error saving user profiles: {e}")

    def detect_language(self, text):
        """Detect language of text using regex patterns"""
        if not text:
            return "en"  # Default to English for empty text

        # Check for language-specific characters
        for lang, pattern in self.language_patterns.items():
            if re.search(pattern, text):
                return lang

        return "en"  # Default to English

    def translate_text(self, text, target_lang="en"):
        """Translate text to target language"""
        if not text:
            return ""

        # Skip translation if text is too short
        if len(text) < 5:
            return text

        source_lang = self.detect_language(text)

        # No translation needed if already in target language
        if source_lang == target_lang:
            return text

        try:
            translated = self.translator.translate(text, target_lang=target_lang)
            return translated
        except Exception as e:
            print(f"Translation error: {e}")
            return text

    def fetch_rss_data(self, feed_urls, cache=True, max_retries=3):
        """Fetch and process RSS feeds with metadata and error handling"""
        current_time = time.time()

        # Check if cache is valid
        if cache and self.news_cache and self.last_fetch_time and (current_time - self.last_fetch_time < self.cache_expiry):
            print("Using cached news data...")
            return self.news_cache

        print("Fetching RSS feeds...")
        articles = []
        successful_feeds = 0

        # Set socket timeout globally
        default_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(15)

        try:
            for url in tqdm(feed_urls):
                for retry in range(max_retries):
                    try:
                        # Parse without timeout parameter (using global socket timeout)
                        feed = feedparser.parse(url)

                        # Check if feed has entries
                        if not hasattr(feed, 'entries') or len(feed.entries) == 0:
                            if retry == max_retries - 1:
                                print(f"Warning: No entries found in {url}")
                            continue

                        source_name = feed.feed.title if hasattr(feed.feed, 'title') else url.split('/')[2]

                        # Detect feed language based on URL or feed metadata
                        feed_language = "en"  # Default to English
                        if any(lang in url.lower() for lang in ["tamil", "dinamani", "vikatan"]):
                            feed_language = "ta"
                        elif any(lang in url.lower() for lang in ["telugu", "sakshi", "andhrajyothy"]):
                            feed_language = "te"
                        elif any(lang in url.lower() for lang in ["hindi", "amarujala", "bhaskar", "jansatta", "hindustan"]):
                            feed_language = "hi"
                        elif any(lang in url.lower() for lang in ["gujarati", "divyabhaskar", "gujaratsamachar"]):
                            feed_language = "gu"  # Gujarati
                        elif any(lang in url.lower() for lang in ["marathi", "maharashtra", "lokmat", "loksatta"]):
                            feed_language = "mr"  # Marathi

                        source_info = {
                            "name": source_name,
                            "url": url,
                            "language": feed_language
                        }

                        successful_feeds += 1
                        entry_count = 0

                        for entry in feed.entries:
                            try:
                                # Extract publication date
                                pub_date = datetime.now().isoformat()
                                if hasattr(entry, 'published_parsed') and entry.published_parsed:
                                    try:
                                        pub_date = datetime(*entry.published_parsed[:6]).isoformat()
                                    except:
                                        pass

                                # Clean title and summary text
                                title = entry.title if hasattr(entry, 'title') else ""
                                title = re.sub(r'<.*?>', '', title)  # Remove HTML tags

                                summary = entry.summary if hasattr(entry, 'summary') else ""
                                summary = re.sub(r'<.*?>', '', summary)  # Remove HTML tags

                                # Extract tags/categories
                                tags = []
                                if hasattr(entry, 'tags'):
                                    tags = [tag.term for tag in entry.tags if hasattr(tag, 'term')]
                                elif hasattr(entry, 'category'):
                                    tags = [entry.category]

                                # Get full content if available
                                content = ""
                                if hasattr(entry, 'content'):
                                    content = entry.content[0].value if hasattr(entry.content[0], 'value') else ""
                                    content = re.sub(r'<.*?>', '', content)  # Remove HTML tags

                                # Confirm article language or detect from content
                                if not feed_language or feed_language == "en":
                                    article_lang = self.detect_language(title + " " + summary)
                                else:
                                    article_lang = feed_language

                                # Skip empty articles
                                if not title and not summary and not content:
                                    continue

                                # Create unique ID
                                article_id = entry.get('id', None)
                                if not article_id:
                                    article_id = f"{source_name}-{hash(title)}"

                                article = {
                                    "id": article_id,
                                    "title": title,
                                    "summary": summary,
                                    "content": content if content else summary,
                                    "link": entry.link if hasattr(entry, 'link') else "",
                                    "source": source_info,
                                    "published_date": pub_date,
                                    "tags": tags,
                                    "language": article_lang,
                                    "sentiment": {},
                                    "entities": []
                                }
                                articles.append(article)
                                entry_count += 1

                            except Exception as e:
                                if self.debug:
                                    print(f"Error processing entry from {url}: {e}")
                                continue

                        print(f"Fetched {entry_count} articles from {url}")
                        break  # Break retry loop if successful

                    except Exception as e:
                        if retry == max_retries - 1:
                            print(f"Error fetching {url} after {max_retries} retries: {e}")
                        else:
                            print(f"Retrying {url} ({retry+2}/{max_retries})...")
                            time.sleep(1)  # Wait before retrying
        finally:
            # Restore default socket timeout
            socket.setdefaulttimeout(default_timeout)

        print(f"Successfully fetched from {successful_feeds}/{len(feed_urls)} feeds")
        print(f"Total articles collected: {len(articles)}")

        # If we didn't get any articles, use fallback mock data for demo purposes
        if len(articles) == 0:
            print("No articles found, using fallback demo data...")
            articles = self.get_fallback_articles()

        # Update cache
        self.news_cache = articles
        self.last_fetch_time = current_time

        # Save to disk for debugging
        try:
            cache_path = os.path.join(self.data_dir, "fetched_news.json")
            with open(cache_path, 'w', encoding='utf-8') as f:
                json.dump({"fetch_time": current_time, "article_count": len(articles)}, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"Error saving fetch stats: {e}")

        return articles

    def get_fallback_articles(self):
        """Generate fallback articles for demonstration when feeds fail"""
        current_date = datetime.now().isoformat()
        fallback_articles = [
            {
                "id": "fallback-cricket-1",
                "title": "India wins cricket match against Australia in thrilling finish",
                "summary": "In a stunning display of skill and determination, the Indian cricket team secured a victory against Australia in the final over of the match. Captain Rohit Sharma led from the front with a century.",
                "content": "In a stunning display of skill and determination, the Indian cricket team secured a victory against Australia in the final over of the match. Captain Rohit Sharma led from the front with a century, while Jasprit Bumrah's exceptional bowling in the death overs sealed the win for India. The match, which took place at the Melbourne Cricket Ground, saw a record attendance of over 90,000 fans. This victory puts India at the top of the series with two more matches to go.",
                "link": "https://www.cricbuzz.com/",
                "source": {"name": "Demo Cricket News", "url": "https://www.cricbuzz.com/", "language": "en"},
                "published_date": current_date,
                "tags": ["cricket", "india", "australia", "sports"],
                "language": "en",
                "sentiment": {"positive": 0.8, "negative": 0.0, "neutral": 0.2, "compound": 0.8}
            },
            {
                "id": "fallback-football-1",
                "title": "Manchester United signs new striker in record deal",
                "summary": "Manchester United has completed the signing of a new striker in a deal worth £80 million, breaking the club's previous transfer record.",
                "content": "Manchester United has completed the signing of a new striker in a deal worth £80 million, breaking the club's previous transfer record. The 23-year-old forward, who scored 30 goals in his previous season, signed a five-year contract with an option for an additional year. The manager expressed his excitement about working with the new talent, stating that he will bring a new dimension to the team's attacking options. Fans have reacted positively to the news, with season ticket sales seeing a significant boost following the announcement.",
                "link": "https://www.manutd.com/",
                "source": {"name": "Demo Football News", "url": "https://www.manutd.com/", "language": "en"},
                "published_date": current_date,
                "tags": ["football", "manchester united", "transfer", "sports"],
                "language": "en",
                "sentiment": {"positive": 0.7, "negative": 0.0, "neutral": 0.3, "compound": 0.7}
            },
            {
                "id": "fallback-football-2",
                "title": "FIFA announces expanded World Cup format for 2026",
                "summary": "FIFA has officially confirmed that the 2026 World Cup will feature 48 teams, an increase from the current 32-team format, with matches to be held across North America.",
                "content": "FIFA has officially confirmed that the 2026 World Cup will feature 48 teams, an increase from the current 32-team format, with matches to be held across the United States, Canada, and Mexico. The new format will include 16 groups of three teams, with the top two from each group advancing to a 32-team knockout stage. This expansion will allow more nations to participate in the tournament, particularly from regions that have historically had fewer representatives. The decision has been met with mixed reactions from football associations around the world, with some praising the inclusivity while others expressing concerns about potentially diluting the competition's quality.",
                "link": "https://www.fifa.com/",
                "source": {"name": "Demo FIFA News", "url": "https://www.fifa.com/", "language": "en"},
                "published_date": current_date,
                "tags": ["football", "FIFA", "World Cup", "sports"],
                "language": "en",
                "sentiment": {"positive": 0.6, "negative": 0.1, "neutral": 0.3, "compound": 0.5}
            },
            {
                "id": "fallback-football-3",
                "title": "Premier League announces new broadcast deal worth billions",
                "summary": "The Premier League has signed a new five-year broadcast deal valued at £10 billion, representing a 20% increase over the previous agreement.",
                "content": "The Premier League has signed a new five-year broadcast deal valued at £10 billion, representing a 20% increase over the previous agreement. The deal, which will begin with the 2024-25 season, includes domestic and international rights across multiple platforms. League officials stated that a portion of the increased revenue will be directed toward improving grassroots football facilities across England. Streaming services will play a larger role in this new deal, reflecting changing viewer habits. Clubs are expected to benefit from increased financial distributions, though discussions about a more equitable distribution model are ongoing.",
                "link": "https://www.premierleague.com/",
                "source": {"name": "Demo Premier League News", "url": "https://www.premierleague.com/", "language": "en"},
                "published_date": current_date,
                "tags": ["football", "Premier League", "broadcast", "sports business"],
                "language": "en",
                "sentiment": {"positive": 0.7, "negative": 0.0, "neutral": 0.3, "compound": 0.7}
            },
            {
                "id": "fallback-football-4",
                "title": "Champions League introduces new format for upcoming season",
                "summary": "UEFA has revealed a new Champions League format that will replace the traditional group stage with a single league phase, giving teams more matches against different opponents.",
                "content": "UEFA has revealed a new Champions League format that will replace the traditional group stage with a single league phase, giving teams more matches against different opponents. Starting next season, each team will play eight matches against eight different opponents in the league phase, with the top eight teams automatically qualifying for the round of 16. Teams finishing between 9th and 24th will compete in a playoff round to determine the remaining spots in the knockout phase. The changes are designed to increase the number of high-stakes matches and eliminate predictable group stage games. Clubs have generally supported the new format, though concerns about player workload have been raised by various players' unions.",
                "link": "https://www.uefa.com/",
                "source": {"name": "Demo UEFA News", "url": "https://www.uefa.com/", "language": "en"},
                "published_date": current_date,
                "tags": ["football", "Champions League", "UEFA", "sports"],
                "language": "en",
                "sentiment": {"positive": 0.5, "negative": 0.2, "neutral": 0.3, "compound": 0.3}
            }
        ]

        # Add fallback articles with summaries
        for article in fallback_articles:
            article['ai_summaries'] = {
                "brief": article['summary'],
                "detailed": article['content']
            }

        return fallback_articles

    def generate_embeddings(self, articles):
        """Attach a sentence embedding to every article that does not have one yet.

        The embedding is computed from ``"<title> <summary>"`` in the article's
        original language and stored under ``article['embedding']`` as a list
        of float32 values. Articles that already carry an embedding (for
        example when the article list came from the fetch cache) are not
        re-encoded, which avoids repeating the most expensive step on every
        query. Articles whose combined text is shorter than 10 characters are
        skipped, and encoding errors are printed without aborting the loop.

        Args:
            articles: List of article dicts; modified in place.

        Returns:
            The same ``articles`` list.
        """
        print("Generating article embeddings...")
        articles_with_embeddings = 0

        for article in tqdm(articles):
            try:
                # Reuse embeddings computed by an earlier call
                if article.get('embedding'):
                    articles_with_embeddings += 1
                    continue

                # For multilingual embedding, use the original text in its language
                text_to_embed = f"{article['title']} {article['summary']}"
                if len(text_to_embed.strip()) < 10:
                    continue

                # Stored as a plain list of float32 values
                embedding = self.embedding_model.encode(text_to_embed, convert_to_numpy=True)
                article['embedding'] = embedding.astype(np.float32).tolist()
                articles_with_embeddings += 1
            except Exception as e:
                print(f"Error generating embedding for article '{article.get('title', 'Unknown')}': {e}")

        print(f"Generated embeddings for {articles_with_embeddings}/{len(articles)} articles")
        return articles

    def summarize_articles(self, articles, target_language="en"):
        """Attach AI-generated summaries and a sentiment score to each article, in place.

        For every article the content (or, if empty, the summary) is used:

        * shorter than 100 characters: stored verbatim as both the brief and
          the detailed summary; no sentiment is computed;
        * otherwise: translated to English when the article is not English,
          summarised (brief always; detailed only when the English text is
          longer than 300 characters, else the brief one is reused),
          translated to ``target_language`` when that is not English, and
          scored with ``analyze_sentiment`` on the English text.

        Any error for one article is printed and that article falls back to its
        original summary/content.

        Args:
            articles: List of article dicts; modified in place.
            target_language: Language code the summaries should be written in.

        Returns:
            The same ``articles`` list.
        """
        print(f"Generating article summaries in {target_language}...")
        articles_summarized = 0

        def summarize(text, max_length, min_length):
            # truncation=True clips inputs longer than the model's maximum
            # input length instead of feeding the model an over-long sequence.
            return self.summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']

        for article in tqdm(articles):
            try:
                content = article['content'] if article['content'] else article['summary']
                if len(content) < 100:
                    # Too short to summarise: use the text itself
                    article['ai_summaries'] = {
                        "brief": content,
                        "detailed": content
                    }
                    continue

                # The summariser and VADER work on English text.
                # translate_text() already returns its input on failure.
                if article['language'] != "en":
                    content_for_summary = self.translate_text(content, "en")
                else:
                    content_for_summary = content

                brief_summary = summarize(content_for_summary, max_length=50, min_length=20)

                if len(content_for_summary) > 300:
                    detailed_summary = summarize(content_for_summary, max_length=150, min_length=80)
                else:
                    detailed_summary = brief_summary

                if target_language != "en":
                    brief_summary = self.translate_text(brief_summary, target_language)
                    detailed_summary = self.translate_text(detailed_summary, target_language)

                article['ai_summaries'] = {
                    "brief": brief_summary,
                    "detailed": detailed_summary
                }

                article['sentiment'] = self.analyze_sentiment(content_for_summary)
                articles_summarized += 1

            except Exception as e:
                print(f"Error summarizing article '{article.get('title', 'Unknown')}': {e}")
                # Fallback to original content
                article['ai_summaries'] = {
                    "brief": article.get('summary', ''),
                    "detailed": article.get('content', article.get('summary', ''))
                }

        print(f"Summarized {articles_summarized}/{len(articles)} articles")
        return articles

    def analyze_sentiment(self, text):
        """Analyze sentiment of English text using VADER"""
        # Ensure text is in English
        if not text:
            return {"positive": 0, "negative": 0, "neutral": 1.0, "compound": 0}

        try:
            sentiment_scores = self.sia.polarity_scores(text)
            return {
                "positive": sentiment_scores['pos'],
                "negative": sentiment_scores['neg'],
                "neutral": sentiment_scores['neu'],
                "compound": sentiment_scores['compound']
            }
        except Exception as e:
            print(f"Error analyzing sentiment: {e}")
            return {"positive": 0, "negative": 0, "neutral": 1.0, "compound": 0}

    def search_articles_by_query(self, articles, query, target_language="en"):
        """Search articles based on user query and return relevant ones"""
        if not query:
            return articles[:20]  # Return first 20 if no query

        print(f"Searching articles for query: '{query}'")

        # Check for sports-related terms for demo articles
        query_lower = query.lower()
        sports_terms = ["cricket", "football", "soccer", "sport", "fifa", "premier league", "manchester", "world cup"]

        if any(term in query_lower for term in sports_terms) and any("fallback" in article.get("id", "") for article in articles):
            print("Using relevant sports fallback articles")
            # Filter fallback articles for sports content
            relevant_articles = [a for a in articles if "fallback" in a.get("id", "") and
                                any(tag in sports_terms for tag in a.get("tags", []))]
            # Add relevance score for sorting
            for article in relevant_articles:
                article['relevance'] = 0.9 if query_lower in article.get("title", "").lower() else 0.7

            # Sort by relevance
            relevant_articles = sorted(relevant_articles, key=lambda x: x.get('relevance', 0), reverse=True)
            return relevant_articles

        # Translate query to English for consistent searching if needed
        if self.detect_language(query) != "en":
            query_en = self.translate_text(query, "en")
        else:
            query_en = query

        print(f"Searching with query (in English): '{query_en}'")

        # Convert query to embedding and ensure it's float32
        try:
            query_embedding = self.embedding_model.encode(query_en, convert_to_numpy=True).astype(np.float32)
        except Exception as e:
            print(f"Error encoding query: {e}")
            # Return a selection of recent articles if query encoding fails
            sorted_articles = sorted(
                articles,
                key=lambda x: x.get('published_date', ''),
                reverse=True
            )
            return sorted_articles[:20]

        # Calculate similarity with each article
        results = []
        for article in articles:
            if 'embedding' not in article:
                continue

            try:
                # Convert article embedding to float32 numpy array for consistent typing
                article_embedding = np.array(article['embedding'], dtype=np.float32)

                # Convert to torch tensors with same dtype
                query_tensor = torch.tensor([query_embedding], dtype=torch.float32)
                article_tensor = torch.tensor([article_embedding], dtype=torch.float32)

                # Calculate similarity
                similarity = util.cos_sim(query_tensor, article_tensor)[0][0].item()

                # Add similarity score
                article['relevance'] = float(similarity)
                results.append(article)
            except Exception as e:
                print(f"Error calculating similarity for article: {e}")

        # Sort by relevance
        results = sorted(results, key=lambda x: x.get('relevance', 0), reverse=True)

        # Only return reasonably relevant results
        relevant_results = [a for a in results if a.get('relevance', 0) > 0.2]

        # If no relevant results, return some of the results anyway
        if not relevant_results and results:
            relevant_results = results[:min(len(results), 20)]

        print(f"Found {len(relevant_results)} relevant articles")
        return relevant_results[:20]  # Limit to top 20

    def generate_response(self, query, target_language="en", max_articles=5):
        """Answer a user query with a digest of the most relevant news articles.

        Pipeline: fetch all configured feeds (``ALL_FEEDS``), embed the
        articles, rank them against ``query``, summarise the top
        ``max_articles`` and format them as a text block.

        Args:
            query: Free-text user query.
            target_language: Language code for titles and summaries.
            max_articles: Maximum number of articles included in the response.

        Returns:
            A tuple ``(response_text, articles)``. ``articles`` is the list of
            summarised article dicts, or an empty list when nothing could be
            fetched or nothing relevant was found (``response_text`` then holds
            an explanatory message).
        """
        articles = self.fetch_rss_data(ALL_FEEDS)

        if not articles:
            no_articles_message = "No news articles could be fetched at this time. Please check your internet connection or try again later."
            return self._localize(no_articles_message, target_language), []

        # Generate embeddings for search, then keep only the best matches
        articles = self.generate_embeddings(articles)
        relevant_articles = self.search_articles_by_query(articles, query, target_language)
        relevant_articles = relevant_articles[:max_articles]

        if not relevant_articles:
            no_results = f"No relevant news articles found for your query: '{query}'."
            return self._localize(no_results, target_language), []

        summarized_articles = self.summarize_articles(relevant_articles, target_language)

        # NOTE(review): the intro sentence and the "Source"/"Date" labels stay
        # in English; only the quoted query, titles and summaries are localized.
        query_translated = self._localize(query, target_language)
        response_parts = [f"Here's information about '{query_translated}' from news sources:"]

        for position, article in enumerate(summarized_articles, start=1):
            title = article['title']
            if article.get('language') != target_language:
                try:
                    title = self.translate_text(title, target_language)
                except Exception:
                    # translate_text reports its own errors; keep the original title
                    pass

            article_part = [f"**{position}. {title}**"]

            source = article['source']['name']
            try:
                date_str = article['published_date'].split('T')[0]
            except (KeyError, AttributeError):
                date_str = "Unknown date"
            article_part.append(f"Source: {source} | Date: {date_str}")

            summary = article.get('ai_summaries', {}).get('brief', article.get('summary', ''))
            if summary:
                article_part.append(f"{summary}")

            if article.get('link'):
                article_part.append(f"[Read more]({article['link']})")

            response_parts.append("\n".join(article_part))

        full_response = "\n\n".join(response_parts)

        return full_response, summarized_articles

    def _localize(self, text, target_language):
        """Translate ``text`` for non-English targets, returning it unchanged for English or on failure."""
        if target_language == "en":
            return text
        try:
            return self.translate_text(text, target_language)
        except Exception:
            # Best effort: an untranslated message is better than none.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            return text

# ======== USER INTERFACE FUNCTIONS ========
def process_user_query(processor, query, output_language="en", max_articles=5):
    """Run a user query through ``processor`` and return the text to show.

    Args:
        processor: Object exposing ``generate_response(query=...,
            target_language=..., max_articles=...)``, which must return a
            ``(response, articles)`` pair, and ``translate_text(text,
            language)``, which is used only to translate an error message.
        query: The user's query string.
        output_language: Language code requested for the output
            (default ``"en"``).
        max_articles: Article limit forwarded to ``generate_response``.

    Returns:
        The response returned by ``generate_response``. If processing raises
        an ``Exception``, an error message string is returned instead; it is
        translated to ``output_language`` when that is not ``"en"`` and the
        translation succeeds, otherwise it stays in English.
    """
    # Map language codes to full names for display; unknown codes are shown
    # as-is.
    language_names = {
        "en": "English",
        "hi": "Hindi",
        "ta": "Tamil",
        "te": "Telugu",
    }
    language_label = language_names.get(output_language, output_language)

    print(f"Processing query: '{query}' with output in {language_label}")

    try:
        # Generate response
        response, articles = processor.generate_response(
            query=query,
            target_language=output_language,
            max_articles=max_articles,
        )

        # Print some stats
        print(f"Found {len(articles)} relevant articles")

        return response
    except Exception as e:
        print(f"Error processing query: {e}")
        error_message = f"An error occurred while processing your query: {str(e)}"
        if output_language != "en":
            try:
                error_message = processor.translate_text(error_message, output_language)
            except Exception as translate_error:
                # Best effort only: keep the English message. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate instead of being swallowed here.
                print(f"Could not translate error message: {translate_error}")
        return error_message

# ======== INTERACTIVE INTERFACE ========
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

def create_interactive_interface():
    """Build and display the ipywidgets front end for querying news.

    Prints the detected torch device, creates one
    ``MultilingualNewsProcessor`` that is reused for every button click,
    builds the query box, output-language dropdown, max-articles slider and
    "Get News" button, wires the click handler, and displays the controls
    followed by the output area.

    Returns:
        None. The function works through its display side effects.
    """
    # Local import: only needed to escape exception text placed into HTML.
    import html

    # Print device information
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Device set to use {device}")

    # Create a MultilingualNewsProcessor instance
    processor = MultilingualNewsProcessor()

    # Create widgets
    query_input = widgets.Text(
        value='',
        placeholder='Enter your news query (e.g., "Latest cricket news")',
        description='Query:',
        layout=widgets.Layout(width='80%')
    )

    language_dropdown = widgets.Dropdown(
        options=[
            ('English', 'en'),
            ('Hindi', 'hi'),
            ('Tamil', 'ta'),
            ('Telugu', 'te')
        ],
        value='en',
        description='Output language:',
    )

    articles_slider = widgets.IntSlider(
        value=5,
        min=1,
        max=10,
        step=1,
        description='Max articles:',
        continuous_update=False
    )

    fetch_button = widgets.Button(
        description='Get News',
        button_style='primary',
        tooltip='Click to fetch news based on your query'
    )

    output_area = widgets.Output()
    status_area = widgets.Output()

    # Define button click behavior
    def on_fetch_button_clicked(b):
        """Handle a click: validate the query, run it, and show the result."""
        with status_area:
            clear_output()
            print("Processing your request, please wait...")

        query = query_input.value
        output_language = language_dropdown.value
        max_articles = articles_slider.value

        # Guard clause: nothing to do for an empty or whitespace-only query.
        if not query.strip():
            with status_area:
                clear_output()
                print("Please enter a query.")
            return

        succeeded = True
        with output_area:
            clear_output()
            try:
                response = process_user_query(
                    processor=processor,
                    query=query,
                    output_language=output_language,
                    max_articles=max_articles
                )
                # NOTE(review): the response text is inserted as raw HTML with
                # only newlines converted; confirm upstream content is safe
                # to render this way.
                display(HTML(response.replace('\n', '<br>')))
            except Exception as e:
                succeeded = False
                print(f"Error displaying results: {e}")
                # Exception text is plain text, so escape it before embedding
                # it in markup; otherwise characters such as '<' or '&' would
                # be interpreted as HTML.
                safe_error = html.escape(str(e))
                display(HTML(f"<p style='color:red'>Error processing your request: {safe_error}</p>"))

        with status_area:
            clear_output()
            # Only claim success when the result was actually displayed.
            if succeeded:
                print("Done! Results displayed below.")
            else:
                print("Something went wrong. See the error message below.")

    fetch_button.on_click(on_fetch_button_clicked)

    # Layout
    controls = widgets.VBox([
        widgets.HBox([query_input, fetch_button]),
        widgets.HBox([language_dropdown, articles_slider]),
        status_area
    ])

    display(controls)
    display(output_area)

# ======== MAIN EXECUTION ========
# Runs when the cell executes: announce start-up, then build and display the
# widget interface (which also constructs the news processor).
print("Initializing Multilingual News Processor...")
create_interactive_interface()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initializing Multilingual News Processor...
Device set to use cpu
Loading sentence embedding model...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Loading summarization model...


Device set to use cpu


Loading translation model...


VBox(children=(HBox(children=(Text(value='', description='Query:', layout=Layout(width='80%'), placeholder='En…

Output()