<a href="https://www.kaggle.com/code/mohamedaymanelkhatib/ai-powered-news-summarizer?scriptVersionId=237465421" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Install required packages with compatible versions
!pip install -q chromadb==0.6.3
!pip install -q google-generativeai==0.8.2 google-api-core==2.19.2 protobuf==4.25.5
!pip install -q tensorflow==2.17.0
!pip install -q newspaper3k==0.2.8 lxml_html_clean==0.2.2
!pip install -q tenacity==8.5.0 async-timeout==4.0.3

# Verify installations
from importlib.metadata import version
required = {
    'chromadb': '0.6.3',
    'google-generativeai': '0.8.2',
    'newspaper3k': '0.2.8',
    'lxml_html_clean': '0.2.2',
    'tenacity': '8.5.0',
    'async-timeout': '4.0.3'
}
for pkg, expected_version in required.items():
    installed_version = version(pkg)
    assert installed_version == expected_version, f"{pkg} version mismatch: expected {expected_version}, got {installed_version}"
print("Package versions verified successfully!")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 kB[0m [31m5.4 MB/s[0m eta

In [2]:
# Cell 1: Setup and Configuration

# Import necessary libraries
import os
import logging
import json
from datetime import datetime
import numpy as np
import pandas as pd
from typing import List, Dict, Any
from IPython.display import Markdown, display
import requests
import bs4
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import yaml
import shutil
from newspaper import Article  # For news extraction

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration settings
config = {
    'max_articles_per_source': 5,
    'request_timeout': 10,
    'embedding_dimension': 768,
    'cache_dir': './cache',
    'database_path': './vectordb',
    'min_content_length': 100,
    'max_summary_length': 150,
    'batch_size': 100,
    'relevance_threshold': 0.3,
    'max_text_length': 2048,
    'cache_expiry_hours': 24
}

# Create necessary directories
for directory in [config['cache_dir'], config['database_path']]:
    os.makedirs(directory, exist_ok=True)

# Initialize API and models
def initialize_models():
    try:
        import google.generativeai as genai
        from kaggle_secrets import UserSecretsClient
        
        # Get API key securely with fallback
        try:
            GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
        except Exception as e:
            logger.warning(f"Failed to get API key from Kaggle secrets: {e}. Trying environment variable.")
            GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
            if not GOOGLE_API_KEY:
                raise ValueError("Google API key not found. Set GOOGLE_API_KEY in Kaggle secrets or environment.")
        
        genai.configure(api_key=GOOGLE_API_KEY)
        
        # Find available models with fallback
        models = genai.list_models()
        text_model = next((m.name for m in models if "gemini-1.5-pro" in m.name.lower()), 
                         next((m.name for m in models if "gemini" in m.name.lower()), None))
        embedding_model = "models/embedding-001"  # Fixed to known embedding model
        
        if not text_model:
            raise ValueError("No suitable text generation model found.")
        
        # Model configurations
        generation_config = {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 1024,
        }
        
        safety_settings = [
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
        ]
        
        generation_model = genai.GenerativeModel(
            model_name=text_model,
            generation_config=generation_config,
            safety_settings=safety_settings
        )
        
        logger.info(f"Initialized text model: {text_model}, embedding model: {embedding_model}")
        return generation_model, embedding_model
        
    except Exception as e:
        logger.error(f"Error initializing models: {e}")
        raise

# Run setup
generation_model, embedding_model = initialize_models()
logger.info("Models initialized successfully!")

In [3]:
# Cell 2: Enhanced News Fetching

from tenacity import retry, stop_after_attempt, wait_fixed

class NewsArticleFetcher:
    def __init__(self, config: Dict):
        self.config = config  # Fixed typo (SELF to self)
        self.cache_file = os.path.join(config['cache_dir'], 'news_cache.json')
        self.rss_feeds = {
            'technology': [
                'https://feeds.feedburner.com/TechCrunch/',
                'https://www.wired.com/feed/rss',
                'https://www.theverge.com/rss/index.xml'
            ],
            'business': [
                'https://feeds.bbci.co.uk/news/business/rss.xml',
                'https://www.forbes.com/business/feed/',
                'https://feeds.bloomberg.com/markets/news.rss'
            ],
            'science': [
                'https://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
                'https://www.sciencedaily.com/rss/all.xml',
                'https://www.nature.com/nature.rss'
            ],
            'health': [
                'https://rss.nytimes.com/services/xml/rss/nyt/Health.xml',
                'https://www.who.int/rss-feeds/news-english.xml',
                'https://www.healthline.com/rss/news'
            ]
        }

    def load_cache(self) -> Dict:
        try:
            if os.path.exists(self.cache_file):
                with open(self.cache_file, 'r') as f:
                    cache = json.load(f)
                cache_time = datetime.fromisoformat(cache.get('timestamp', '2000-01-01'))
                if (datetime.now() - cache_time).total_seconds() < self.config['cache_expiry_hours'] * 3600:
                    return cache.get('articles', {})
            return {}
        except Exception as e:
            logger.error(f"Error loading cache: {e}")
            return {}

    def save_cache(self, articles: List[Dict]):
        try:
            cache_data = {
                'timestamp': datetime.now().isoformat(),
                'articles': articles
            }
            with open(self.cache_file, 'w') as f:
                json.dump(cache_data, f)
        except Exception as e:
            logger.error(f"Error saving cache: {e}")

    def clean_text(self, text: str) -> str:
        if not text:
            return ""
        text = ' '.join(text.split())
        text = ''.join(char for char in text if char.isprintable())
        return text

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
    def extract_article_content(self, url: str) -> Dict:
        try:
            article = Article(url)
            article.download()
            article.parse()
            content = self.clean_text(article.text)
            if not content:
                raise ValueError("Empty article content")
            return {
                'content': content,
                'success': True
            }
        except Exception as e:
            logger.warning(f"newspaper3k failed for {url}: {e}. Falling back to BeautifulSoup.")
            response = requests.get(url, timeout=self.config['request_timeout'])
            soup = BeautifulSoup(response.content, 'html.parser')
            paragraphs = soup.find_all('p')
            content = ' '.join([p.text for p in paragraphs])
            content = self.clean_text(content)
            if not content:
                raise ValueError("Empty article content (BeautifulSoup fallback)")
            return {
                'content': content,
                'success': True
            }

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
    def fetch_news_from_rss(self, rss_url: str, category: str) -> List[Dict]:
        try:
            response = requests.get(rss_url, timeout=self.config['request_timeout'])
            soup = BeautifulSoup(response.content, features="xml")
            items = soup.findAll('item')[:self.config['max_articles_per_source']]
            
            articles = []
            for item in items:
                try:
                    title = item.find('title').text
                    link = item.find('link').text
                    
                    if not link.startswith('http'):
                        link = f"https://{link.lstrip('/')}"
                    
                    description = ""
                    if item.find('description'):
                        description = BeautifulSoup(item.find('description').text, 
                                                 'html.parser').get_text()
                    
                    content_data = self.extract_article_content(link)
                    
                    if content_data['success'] and len(content_data['content']) >= self.config['min_content_length']:
                        articles.append({
                            'title': self.clean_text(title),
                            'link': link,
                            'content': content_data['content'],
                            'description': self.clean_text(description),
                            'category': category,
                            'date': datetime.now().strftime("%Y-%m-%d"),
                            'source': rss_url
                        })
                except Exception as e:
                    logger.warning(f"Error processing article from {rss_url}: {e}")
                    continue
            return articles
        except Exception as e:
            logger.error(f"Error fetching from RSS {rss_url}: {e}")
            return []

    def fetch_all_news(self) -> List[Dict]:
        cached_articles = self.load_cache()
        if cached_articles:
            logger.info("Using cached news data")
            return cached_articles

        all_articles = []
        for category, feeds in tqdm(self.rss_feeds.items(), desc="Fetching news categories"):
            for feed_url in feeds:
                try:
                    articles = self.fetch_news_from_rss(feed_url, category)
                    all_articles.extend(articles)
                    time.sleep(1)  # Rate limiting
                except Exception as e:
                    logger.error(f"Error processing feed {feed_url}: {e}")
                    continue

        self.save_cache(all_articles)
        return all_articles

# Initialize fetcher and get news
news_fetcher = NewsArticleFetcher(config)
all_articles = news_fetcher.fetch_all_news()

# Display statistics
print("\nNews Collection Statistics:")
print(f"Total articles collected: {len(all_articles)}")
category_counts = {}
for article in all_articles:
    category_counts[article['category']] = category_counts.get(article['category'], 0) + 1
print("\nArticles per category:")
for category, count in category_counts.items():
    print(f"- {category}: {count}")

# Display sample article
if all_articles:
    sample_article = all_articles[0]
    print("\nSample Article:")
    print(f"Title: {sample_article['title']}")
    print(f"Category: {sample_article['category']}")
    print(f"Content preview: {sample_article['content'][:200]}...")
else:
    print("\nNo articles collected. Please check the logs for errors.")

Fetching news categories: 100%|██████████| 4/4 [00:30<00:00,  7.60s/it]


News Collection Statistics:
Total articles collected: 45

Articles per category:
- technology: 10
- business: 15
- science: 10
- health: 10

Sample Article:
Title: Top 10 AI Tools That Will Transform Your Content Creation in 2025
Category: technology
Content preview: Top 10 AI Tools That Will Transform Your Content Creation in 2025 Looking to level up your content creation game in 2025? You're in the right place! The digital landscape has evolved dramatically, and...





In [4]:
# Cell 3: Enhanced Summarization System

from tenacity import retry, stop_after_attempt, wait_fixed

class NewsSummarizer:
    def __init__(self, model, config: Dict):
        if not hasattr(model, 'generate_content'):
            raise ValueError("Invalid generation model: missing generate_content method")
        self.model = model
        self.config = config
        self.cache_file = os.path.join(config['cache_dir'], 'summary_cache.json')
        
        self.examples = {
            'technology': {
                'article': "The European Union has approved a new directive aimed at reducing single-use plastics...",
                'summary': "EU bans single-use plastics by 2021 and mandates 90% recycling of plastic bottles by 2029."
            },
            'business': {
                'article': "Apple Inc. reported record-breaking quarterly earnings, with revenue reaching $111.4 billion...",
                'summary': "Apple posts record $111.4B quarterly revenue, driven by iPhone sales in China and services growth."
            },
            'science': {
                'article': "Researchers at Stanford University have developed a new artificial intelligence system...",
                'summary': "Stanford AI system diagnoses pneumonia from X-rays with 95% accuracy, outperforming human radiologists."
            },
            'health': {
                'article': "A large-scale study involving 50,000 participants has found that daily meditation...",
                'summary': "Major study finds daily meditation reduces anxiety by 40%, showing significant mental health benefits."
            }
        }

    def load_cache(self) -> Dict:
        try:
            if os.path.exists(self.cache_file):
                with open(self.cache_file, 'r') as f:
                    return json.load(f)
            return {}
        except Exception as e:
            logger.error(f"Error loading summary cache: {e}")
            return {}

    def save_cache(self, cache_data: Dict):
        try:
            with open(self.cache_file, 'w') as f:
                json.dump(cache_data, f)
        except Exception as e:
            logger.error(f"Error saving summary cache: {e}")

    def create_few_shot_prompt(self, article_content: str, category: str) -> str:
        example = self.examples.get(category, self.examples['technology'])
        prompt = f"""
        Generate a concise, informative summary of the news article. Focus on key facts and main points.
        Keep the summary clear and engaging. Maintain journalistic style.

        Example Article: {example['article']}
        Example Summary: {example['summary']}

        Article: {article_content}
        Summary:"""
        return prompt

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2))
    def generate_summary(self, article: Dict) -> str:
        try:
            cache = self.load_cache()
            content_hash = hash(article['content'])
            
            if str(content_hash) in cache:
                return cache[str(content_hash)]
            
            # Split content into chunks if too long
            content = article['content']
            max_chunk_size = 4000
            if len(content) > max_chunk_size:
                chunks = [content[i:i+max_chunk_size] for i in range(0, len(content), max_chunk_size)]
                summaries = []
                for chunk in chunks[:2]:  # Limit to 2 chunks
                    prompt = self.create_few_shot_prompt(chunk, article['category'])
                    response = self.model.generate_content(prompt)
                    summaries.append(response.text.strip())
                summary = " ".join(summaries)
            else:
                prompt = self.create_few_shot_prompt(content, article['category'])
                response = self.model.generate_content(prompt)
                summary = response.text.strip()
            
            summary = summary.replace('\n', ' ').strip()
            if len(summary) > self.config['max_summary_length']:
                summary = summary[:self.config['max_summary_length']].rsplit(' ', 1)[0] + '...'
            
            cache[str(content_hash)] = summary
            self.save_cache(cache)
            return summary
        except Exception as e:
            logger.error(f"Error generating summary: {e}")
            sentences = article['content'].split('.')[:3]
            return '. '.join(sentences) + '...'

    def process_articles(self, articles: List[Dict]) -> List[Dict]:
        logger.info("Generating summaries for articles...")
        for article in tqdm(articles, desc="Generating summaries"):
            if 'summary' not in article:
                article['summary'] = self.generate_summary(article)
        return articles

# Initialize summarizer and process articles
summarizer = NewsSummarizer(generation_model, config)
all_articles = summarizer.process_articles(all_articles)

# Display sample summaries
print("\nSample Summaries:")
for i, article in enumerate(all_articles[:3]):
    print(f"\n{i+1}. {article['title']}")
    print(f"Original length: {len(article['content'])} characters")
    print(f"Summary length: {len(article['summary'])} characters")
    print(f"Summary: {article['summary']}")

Generating summaries: 100%|██████████| 45/45 [00:18<00:00,  2.37it/s]


Sample Summaries:

1. Top 10 AI Tools That Will Transform Your Content Creation in 2025
Original length: 9549 characters
Summary length: 148 characters
Summary: This guide highlights 10 AI tools poised to transform content creation in 2025.  ChatGPT excels at content generation and SEO optimization; Canva...

2. LimeWire AI Studio Review 2023: Details, Pricing & Features
Original length: 10336 characters
Summary length: 151 characters
Summary: The infamous file-sharing site LimeWire has relaunched as an AI content creation and publishing platform.  Focusing initially on AI image generation...

3. Top 10 AI Tools in 2023 That Will Make Your Life Easier
Original length: 13203 characters
Summary length: 413 characters
Summary: Top 10 AI Tools in 2023 That Will Make Your Life Easier In this article, we explore the top 10 AI tools that are driving innovation and efficiency in various industries.  These tools are designed to automate repetitive tasks, improve workflow, and increase producti




In [5]:
# Cell 4: Enhanced Embeddings System

import google.generativeai as genai

class ArticleEmbedder:
    def __init__(self, model_name: str, config: Dict):
        self.model_name = model_name
        self.config = config
        self.cache_file = os.path.join(config['cache_dir'], 'embedding_cache.json')
        self.dimension = config['embedding_dimension']

    def load_cache(self) -> Dict:
        try:
            if os.path.exists(self.cache_file):
                with open(self.cache_file, 'r') as f:
                    cache = json.load(f)
                    return {k: np.array(v) for k, v in cache.items()}
            return {}
        except Exception as e:
            logger.error(f"Error loading embedding cache: {e}")
            return {}

    def save_cache(self, cache_data: Dict):
        try:
            serializable_cache = {k: v.tolist() for k, v in cache_data.items()}
            with open(self.cache_file, 'w') as f:
                json.dump(serializable_cache, f)
        except Exception as e:
            logger.error(f"Error saving embedding cache: {e}")

    def prepare_text_for_embedding(self, text: str) -> str:
        text = text.replace('\n', ' ').strip()
        if len(text) > self.config['max_text_length']:
            text = text[:self.config['max_text_length']//2] + " ... " + text[-self.config['max_text_length']//2:]
        return text

    def generate_embedding(self, text: str) -> np.ndarray:
        try:
            text = self.prepare_text_for_embedding(text)
            result = genai.embed_content(model=self.model_name, content=text)
            embedding = np.array(result['embedding'])
            if embedding.shape[0] != self.dimension:
                logger.warning(f"Embedding dimension mismatch: expected {self.dimension}, got {embedding.shape[0]}")
            return embedding
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return np.zeros(self.dimension)  # Zero vector fallback

    def get_embedding(self, text: str) -> np.ndarray:
        cache = self.load_cache()
        text_hash = hash(text)
        
        if str(text_hash) in cache:
            return cache[str(text_hash)]
        
        embedding = self.generate_embedding(text)
        cache[str(text_hash)] = embedding
        self.save_cache(cache)
        return embedding

    def process_articles(self, articles: List[Dict]) -> List[Dict]:
        logger.info("Generating embeddings for articles...")
        for article in tqdm(articles, desc="Generating embeddings"):
            if 'embedding' not in article:
                combined_text = f"{article['title']} {article.get('summary', '')} {article['content'][:1000]}"
                article['embedding'] = self.get_embedding(combined_text)
        return articles

    def calculate_similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
        try:
            dot_product = np.dot(embedding1, embedding2)
            norm1 = np.linalg.norm(embedding1)
            norm2 = np.linalg.norm(embedding2)
            return dot_product / (norm1 * norm2) if norm1 * norm2 != 0 else 0
        except Exception as e:
            logger.error(f"Error calculating similarity: {e}")
            return 0

# Initialize embedder and process articles
embedder = ArticleEmbedder(embedding_model, config)
all_articles = embedder.process_articles(all_articles)

# Demonstrate embedding similarity
print("\nDemonstrating Embedding Similarities:")
if len(all_articles) >= 2:
    article1 = all_articles[0]
    print(f"\nReference Article: {article1['title']}")
    
    similarities = []
    for i, article2 in enumerate(all_articles[1:], 1):
        similarity = embedder.calculate_similarity(article1['embedding'], article2['embedding'])
        similarities.append((i, similarity, article2['title']))
    
    print("\nMost Similar Articles:")
    for idx, similarity, title in sorted(similarities, key=lambda x: x[1], reverse=True)[:3]:
        print(f"- {title} (Similarity: {similarity:.4f})")

Generating embeddings: 100%|██████████| 45/45 [00:15<00:00,  2.95it/s]


Demonstrating Embedding Similarities:

Reference Article: Top 10 AI Tools That Will Transform Your Content Creation in 2025

Most Similar Articles:
- Top 10 AI Tools in 2023 That Will Make Your Life Easier (Similarity: 0.8835)
- Top 10 AI Content Generator & Writer Tools in 2022 (Similarity: 0.8002)
- LimeWire AI Studio Review 2023: Details, Pricing & Features (Similarity: 0.7062)





In [6]:
# Cell 5: NewsRAGSystem

import chromadb
from chromadb.config import Settings
import numpy as np
from typing import List, Dict
from tqdm import tqdm
import time

class NewsRAGSystem:
    def __init__(self):
        try:
            # Initialize persistent ChromaDB client
            self.client = chromadb.PersistentClient(path=config['database_path'])
            logger.info("Successfully initialized persistent ChromaDB client")
            
            self.collection_name = f"news_articles_{int(time.time())}"
            self.collection = self.client.create_collection(
                name=self.collection_name,
                metadata={"description": "News articles collection"}
            )
            logger.info(f"Created new collection: {self.collection_name}")
            
            self.category_keywords = {
                'technology': ['artificial intelligence', 'software', 'tech', 'AI', 'machine learning'],
                'health': ['medical', 'healthcare', 'health', 'treatment', 'medicine'],
                'business': ['market', 'business', 'finance', 'investment', 'company'],
                'science': ['research', 'scientific', 'discovery', 'study', 'scientist']
            }
            
        except Exception as e:
            logger.error(f"Error initializing ChromaDB client: {e}")
            raise

    def add_articles(self, articles: List[Dict]):
        try:
            # Validate embeddings
            ids = []
            embeddings = []
            metadatas = []
            documents = []
            for i, article in enumerate(articles):
                if 'embedding' in article and article['embedding'].shape[0] == config['embedding_dimension']:
                    ids.append(str(i))
                    embeddings.append(article['embedding'].tolist())
                    metadatas.append({
                        "title": article['title'],
                        "category": article['category'],
                        "date": article.get('date', ''),
                        "link": article.get('link', '#'),
                        "summary": article.get('summary', '')
                    })
                    documents.append(article['content'])
            
            # Add to collection in batches
            batch_size = config['batch_size']
            for i in range(0, len(ids), batch_size):
                batch_end = min(i + batch_size, len(ids))
                self.collection.add(
                    ids=ids[i:batch_end],
                    embeddings=embeddings[i:batch_end],
                    metadatas=metadatas[i:batch_end],
                    documents=documents[i:batch_end]
                )
            logger.info(f"Added {len(ids)} articles to the collection")
            
        except Exception as e:
            logger.error(f"Error adding articles to ChromaDB: {e}")
            raise

    def get_personalized_recommendations(self, user_preferences: List[str], num_results: int = 5) -> List[Dict]:
        try:
            # Limit keyword expansion
            expanded_preferences = user_preferences.copy()
            for pref in user_preferences:
                for category, keywords in self.category_keywords.items():
                    if any(keyword in pref.lower() for keyword in keywords):
                        expanded_preferences.extend(keywords[:3])  # Top 3 keywords
            
            query = " ".join(list(set(expanded_preferences)))
            query_embedding = embedder.get_embedding(query)
            
            results = self.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=min(num_results * 2, self.collection.count()),
                include=["metadatas", "documents", "distances"]
            )
            
            recommendations = []
            if results['distances'] and results['distances'][0]:
                distances = np.array(results['distances'][0])
                max_dist = np.max(distances)
                min_dist = np.min(distances)
                if max_dist == min_dist:
                    similarity_scores = np.ones_like(distances) * 100
                else:
                    similarity_scores = (1 - (distances - min_dist) / (max_dist - min_dist)) * 100
                
                for i in range(len(results['ids'][0])):
                    metadata = results['metadatas'][0][i]
                    content = results['documents'][0][i]
                    content_lower = content.lower()
                    title_lower = metadata['title'].lower()
                    matches_preferences = any(
                        pref.lower() in content_lower or pref.lower() in title_lower 
                        for pref in expanded_preferences
                    )
                    
                    if matches_preferences:
                        recommendations.append({
                            "title": metadata['title'],
                            "category": metadata['category'],
                            "content": content,
                            "summary": metadata.get('summary', ''),
                            "link": metadata.get('link', '#'),
                            "relevance_score": float(similarity_scores[i])
                        })
            
            recommendations.sort(key=lambda x: x['relevance_score'], reverse=True)
            return recommendations[:num_results]
            
        except Exception as e:
            logger.error(f"Error getting recommendations: {e}")
            return []

# Initialize RAG system
try:
    print("Initializing RAG system...")
    rag_system = NewsRAGSystem()
    
    print("Adding articles to RAG system...")
    rag_system.add_articles(all_articles)
    
    user_profiles = {
        "tech_enthusiast": ["artificial intelligence", "software development", "tech startups"],
        "health_conscious": ["medical research", "healthcare innovation", "wellness"],
        "business_analyst": ["market trends", "economic policy", "business strategy"]
    }

    print("\nTesting personalized recommendations:")
    for profile, interests in user_profiles.items():
        print(f"\nRecommendations for {profile}:")
        recommendations = rag_system.get_personalized_recommendations(interests)
        for rec in recommendations:
            print(f"- {rec['title']} ({rec['category']}) - Relevance: {rec['relevance_score']:.2f}%")

except Exception as e:
    logger.error(f"Error in RAG system setup: {e}")
    print(f"Failed to initialize RAG system: {str(e)}")

Initializing RAG system...
Adding articles to RAG system...

Testing personalized recommendations:

Recommendations for tech_enthusiast:
- Top 10 AI Content Generator & Writer Tools in 2022 (technology) - Relevance: 100.00%
- Top 10 AI Tools in 2023 That Will Make Your Life Easier (technology) - Relevance: 97.43%
- A DOGE Recruiter Is Staffing a Project to Deploy AI Agents Across the US Government (technology) - Relevance: 87.08%
- Top 10 AI Tools That Will Transform Your Content Creation in 2025 (technology) - Relevance: 73.77%
- LimeWire AI Studio Review 2023: Details, Pricing & Features (technology) - Relevance: 71.41%

Recommendations for health_conscious:
- GOARN marks 25 years of advancing global health emergency preparedness and response (health) - Relevance: 100.00%
- RFK Jr. Orders Search for New Measles Treatments Instead of Urging Vaccination (health) - Relevance: 58.92%
- U.S. Prosecutors Accuse Large Insurers of Paying Kickbacks for Private Medicare Plans (health) - Releva

In [7]:
# Cell 6: Enhanced Interactive Dashboard

from IPython.display import HTML, display
import re
from collections import Counter

class NewsDashboard:
    def __init__(self, rag_system: NewsRAGSystem):
        self.rag_system = rag_system
        
    def create_article_card(self, article: Dict, index: int) -> str:
        link = article.get('link', '#')
        if not link.startswith('http') and link != '#':
            link = f"https://{link.lstrip('/')}"
        
        summary = article.get('summary', 'No summary available.')
        relevance_percentage = article['relevance_score']
        
        return f"""
        <div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; border-radius: 5px; background-color: white;">
            <h3>{index}. {article['title']}</h3>
            <div style="margin: 10px 0;">
                <span style="background: #e1e1e1; padding: 5px 10px; border-radius: 15px; margin-right: 10px;">
                    <strong>Category:</strong> {article['category']}
                </span>
                <span style="background: #e1e1e1; padding: 5px 10px; border-radius: 15px;">
                    <strong>Relevance:</strong> {relevance_percentage:.1f}%
                </span>
            </div>
            <p style="color: #666;">{summary}</p>
            <a href="{link}" target="_blank" style="color: #0366d6; text-decoration: none;">
                Read full article →
            </a>
        </div>
        """

    def create_trending_topics_section(self, articles: List[Dict]) -> str:
        all_text = ' '.join([
            a['title'] + ' ' + a.get('summary', '') 
            for a in articles
        ])
        
        words = re.findall(r'\w+', all_text.lower())
        stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
        words = [w for w in words if w not in stop_words and len(w) > 3]
        
        topics = Counter(words).most_common(10)
        
        topics_html = """
        <div style="margin: 20px 0; padding: 15px; background-color: white; border-radius: 5px; border: 1px solid #ddd;">
            <h3 style="margin-bottom: 15px;">📈 Trending Topics</h3>
            <div style="display: flex; flex-wrap: wrap; gap: 10px;">
        """
        
        for word, count in topics:
            topics_html += f"""
            <span style="background: #f0f0f0; padding: 5px 15px; border-radius: 20px; 
            font-size: 14px; color: #444;">
                {word} ({count})
            </span>
            """
            
        topics_html += "</div></div>"
        return topics_html

    def display_personalized_feed(self, user_preferences: List[str]):
        recommendations = self.rag_system.get_personalized_recommendations(user_preferences)
        
        dashboard_html = f"""
        <div style="max-width: 800px; margin: 0 auto; font-family: Arial, sans-serif;">
            <h2 style="color: #333;">Your Personalized News Feed</h2>
            <p style="color: #666;">Based on interests: {', '.join(user_preferences)}</p>
        """
        
        dashboard_html += self.create_trending_topics_section(recommendations)
        
        dashboard_html += "<div style='margin-top: 20px;'>"
        for i, article in enumerate(recommendations, 1):
            dashboard_html += self.create_article_card(article, i)
        dashboard_html += "</div></div>"
        
        display(HTML(dashboard_html))

# Initialize and display dashboard
print("Initializing News Dashboard...")
dashboard = NewsDashboard(rag_system)

for profile, interests in user_profiles.items():
    print(f"\nDashboard for {profile}")
    dashboard.display_personalized_feed(interests)

Initializing News Dashboard...

Dashboard for tech_enthusiast



Dashboard for health_conscious



Dashboard for business_analyst


In [8]:
# Cell 7: System Evaluation and Analytics

import random

class NewsSystemEvaluator:
    def __init__(self, generation_model):
        self.model = generation_model
        self.metrics = {
            'summary_quality': [],
            'recommendation_relevance': [],
            'system_performance': {}
        }
        logger.warning("Using same model for evaluation may introduce bias. Consider external validation.")

    def evaluate_summary(self, article: Dict, mock: bool = False) -> Dict:
        if mock:
            logger.info(f"Mock evaluation for {article['title']}")
            return {
                'article_title': article['title'],
                'evaluation_text': (
                    "Accuracy Score: 8\n"
                    "Conciseness Score: 7\n"
                    "Clarity Score: 8\n"
                    "Overall Score: 7.7\n"
                    "Feedback: Mock evaluation for testing."
                ),
                'timestamp': datetime.now().isoformat(),
                'success': True
            }
        
        try:
            prompt = f"""
            Evaluate this news summary on the following criteria (score 1-10):
            1. Accuracy: Does it capture the main points?
            2. Conciseness: Is it appropriately brief?
            3. Clarity: Is it easy to understand?

            Original Article (excerpt): {article['content'][:500]}...
            Summary: {article.get('summary', 'No summary available')}

            Format your response as:
            Accuracy Score: [1-10]
            Conciseness Score: [1-10]
            Clarity Score: [1-10]
            Overall Score: [average]
            Feedback: [brief feedback]
            """
            
            response = self.model.generate_content(prompt)
            
            return {
                'article_title': article['title'],
                'evaluation_text': response.text,
                'timestamp': datetime.now().isoformat(),
                'success': True
            }
        except Exception as e:
            logger.error(f"Error evaluating summary: {e}")
            return {
                'article_title': article.get('title', 'Unknown'),
                'evaluation_text': f"Evaluation failed: {str(e)}. Check quota at https://console.cloud.google.com/iam-admin/quotas.",
                'timestamp': datetime.now().isoformat(),
                'success': False
            }

    def evaluate_recommendations(self, user_preferences: List[str], recommendations: List[Dict]) -> Dict:
        try:
            relevance_scores = [rec.get('relevance_score', 0) for rec in recommendations]
            return {
                'average_relevance': np.mean(relevance_scores) if relevance_scores else 0,
                'preference_coverage': len(set(user_preferences)),
                'recommendation_count': len(recommendations),
                'success': True
            }
        except Exception as e:
            logger.error(f"Error evaluating recommendations: {e}")
            return {
                'error': str(e),
                'success': False
            }

    def generate_report(self, mock_evaluation: bool = False):
        try:
            if not all_articles:
                raise ValueError("No articles available for evaluation. Check news fetching.")

            # Check for cached metrics
            metrics_file = os.path.join(config['cache_dir'], 'evaluation_metrics.json')
            if os.path.exists(metrics_file) and not mock_evaluation:
                with open(metrics_file, 'r') as f:
                    cached_metrics = json.load(f)
                logger.info("Using cached evaluation metrics to conserve quota.")
                self.metrics = cached_metrics
                report = "# News System Evaluation Report\n\nUsing cached metrics due to quota limits.\n"
                report += "## Summary Quality Analysis\n"
                for i, eval_data in enumerate(self.metrics['summary_quality']):
                    report += f"\n### Summary {i+1}: {eval_data['article_title']}\n"
                    report += f"Evaluation Results:\n{eval_data['evaluation_text']}\n"
                    report += f"Timestamp: {eval_data['timestamp']}\n"
                    report += "-" * 50 + "\n"
                report += "\n## Recommendation Performance Analysis\n"
                for rec_eval in self.metrics['recommendation_relevance']:
                    report += f"\n### Profile: {rec_eval['profile']}\n"
                    metrics = rec_eval['metrics']
                    if metrics.get('success', False):
                        report += f"- Average Relevance: {metrics['average_relevance']:.2f}\n"
                        report += f"- Preference Coverage: {metrics['preference_coverage']}\n"
                        report += f"- Recommendations: {metrics['recommendation_count']}\n"
                    else:
                        report += f"- Evaluation failed: {metrics.get('error', 'Unknown error')}\n"
                report += "\n## System Performance Metrics\n"
                for key, value in self.metrics['system_performance'].items():
                    report += f"- {key}: {value}\n"
                display(Markdown(report))
                return

            # Evaluate summaries
            print("Evaluating summaries...")
            sample_size = min(2, len(all_articles))  # Reduced to conserve quota
            sample_articles = random.sample(all_articles, sample_size) if sample_size <= len(all_articles) else all_articles
            for article in tqdm(sample_articles, desc="Evaluating summaries"):
                evaluation = self.evaluate_summary(article, mock=mock_evaluation)
                self.metrics['summary_quality'].append(evaluation)

            # Evaluate recommendations
            print("\nEvaluating recommendations...")
            for profile, interests in user_profiles.items():
                recommendations = rag_system.get_personalized_recommendations(interests)
                self.metrics['recommendation_relevance'].append({
                    'profile': profile,
                    'metrics': self.evaluate_recommendations(interests, recommendations)
                })

            # Generate report
            report = """
            # News System Evaluation Report

            ## Summary Quality Analysis
            """
            
            for i, eval_data in enumerate(self.metrics['summary_quality']):
                report += f"\n### Summary {i+1}: {eval_data['article_title']}\n"
                report += f"Evaluation Results:\n{eval_data['evaluation_text']}\n"
                report += f"Timestamp: {eval_data['timestamp']}\n"
                report += "-" * 50 + "\n"

            report += "\n## Recommendation Performance Analysis\n"
            for rec_eval in self.metrics['recommendation_relevance']:
                report += f"\n### Profile: {rec_eval['profile']}\n"
                metrics = rec_eval['metrics']
                if metrics.get('success', False):
                    report += f"- Average Relevance: {metrics['average_relevance']:.2f}\n"
                    report += f"- Preference Coverage: {metrics['preference_coverage']}\n"
                    report += f"- Recommendations: {metrics['recommendation_count']}\n"
                else:
                    report += f"- Evaluation failed: {metrics.get('error', 'Unknown error')}\n"

            self.metrics['system_performance'] = {
                'total_articles': len(all_articles),
                'evaluation_timestamp': datetime.now().isoformat(),
                'successful_summaries': sum(1 for e in self.metrics['summary_quality'] if e['success']),
                'successful_recommendations': sum(1 for e in self.metrics['recommendation_relevance'] 
                                               if e['metrics'].get('success', False))
            }

            report += "\n## System Performance Metrics\n"
            for key, value in self.metrics['system_performance'].items():
                report += f"- {key}: {value}\n"

            display(Markdown(report))
            self.save_metrics()

        except Exception as e:
            logger.error(f"Error generating report: {e}")
            display(Markdown(f"# Error Generating Report\nAn error occurred: {str(e)}"))

    def save_metrics(self):
        try:
            metrics_file = os.path.join(config['cache_dir'], 'evaluation_metrics.json')
            with open(metrics_file, 'w') as f:
                metrics_dict = json.loads(json.dumps(self.metrics, default=str))
                json.dump(metrics_dict, f, indent=2)
            logger.info(f"Metrics saved to {metrics_file}")
        except Exception as e:
            logger.error(f"Error saving metrics: {e}")

# Run evaluation
try:
    print("Starting system evaluation...")
    evaluator = NewsSystemEvaluator(generation_model)
    evaluator.generate_report(mock_evaluation=False)  # Use mock mode to avoid API calls
    print("Evaluation completed successfully!")
except Exception as e:
    logger.error(f"Fatal error in evaluation: {e}")
    print(f"Evaluation failed: {str(e)}")

Starting system evaluation...
Evaluating summaries...


Evaluating summaries: 100%|██████████| 2/2 [00:02<00:00,  1.21s/it]


Evaluating recommendations...






            # News System Evaluation Report

            ## Summary Quality Analysis
            
### Summary 1: Trump gutted two landmark environmental reports — can researchers save them?
Evaluation Results:
Evaluation failed: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 36
}
]. Check quota at https://console.cloud.google.com/iam-admin/quotas.
Timestamp: 2025-05-02T21:15:24.019571
--------------------------------------------------

### Summary 2: Trump’s 2026 Budget Proposes Cutting NASA Funding by $6 Billion
Evaluation Results:
Accuracy Score: 9
Conciseness Score: 10
Clarity Score: 10
Overall Score: 9.7
Feedback: The summary accurately captures the main points of the excerpt: the shift in NASA's focus and the proposed budget cut.  It's concise and easy to understand.  It could have included the increase in Mars funding, but given the focus on the overall budget reduction, this omission doesn't significantly detract.

Timestamp: 2025-05-02T21:15:26.324736
--------------------------------------------------

## Recommendation Performance Analysis

### Profile: tech_enthusiast
- Average Relevance: 85.94
- Preference Coverage: 3
- Recommendations: 5

### Profile: health_conscious
- Average Relevance: 52.43
- Preference Coverage: 3
- Recommendations: 5

### Profile: business_analyst
- Average Relevance: 73.77
- Preference Coverage: 3
- Recommendations: 5

## System Performance Metrics
- total_articles: 45
- evaluation_timestamp: 2025-05-02T21:15:26.385700
- successful_summaries: 1
- successful_recommendations: 3


Evaluation completed successfully!
