<a href="https://www.kaggle.com/code/mohamedaymanelkhatib/ai-powered-news-summarizer?scriptVersionId=232556407" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install -q chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 kB[0m [31m4.2 MB/s[0m eta [

In [2]:
# Cell 1: Setup and Configuration

# Import necessary libraries
import os
import logging
import json
from datetime import datetime
import numpy as np
import pandas as pd
from typing import List, Dict, Any
from IPython.display import Markdown, display
import requests
import bs4
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import yaml
import shutil

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Configuration settings
config = {
    'max_articles_per_source': 5,
    'request_timeout': 10,
    'embedding_dimension': 768,
    'cache_dir': './cache',
    'database_path': './vectordb',
    'min_content_length': 100,
    'max_summary_length': 150,
    'batch_size': 100,
    'relevance_threshold': 0.3,
    'max_text_length': 2048,
    'cache_expiry_hours': 24
}

# Create necessary directories
for directory in [config['cache_dir'], config['database_path']]:
    os.makedirs(directory, exist_ok=True)

# Install required packages
def setup_environment():
    try:
        !pip install -q "google-generativeai>=0.3.1" "chromadb>=0.6.3" "requests" \
            "beautifulsoup4" "lxml[html_clean]" "newspaper3k" "pyyaml" "tqdm"
        logger.info("Packages installed successfully")
    except Exception as e:
        logger.error(f"Error installing packages: {e}")
        raise

# Initialize API and models
def initialize_models():
    try:
        import google.generativeai as genai
        from kaggle_secrets import UserSecretsClient
        
        # Get API key securely
        GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
        genai.configure(api_key=GOOGLE_API_KEY)
        
        # Find available models
        models = genai.list_models()
        text_model = next((m.name for m in models if "gemini-pro" in m.name.lower()), "gemini-1.0-pro")
        embedding_model = next((m.name for m in models if "embedding" in m.name.lower()), "embedding-001")
        
        # Model configurations
        generation_config = {
            "temperature": 0.7,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 1024,
        }
        
        safety_settings = [
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
        ]
        
        generation_model = genai.GenerativeModel(
            model_name=text_model,
            generation_config=generation_config,
            safety_settings=safety_settings
        )
        
        return generation_model, embedding_model
        
    except Exception as e:
        logger.error(f"Error initializing models: {e}")
        raise

# Run setup
setup_environment()
generation_model, embedding_model = initialize_models()

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.7/149.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [3]:
# Cell 2: Enhanced News Fetching

class NewsArticleFetcher:
    def __init__(self, config: Dict):
        self.config = config
        self.cache_file = os.path.join(config['cache_dir'], 'news_cache.json')
        self.rss_feeds = {
            'technology': [
                'https://feeds.feedburner.com/TechCrunch/',
                'https://www.wired.com/feed/rss',
                'https://www.theverge.com/rss/index.xml'
            ],
            'business': [
                'https://feeds.marketwatch.com/marketwatch/topstories/',
                'https://www.forbes.com/business/feed/',
                'https://www.ft.com/rss/home'
            ],
            'science': [
                'https://rss.nytimes.com/services/xml/rss/nyt/Science.xml',
                'https://www.sciencedaily.com/rss/all.xml',
                'https://www.nature.com/nature.rss'
            ],
            'health': [
                'https://rss.nytimes.com/services/xml/rss/nyt/Health.xml',
                'https://www.who.int/rss-feeds/news-english.xml',
                'https://www.healthline.com/rss/news'
            ]
        }

    def load_cache(self) -> Dict:
        try:
            if os.path.exists(self.cache_file):
                with open(self.cache_file, 'r') as f:
                    cache = json.load(f)
                # Check cache expiry
                cache_time = datetime.fromisoformat(cache.get('timestamp', '2000-01-01'))
                if (datetime.now() - cache_time).total_seconds() < self.config['cache_expiry_hours'] * 3600:
                    return cache.get('articles', {})
            return {}
        except Exception as e:
            logger.error(f"Error loading cache: {e}")
            return {}

    def save_cache(self, articles: List[Dict]):
        try:
            cache_data = {
                'timestamp': datetime.now().isoformat(),
                'articles': articles
            }
            with open(self.cache_file, 'w') as f:
                json.dump(cache_data, f)
        except Exception as e:
            logger.error(f"Error saving cache: {e}")

    def clean_text(self, text: str) -> str:
        """Clean and normalize text content."""
        if not text:
            return ""
        # Remove extra whitespace
        text = ' '.join(text.split())
        # Remove special characters but keep punctuation
        text = ''.join(char for char in text if char.isprintable())
        return text

    def extract_article_content(self, url: str) -> Dict:
        """Extract article content using newspaper3k or fallback method."""
        try:
            if 'Article' in globals():
                article = Article(url)
                article.download()
                article.parse()
                return {
                    'content': self.clean_text(article.text),
                    'success': True
                }
            else:
                response = requests.get(url, timeout=self.config['request_timeout'])
                soup = BeautifulSoup(response.content, 'html.parser')
                paragraphs = soup.find_all('p')
                content = ' '.join([p.text for p in paragraphs])
                return {
                    'content': self.clean_text(content),
                    'success': True
                }
        except Exception as e:
            logger.warning(f"Error extracting content from {url}: {e}")
            return {
                'content': '',
                'success': False
            }

    def fetch_news_from_rss(self, rss_url: str, category: str) -> List[Dict]:
        """Fetch news articles from RSS feed."""
        try:
            response = requests.get(rss_url, timeout=self.config['request_timeout'])
            soup = BeautifulSoup(response.content, features="xml")
            items = soup.findAll('item')[:self.config['max_articles_per_source']]
            
            articles = []
            for item in items:
                try:
                    title = item.find('title').text
                    link = item.find('link').text
                    
                    # Ensure link is properly formatted
                    if not link.startswith('http'):
                        link = f"https://{link}"
                    
                    # Get description
                    description = ""
                    if item.find('description'):
                        description = BeautifulSoup(item.find('description').text, 
                                                  'html.parser').get_text()
                    
                    # Extract content
                    content_data = self.extract_article_content(link)
                    
                    if content_data['success'] and len(content_data['content']) >= self.config['min_content_length']:
                        articles.append({
                            'title': self.clean_text(title),
                            'link': link,
                            'content': content_data['content'],
                            'description': self.clean_text(description),
                            'category': category,
                            'date': datetime.now().strftime("%Y-%m-%d"),
                            'source': rss_url
                        })
                    
                except Exception as e:
                    logger.warning(f"Error processing article from {rss_url}: {e}")
                    continue
                
            return articles
            
        except Exception as e:
            logger.error(f"Error fetching from RSS {rss_url}: {e}")
            return []

    def fetch_all_news(self) -> List[Dict]:
        """Fetch news from all sources with caching."""
        # Check cache first
        cached_articles = self.load_cache()
        if cached_articles:
            logger.info("Using cached news data")
            return cached_articles

        all_articles = []
        
        # Fetch from all RSS feeds
        for category, feeds in tqdm(self.rss_feeds.items(), desc="Fetching news categories"):
            for feed_url in feeds:
                try:
                    articles = self.fetch_news_from_rss(feed_url, category)
                    all_articles.extend(articles)
                    time.sleep(1)  # Rate limiting
                except Exception as e:
                    logger.error(f"Error processing feed {feed_url}: {e}")
                    continue

        # Save to cache
        self.save_cache(all_articles)
        
        return all_articles

# Initialize fetcher and get news
news_fetcher = NewsArticleFetcher(config)
all_articles = news_fetcher.fetch_all_news()

# Display statistics
print("\nNews Collection Statistics:")
print(f"Total articles collected: {len(all_articles)}")
category_counts = {}
for article in all_articles:
    category_counts[article['category']] = category_counts.get(article['category'], 0) + 1
print("\nArticles per category:")
for category, count in category_counts.items():
    print(f"- {category}: {count}")

# Display sample article
if all_articles:
    sample_article = all_articles[0]
    print("\nSample Article:")
    print(f"Title: {sample_article['title']}")
    print(f"Category: {sample_article['category']}")
    print(f"Content preview: {sample_article['content'][:200]}...")
else:
    print("\nNo articles collected. Please check the logs for errors.")

Fetching news categories: 100%|██████████| 4/4 [00:24<00:00,  6.24s/it]


News Collection Statistics:
Total articles collected: 25

Articles per category:
- technology: 10
- business: 5
- science: 5
- health: 5

Sample Article:
Title: Top 10 AI Tools That Will Transform Your Content Creation in 2025
Category: technology
Content preview: Looking to level up your content creation game in 2025? You're in the right place! The digital landscape has evolved dramatically, and AI tools have become essential for creators who want to stay ahea...





In [4]:
# Cell 3: Enhanced Summarization System

class NewsSummarizer:
    def __init__(self, model, config: Dict):
        self.model = model
        self.config = config
        self.cache_file = os.path.join(config['cache_dir'], 'summary_cache.json')
        
        # Few-shot examples for different types of news
        self.examples = {
            'technology': {
                'article': "The European Union has approved a new directive aimed at reducing single-use plastics. The directive will ban items such as plastic straws, cutlery, and cotton buds by 2021. Member states will also be required to collect and recycle 90% of plastic bottles by 2029.",
                'summary': "EU bans single-use plastics by 2021 and mandates 90% recycling of plastic bottles by 2029."
            },
            'business': {
                'article': "Apple Inc. reported record-breaking quarterly earnings, with revenue reaching $111.4 billion. iPhone sales drove the surge, particularly in China where the new 5G models have been extremely popular. The company's services division also saw significant growth.",
                'summary': "Apple posts record $111.4B quarterly revenue, driven by iPhone sales in China and services growth."
            },
            'science': {
                'article': "Researchers at Stanford University have developed a new artificial intelligence system capable of diagnosing pneumonia from chest X-rays more accurately than radiologists. The system, trained on over 100,000 X-ray images, achieved a 95% accuracy rate compared to the 92% average for human experts.",
                'summary': "Stanford AI system diagnoses pneumonia from X-rays with 95% accuracy, outperforming human radiologists."
            },
            'health': {
                'article': "A large-scale study involving 50,000 participants has found that daily meditation can reduce anxiety levels by up to 40%. The research, conducted over five years, showed significant improvements in mental health metrics among regular practitioners.",
                'summary': "Major study finds daily meditation reduces anxiety by 40%, showing significant mental health benefits."
            }
        }

    def load_cache(self) -> Dict:
        try:
            if os.path.exists(self.cache_file):
                with open(self.cache_file, 'r') as f:
                    return json.load(f)
            return {}
        except Exception as e:
            logger.error(f"Error loading summary cache: {e}")
            return {}

    def save_cache(self, cache_data: Dict):
        try:
            with open(self.cache_file, 'w') as f:
                json.dump(cache_data, f)
        except Exception as e:
            logger.error(f"Error saving summary cache: {e}")

    def create_few_shot_prompt(self, article_content: str, category: str) -> str:
        """Create category-specific few-shot prompt."""
        # Get relevant example for the category
        example = self.examples.get(category, self.examples['technology'])
        
        prompt = f"""
        Generate a concise, informative summary of the news article. Focus on key facts and main points.
        Keep the summary clear and engaging. Maintain journalistic style.

        Example Article: {example['article']}
        Example Summary: {example['summary']}

        Article: {article_content}
        Summary:"""
        
        return prompt

    def generate_summary(self, article: Dict) -> str:
        """Generate summary with caching and error handling."""
        try:
            # Check cache first
            cache = self.load_cache()
            content_hash = hash(article['content'])
            
            if str(content_hash) in cache:
                return cache[str(content_hash)]
            
            # Prepare content for summarization
            content = article['content']
            if len(content) > 5000:
                content = content[:2500] + " ... " + content[-2500:]
            
            # Generate summary
            prompt = self.create_few_shot_prompt(content, article['category'])
            response = self.model.generate_content(prompt)
            summary = response.text.strip()
            
            # Clean and format summary
            summary = summary.replace('\n', ' ').strip()
            if len(summary) > self.config['max_summary_length']:
                summary = summary[:self.config['max_summary_length']].rsplit(' ', 1)[0] + '...'
            
            # Cache the result
            cache[str(content_hash)] = summary
            self.save_cache(cache)
            
            return summary
            
        except Exception as e:
            logger.error(f"Error generating summary: {e}")
            # Fallback to extractive summary
            sentences = article['content'].split('.')[:3]
            return '. '.join(sentences) + '...'

    def process_articles(self, articles: List[Dict]) -> List[Dict]:
        """Process all articles and add summaries."""
        logger.info("Generating summaries for articles...")
        for article in tqdm(articles, desc="Generating summaries"):
            if 'summary' not in article:
                article['summary'] = self.generate_summary(article)
        return articles

# Initialize summarizer and process articles
summarizer = NewsSummarizer(generation_model, config)
all_articles = summarizer.process_articles(all_articles)

# Display sample summaries
print("\nSample Summaries:")
for i, article in enumerate(all_articles[:3]):
    print(f"\n{i+1}. {article['title']}")
    print(f"Original length: {len(article['content'])} characters")
    print(f"Summary length: {len(article['summary'])} characters")
    print(f"Summary: {article['summary']}")

Generating summaries: 100%|██████████| 25/25 [00:06<00:00,  3.91it/s]


Sample Summaries:

1. Top 10 AI Tools That Will Transform Your Content Creation in 2025
Original length: 7186 characters
Summary length: 435 characters
Summary: Looking to level up your content creation game in 2025? You're in the right place! The digital landscape has evolved dramatically, and AI tools have become essential for creators who want to stay ahead of the curve.  In this guide, I'll show you the top 10 AI tools that are revolutionizing content creation and making creators' lives easier.  Why You Need These AI Tools in 2025 Content creation has become more demanding than ever...

2. LimeWire AI Studio Review 2023: Details, Pricing & Features
Original length: 9699 characters
Summary length: 415 characters
Summary: In the rapidly advancing landscape of AI technology and innovation, LimeWire emerges as a unique platform in the realm of generative AI tools.  This platform not only stands out from the multitude of existing AI tools but also brings a fresh approach to content gen




In [5]:
# Embeddings System

# Cell 4: Enhanced Embeddings System

class ArticleEmbedder:
    def __init__(self, model_name: str, config: Dict):
        self.model_name = model_name
        self.config = config
        self.cache_file = os.path.join(config['cache_dir'], 'embedding_cache.json')
        self.dimension = config['embedding_dimension']

    def load_cache(self) -> Dict:
        try:
            if os.path.exists(self.cache_file):
                with open(self.cache_file, 'r') as f:
                    cache = json.load(f)
                    # Convert lists back to numpy arrays
                    return {k: np.array(v) for k, v in cache.items()}
            return {}
        except Exception as e:
            logger.error(f"Error loading embedding cache: {e}")
            return {}

    def save_cache(self, cache_data: Dict):
        try:
            # Convert numpy arrays to lists for JSON serialization
            serializable_cache = {k: v.tolist() for k, v in cache_data.items()}
            with open(self.cache_file, 'w') as f:
                json.dump(serializable_cache, f)
        except Exception as e:
            logger.error(f"Error saving embedding cache: {e}")

    def prepare_text_for_embedding(self, text: str) -> str:
        """Prepare text for embedding generation."""
        # Clean and truncate text if necessary
        text = text.replace('\n', ' ').strip()
        if len(text) > self.config['max_text_length']:
            text = text[:self.config['max_text_length']//2] + " ... " + text[-self.config['max_text_length']//2:]
        return text

    def generate_embedding(self, text: str) -> np.ndarray:
        """Generate embedding with error handling."""
        try:
            import google.generativeai as genai
            
            # Prepare text
            text = self.prepare_text_for_embedding(text)
            
            # Generate embedding
            embedding_model = genai.GenerativeModel(self.model_name)
            result = embedding_model.embed_content(text=text)
            
            return np.array(result.embedding)
            
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            # Return random embedding as fallback
            return np.random.randn(self.dimension)

    def get_embedding(self, text: str) -> np.ndarray:
        """Get embedding with caching."""
        # Check cache first
        cache = self.load_cache()
        text_hash = hash(text)
        
        if str(text_hash) in cache:
            return cache[str(text_hash)]
        
        # Generate new embedding
        embedding = self.generate_embedding(text)
        
        # Cache the result
        cache[str(text_hash)] = embedding
        self.save_cache(cache)
        
        return embedding

    def process_articles(self, articles: List[Dict]) -> List[Dict]:
        """Process all articles and add embeddings."""
        logger.info("Generating embeddings for articles...")
        for article in tqdm(articles, desc="Generating embeddings"):
            if 'embedding' not in article:
                # Combine title, summary, and content for better embedding
                combined_text = f"{article['title']} {article.get('summary', '')} {article['content'][:1000]}"
                article['embedding'] = self.get_embedding(combined_text)
        return articles

    def calculate_similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
        """Calculate cosine similarity between embeddings."""
        try:
            dot_product = np.dot(embedding1, embedding2)
            norm1 = np.linalg.norm(embedding1)
            norm2 = np.linalg.norm(embedding2)
            return dot_product / (norm1 * norm2) if norm1 * norm2 != 0 else 0
        except Exception as e:
            logger.error(f"Error calculating similarity: {e}")
            return 0

# Initialize embedder and process articles
embedder = ArticleEmbedder(embedding_model, config)
all_articles = embedder.process_articles(all_articles)

# Demonstrate embedding similarity
print("\nDemonstrating Embedding Similarities:")
if len(all_articles) >= 2:
    article1 = all_articles[0]
    print(f"\nReference Article: {article1['title']}")
    
    # Find similar articles
    similarities = []
    for i, article2 in enumerate(all_articles[1:], 1):
        similarity = embedder.calculate_similarity(article1['embedding'], article2['embedding'])
        similarities.append((i, similarity, article2['title']))
    
    # Display top 3 similar articles
    print("\nMost Similar Articles:")
    for idx, similarity, title in sorted(similarities, key=lambda x: x[1], reverse=True)[:3]:
        print(f"- {title} (Similarity: {similarity:.4f})")

Generating embeddings: 100%|██████████| 25/25 [00:00<00:00, 74.89it/s] 


Demonstrating Embedding Similarities:

Reference Article: Top 10 AI Tools That Will Transform Your Content Creation in 2025

Most Similar Articles:
- 154 million lives and counting: 5 charts reveal the power of vaccines (Similarity: 0.1076)
- Why more AI researchers should collaborate with governments (Similarity: 0.0639)
- Studying seabirds with a cactus as a research assistant (Similarity: 0.0397)





In [6]:
# Add this at the end of Cell 4 or as a new test cell

def test_relevance_examples():
    # Test cases with known relationships
    test_articles = [
        {
            'title': 'OpenAI Releases GPT-5 with Breakthrough Capabilities',
            'content': 'OpenAI has announced the release of GPT-5, featuring unprecedented language understanding and generation capabilities. The new model shows significant improvements in reasoning, coding, and multimodal tasks. Tech industry experts predict major impacts across various sectors.',
            'category': 'technology',
            'summary': 'OpenAI launches GPT-5 with advanced capabilities in reasoning, coding, and multimodal tasks.'
        },
        {
            'title': "Google's New AI Model Challenges GPT-5",
            'content': "Google has unveiled its latest AI model, directly competing with OpenAI's GPT-5. The model demonstrates superior performance in technical tasks and multilingual capabilities. Industry analysts are closely watching this AI arms race.",
            'category': 'technology',
            'summary': 'Google releases new AI model competing with GPT-5, showing strong technical and multilingual capabilities.'
        },
        {
            'title': 'Tech Stocks Surge Following AI Announcements',
            'content': 'Major tech stocks saw significant gains after announcements from OpenAI and Google about their new AI models. Investors are betting big on AI technology companies, with particularly strong performance in chip manufacturers.',
            'category': 'business',
            'summary': 'Tech stocks rise after AI model announcements from OpenAI and Google.'
        },
        {
            'title': 'New Cancer Treatment Shows Promise',
            'content': 'A breakthrough in cancer treatment using targeted immunotherapy has shown remarkable results in clinical trials. The treatment specifically targets aggressive forms of breast cancer.',
            'category': 'health',
            'summary': 'Novel immunotherapy treatment shows promising results for aggressive breast cancer.'
        }
    ]

    # Process test articles
    print("Processing test articles...")
    for article in test_articles:
        combined_text = f"{article['title']} {article['summary']} {article['content']}"
        article['embedding'] = embedder.get_embedding(combined_text)

    # Test different user preferences
    test_preferences = {
        "ai_enthusiast": ["artificial intelligence", "machine learning", "GPT models"],
        "tech_investor": ["tech stocks", "AI companies", "market trends"],
        "healthcare_professional": ["medical research", "cancer treatment", "clinical trials"]
    }

    print("\nRelevance Testing Results:")
    for profile, preferences in test_preferences.items():
        print(f"\n{'-'*50}")
        print(f"User Profile: {profile}")
        print(f"Interests: {', '.join(preferences)}")
        
        # Generate preference embedding
        pref_text = " ".join(preferences)
        pref_embedding = embedder.get_embedding(pref_text)
        
        # Calculate relevance scores
        relevance_scores = []
        for article in test_articles:
            similarity = embedder.calculate_similarity(pref_embedding, article['embedding'])
            relevance_scores.append({
                'title': article['title'],
                'category': article['category'],
                'relevance': similarity
            })
        
        # Sort and display results
        relevance_scores.sort(key=lambda x: x['relevance'], reverse=True)
        print("\nArticle Relevance Ranking:")
        for i, score in enumerate(relevance_scores, 1):
            print(f"\n{i}. {score['title']}")
            print(f"Category: {score['category']}")
            print(f"Relevance Score: {score['relevance']:.4f}")

# Run the test
test_relevance_examples()

# Additional specific comparisons
def compare_specific_articles(articles: List[Dict]):
    print("\nSpecific Article Comparisons:")
    
    # Find technology articles about AI
    ai_articles = [a for a in articles 
                   if a['category'] == 'technology' 
                   and ('AI' in a['title'] or 'artificial intelligence' in a['title'].lower())]
    
    if len(ai_articles) >= 2:
        print("\nComparing AI-related articles:")
        for i, article1 in enumerate(ai_articles):
            for article2 in ai_articles[i+1:]:
                similarity = embedder.calculate_similarity(article1['embedding'], article2['embedding'])
                print(f"\nArticle 1: {article1['title']}")
                print(f"Article 2: {article2['title']}")
                print(f"Similarity Score: {similarity:.4f}")

    # Compare articles across categories
    print("\nCross-category comparisons:")
    categories = ['technology', 'business', 'science', 'health']
    for cat1 in categories:
        for cat2 in categories:
            if cat1 < cat2:  # Avoid duplicate comparisons
                article1 = next((a for a in articles if a['category'] == cat1), None)
                article2 = next((a for a in articles if a['category'] == cat2), None)
                
                if article1 and article2:
                    similarity = embedder.calculate_similarity(article1['embedding'], article2['embedding'])
                    print(f"\n{cat1.capitalize()} vs {cat2.capitalize()}:")
                    print(f"Article 1: {article1['title']}")
                    print(f"Article 2: {article2['title']}")
                    print(f"Similarity Score: {similarity:.4f}")

# Run specific comparisons
compare_specific_articles(all_articles)

Processing test articles...

Relevance Testing Results:

--------------------------------------------------
User Profile: ai_enthusiast
Interests: artificial intelligence, machine learning, GPT models

Article Relevance Ranking:

1. Tech Stocks Surge Following AI Announcements
Category: business
Relevance Score: 0.0226

2. Google's New AI Model Challenges GPT-5
Category: technology
Relevance Score: -0.0004

3. New Cancer Treatment Shows Promise
Category: health
Relevance Score: -0.0129

4. OpenAI Releases GPT-5 with Breakthrough Capabilities
Category: technology
Relevance Score: -0.0299

--------------------------------------------------
User Profile: tech_investor
Interests: tech stocks, AI companies, market trends

Article Relevance Ranking:

1. OpenAI Releases GPT-5 with Breakthrough Capabilities
Category: technology
Relevance Score: 0.0175

2. Google's New AI Model Challenges GPT-5
Category: technology
Relevance Score: 0.0052

3. New Cancer Treatment Shows Promise
Category: health


In [7]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
import shutil
import numpy as np
from typing import List, Dict
from tqdm import tqdm
import time

class NewsRAGSystem:
    def __init__(self):
        try:
            # Initialize in-memory ChromaDB client
            self.client = chromadb.Client()
            logger.info("Successfully initialized in-memory ChromaDB client")
            
            # Use a unique collection name with timestamp
            self.collection_name = f"news_articles_{int(time.time())}"
            
            # Create new collection
            self.collection = self.client.create_collection(
                name=self.collection_name,
                metadata={"description": "News articles collection"}
            )
            logger.info(f"Created new collection: {self.collection_name}")
            
            # Define category keywords
            self.category_keywords = {
                'technology': [
                    'artificial intelligence', 'software', 'tech', 'digital', 'computer', 
                    'startup', 'innovation', 'programming', 'developer', 'technology',
                    'AI', 'machine learning', 'data science', 'coding', 'app'
                ],
                'health': [
                    'medical', 'healthcare', 'wellness', 'health', 'disease', 'treatment', 
                    'medicine', 'clinical', 'patient', 'doctor', 'research', 'therapy',
                    'diagnosis', 'hospital', 'care'
                ],
                'business': [
                    'market', 'economic', 'business', 'finance', 'investment', 'strategy', 
                    'company', 'industry', 'trade', 'commercial', 'startup', 'entrepreneur',
                    'revenue', 'growth', 'analysis'
                ],
                'science': [
                    'research', 'scientific', 'discovery', 'experiment', 'study', 
                    'laboratory', 'scientist', 'physics', 'biology', 'chemistry',
                    'innovation', 'technology', 'breakthrough', 'development'
                ]
            }
            
        except Exception as e:
            logger.error(f"Error initializing ChromaDB client: {e}")
            raise

    def add_articles(self, articles: List[Dict]):
        try:
            # Prepare data for insertion
            ids = [str(i) for i in range(len(articles))]
            embeddings = [article['embedding'].tolist() for article in articles]
            metadatas = [{
                "title": article['title'],
                "category": article['category'],
                "date": article.get('date', ''),
                "link": article.get('link', '#'),
                "summary": article.get('summary', '')
            } for article in articles]
            documents = [article['content'] for article in articles]
            
            # Add to collection in batches
            batch_size = 100
            for i in range(0, len(articles), batch_size):
                batch_end = min(i + batch_size, len(articles))
                self.collection.add(
                    ids=ids[i:batch_end],
                    embeddings=embeddings[i:batch_end],
                    metadatas=metadatas[i:batch_end],
                    documents=documents[i:batch_end]
                )
            logger.info(f"Added {len(articles)} articles to the collection")
            
        except Exception as e:
            logger.error(f"Error adding articles to ChromaDB: {e}")
            raise

    def get_personalized_recommendations(self, 
                                       user_preferences: List[str], 
                                       num_results: int = 5) -> List[Dict]:
        try:
            # Expand user preferences with related keywords
            expanded_preferences = []
            for pref in user_preferences:
                expanded_preferences.append(pref)
                # Add related category keywords
                for category, keywords in self.category_keywords.items():
                    if any(keyword in pref.lower() for keyword in keywords):
                        expanded_preferences.extend(keywords)
            
            # Remove duplicates and join
            query = " ".join(list(set(expanded_preferences)))
            
            # Generate embedding for expanded preferences
            query_embedding = embedder.get_embedding(query)
            
            # Query the collection
            results = self.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=min(num_results * 2, self.collection.count()),
                include=["metadatas", "documents", "distances"]
            )
            
            # Format and filter results
            recommendations = []
            if results['distances'] and results['distances'][0]:
                distances = np.array(results['distances'][0])
                
                # Convert distances to similarity scores (0-100 scale)
                max_dist = np.max(distances)
                min_dist = np.min(distances)
                if max_dist == min_dist:
                    similarity_scores = np.ones_like(distances) * 100
                else:
                    similarity_scores = (1 - (distances - min_dist) / (max_dist - min_dist)) * 100
                
                # Filter and sort by relevance
                for i in range(len(results['ids'][0])):
                    metadata = results['metadatas'][0][i]
                    content = results['documents'][0][i]
                    
                    relevance_score = float(similarity_scores[i])
                    
                    # Check if content matches expanded preferences
                    content_lower = content.lower()
                    title_lower = metadata['title'].lower()
                    matches_preferences = any(
                        pref.lower() in content_lower or pref.lower() in title_lower 
                        for pref in expanded_preferences
                    )
                    
                    if matches_preferences:
                        recommendations.append({
                            "title": metadata['title'],
                            "category": metadata['category'],
                            "content": content,
                            "summary": metadata.get('summary', ''),
                            "link": metadata.get('link', '#'),
                            "relevance_score": relevance_score
                        })
            
            # Sort by relevance score and take top results
            recommendations.sort(key=lambda x: x['relevance_score'], reverse=True)
            return recommendations[:num_results]
            
        except Exception as e:
            logger.error(f"Error getting recommendations: {e}")
            return []

# Initialize RAG system
try:
    print("Initializing RAG system...")
    rag_system = NewsRAGSystem()
    
    # Add articles to the system
    print("Adding articles to RAG system...")
    rag_system.add_articles(all_articles)
    
    # Test recommendations
    user_profiles = {
        "tech_enthusiast": ["artificial intelligence", "software development", "tech startups"],
        "health_conscious": ["medical research", "healthcare innovation", "wellness"],
        "business_analyst": ["market trends", "economic policy", "business strategy"]
    }

    print("\nTesting personalized recommendations:")
    for profile, interests in user_profiles.items():
        print(f"\nRecommendations for {profile}:")
        recommendations = rag_system.get_personalized_recommendations(interests)
        for rec in recommendations:
            print(f"- {rec['title']} ({rec['category']}) - Relevance: {rec['relevance_score']:.2f}%")

except Exception as e:
    logger.error(f"Error in RAG system setup: {e}")
    print(f"Failed to initialize RAG system: {str(e)}")

Initializing RAG system...
Adding articles to RAG system...

Testing personalized recommendations:

Recommendations for tech_enthusiast:
- 'Wi-Fi Keeps Going Down': Donald Trump's Return-to-Office Mandate Is Going Terribly (technology) - Relevance: 100.00%
- 14 Best Bookshelf Speakers (2025): Active, Passive, and Hi-Fi (technology) - Relevance: 44.80%
- Scientists Claim to Have Brought Back the Dire Wolf (technology) - Relevance: 42.00%
- China vows ‘fight to the end’ after Trump threatens extra 50% tariff (business) - Relevance: 36.79%
- New WHO guidance calls for urgent transformation of mental health policies (health) - Relevance: 34.95%

Recommendations for health_conscious:
- 14 Best Bookshelf Speakers (2025): Active, Passive, and Hi-Fi (technology) - Relevance: 100.00%
- Beginner Guide to CJ Affiliate (Commission Junction) in 2022 (technology) - Relevance: 66.50%
- Strengthening public health across Lebanon with EIB Global (health) - Relevance: 60.01%
- WHO issues its first-ever 

In [8]:
# Cell 6: Enhanced Interactive Dashboard

from IPython.display import HTML, display
import matplotlib.pyplot as plt
from collections import Counter
import re

class NewsDashboard:
    def __init__(self, rag_system: NewsRAGSystem):
        self.rag_system = rag_system
        
    def create_article_card(self, article: Dict, index: int) -> str:
        # Ensure link is valid
        link = article.get('link', '#')
        if not link.startswith('http'):
            link = f"https://{link}" if link != '#' else '#'
            
        # Format relevance score as percentage
        relevance_percentage = article['relevance_score'] 
        
        return f"""
        <div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; border-radius: 5px; background-color: white;">
            <h3>{index}. {article['title']}</h3>
            <div style="margin: 10px 0;">
                <span style="background: #e1e1e1; padding: 5px 10px; border-radius: 15px; margin-right: 10px;">
                    <strong>Category:</strong> {article['category']}
                </span>
                <span style="background: #e1e1e1; padding: 5px 10px; border-radius: 15px;">
                    <strong>Relevance:</strong> {relevance_percentage:.1f}%
                </span>
            </div>
            <p style="color: #666;">{article.get('summary', '')}</p>
            <a href="{link}" target="_blank" style="color: #0366d6; text-decoration: none;">
                Read full article →
            </a>
        </div>
        """

    def create_trending_topics_section(self, articles: List[Dict]) -> str:
        # Extract words from titles and summaries
        all_text = ' '.join([
            a['title'] + ' ' + a.get('summary', '') 
            for a in articles
        ])
        
        # Clean and tokenize text
        words = re.findall(r'\w+', all_text.lower())
        
        # Remove common stop words
        stop_words = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
        words = [w for w in words if w not in stop_words and len(w) > 3]
        
        # Get trending topics
        topics = Counter(words).most_common(10)
        
        # Create trending topics HTML
        topics_html = """
        <div style="margin: 20px 0; padding: 15px; background-color: white; border-radius: 5px; border: 1px solid #ddd;">
            <h3 style="margin-bottom: 15px;">📈 Trending Topics</h3>
            <div style="display: flex; flex-wrap: wrap; gap: 10px;">
        """
        
        for word, count in topics:
            topics_html += f"""
            <span style="background: #f0f0f0; padding: 5px 15px; border-radius: 20px; 
            font-size: 14px; color: #444;">
                {word} ({count})
            </span>
            """
            
        topics_html += "</div></div>"
        return topics_html

    def display_personalized_feed(self, user_preferences: List[str]):
        # Get recommendations
        recommendations = self.rag_system.get_personalized_recommendations(user_preferences)
        
        # Create dashboard HTML
        dashboard_html = f"""
        <div style="max-width: 800px; margin: 0 auto; font-family: Arial, sans-serif;">
            <h2 style="color: #333;">Your Personalized News Feed</h2>
            <p style="color: #666;">Based on interests: {', '.join(user_preferences)}</p>
        """
        
        # Add trending topics
        dashboard_html += self.create_trending_topics_section(recommendations)
        
        # Add articles
        dashboard_html += "<div style='margin-top: 20px;'>"
        for i, article in enumerate(recommendations, 1):
            dashboard_html += self.create_article_card(article, i)
        dashboard_html += "</div></div>"
        
        display(HTML(dashboard_html))

# Initialize and display dashboard
print("Initializing News Dashboard...")
dashboard = NewsDashboard(rag_system)

# Display for different user profiles
for profile, interests in user_profiles.items():
    print(f"\nDashboard for {profile}")
    dashboard.display_personalized_feed(interests)

Initializing News Dashboard...

Dashboard for tech_enthusiast



Dashboard for health_conscious



Dashboard for business_analyst


In [9]:
# System Evaluation and Analytics

class NewsSystemEvaluator:
    def __init__(self, generation_model):
        self.model = generation_model
        self.metrics = {
            'summary_quality': [],
            'recommendation_relevance': [],
            'system_performance': {}
        }
        
    def evaluate_summary(self, article: Dict) -> Dict:
        try:
            prompt = f"""
            Evaluate this news summary on the following criteria (score 1-10):
            1. Accuracy: Does it capture the main points?
            2. Conciseness: Is it appropriately brief?
            3. Clarity: Is it easy to understand?

            Original Article (excerpt): {article['content'][:500]}...
            Summary: {article.get('summary', 'No summary available')}

            Format your response as:
            Accuracy Score: [1-10]
            Conciseness Score: [1-10]
            Clarity Score: [1-10]
            Overall Score: [average]
            Feedback: [brief feedback]
            """
            
            response = self.model.generate_content(prompt)
            
            return {
                'article_title': article['title'],
                'evaluation_text': response.text,
                'timestamp': datetime.now().isoformat(),
                'success': True
            }
        except Exception as e:
            logger.error(f"Error evaluating summary: {e}")
            return {
                'article_title': article.get('title', 'Unknown'),
                'evaluation_text': f"Evaluation failed: {str(e)}",
                'timestamp': datetime.now().isoformat(),
                'success': False
            }

    def evaluate_recommendations(self, user_preferences: List[str], 
                               recommendations: List[Dict]) -> Dict:
        try:
            relevance_scores = [rec.get('relevance_score', 0) for rec in recommendations]
            return {
                'average_relevance': np.mean(relevance_scores) if relevance_scores else 0,
                'preference_coverage': len(set(user_preferences)),
                'recommendation_count': len(recommendations),
                'success': True
            }
        except Exception as e:
            logger.error(f"Error evaluating recommendations: {e}")
            return {
                'error': str(e),
                'success': False
            }

    def generate_report(self):
        try:
            # Evaluate summaries
            print("Evaluating summaries...")
            for article in tqdm(all_articles[:5]):  # Evaluate a sample
                evaluation = self.evaluate_summary(article)
                self.metrics['summary_quality'].append(evaluation)

            # Evaluate recommendations
            print("\nEvaluating recommendations...")
            for profile, interests in user_profiles.items():
                recommendations = rag_system.get_personalized_recommendations(interests)
                self.metrics['recommendation_relevance'].append({
                    'profile': profile,
                    'metrics': self.evaluate_recommendations(interests, recommendations)
                })

            # Generate report
            report = """
            # News System Evaluation Report

            ## Summary Quality Analysis
            """
            
            # Add summary evaluations to report
            for i, eval_data in enumerate(self.metrics['summary_quality']):
                report += f"\n### Summary {i+1}: {eval_data['article_title']}\n"
                report += f"Evaluation Results:\n{eval_data['evaluation_text']}\n"
                report += f"Timestamp: {eval_data['timestamp']}\n"
                report += "-" * 50 + "\n"

            # Add recommendation performance to report
            report += "\n## Recommendation Performance Analysis\n"
            for rec_eval in self.metrics['recommendation_relevance']:
                report += f"\n### Profile: {rec_eval['profile']}\n"
                metrics = rec_eval['metrics']
                if metrics.get('success', False):
                    report += f"- Average Relevance: {metrics['average_relevance']:.2f}\n"
                    report += f"- Preference Coverage: {metrics['preference_coverage']}\n"
                    report += f"- Recommendations: {metrics['recommendation_count']}\n"
                else:
                    report += f"- Evaluation failed: {metrics.get('error', 'Unknown error')}\n"

            # Add system performance metrics
            self.metrics['system_performance'] = {
                'total_articles': len(all_articles),
                'evaluation_timestamp': datetime.now().isoformat(),
                'successful_summaries': sum(1 for e in self.metrics['summary_quality'] if e['success']),
                'successful_recommendations': sum(1 for e in self.metrics['recommendation_relevance'] 
                                               if e['metrics'].get('success', False))
            }

            report += "\n## System Performance Metrics\n"
            for key, value in self.metrics['system_performance'].items():
                report += f"- {key}: {value}\n"

            # Display the report
            display(Markdown(report))

            # Save detailed metrics
            self.save_metrics()

        except Exception as e:
            logger.error(f"Error generating report: {e}")
            display(Markdown(f"# Error Generating Report\nAn error occurred: {str(e)}"))

    def save_metrics(self):
        """Save metrics to file with proper error handling"""
        try:
            metrics_file = os.path.join(config['cache_dir'], 'evaluation_metrics.json')
            with open(metrics_file, 'w') as f:
                # Convert numpy values to native Python types
                metrics_dict = json.loads(json.dumps(self.metrics, default=str))
                json.dump(metrics_dict, f, indent=2)
            logger.info(f"Metrics saved to {metrics_file}")
        except Exception as e:
            logger.error(f"Error saving metrics: {e}")

# Run evaluation with proper error handling
try:
    print("Starting system evaluation...")
    evaluator = NewsSystemEvaluator(generation_model)
    evaluator.generate_report()
    print("Evaluation completed successfully!")
except Exception as e:
    logger.error(f"Fatal error in evaluation: {e}")
    print(f"Evaluation failed: {str(e)}")

Starting system evaluation...
Evaluating summaries...


100%|██████████| 5/5 [00:00<00:00,  5.82it/s]



Evaluating recommendations...



            # News System Evaluation Report

            ## Summary Quality Analysis
            
### Summary 1: Top 10 AI Tools That Will Transform Your Content Creation in 2025
Evaluation Results:
Evaluation failed: 404 Gemini 1.0 Pro Vision has been deprecated on July 12, 2024. Consider switching to different model, for example gemini-1.5-flash.
Timestamp: 2025-04-08T03:25:06.374000
--------------------------------------------------

### Summary 2: LimeWire AI Studio Review 2023: Details, Pricing & Features
Evaluation Results:
Evaluation failed: 404 Gemini 1.0 Pro Vision has been deprecated on July 12, 2024. Consider switching to different model, for example gemini-1.5-flash.
Timestamp: 2025-04-08T03:25:06.553858
--------------------------------------------------

### Summary 3: Top 10 AI Tools in 2023 That Will Make Your Life Easier
Evaluation Results:
Evaluation failed: 404 Gemini 1.0 Pro Vision has been deprecated on July 12, 2024. Consider switching to different model, for example gemini-1.5-flash.
Timestamp: 2025-04-08T03:25:06.714360
--------------------------------------------------

### Summary 4: Top 10 AI Content Generator & Writer Tools in 2022
Evaluation Results:
Evaluation failed: 404 Gemini 1.0 Pro Vision has been deprecated on July 12, 2024. Consider switching to different model, for example gemini-1.5-flash.
Timestamp: 2025-04-08T03:25:06.884288
--------------------------------------------------

### Summary 5: Beginner Guide to CJ Affiliate (Commission Junction) in 2022
Evaluation Results:
Evaluation failed: 404 Gemini 1.0 Pro Vision has been deprecated on July 12, 2024. Consider switching to different model, for example gemini-1.5-flash.
Timestamp: 2025-04-08T03:25:07.051171
--------------------------------------------------

## Recommendation Performance Analysis

### Profile: tech_enthusiast
- Average Relevance: 51.71
- Preference Coverage: 3
- Recommendations: 5

### Profile: health_conscious
- Average Relevance: 67.37
- Preference Coverage: 3
- Recommendations: 5

### Profile: business_analyst
- Average Relevance: 60.99
- Preference Coverage: 3
- Recommendations: 5

## System Performance Metrics
- total_articles: 25
- evaluation_timestamp: 2025-04-08T03:25:07.102227
- successful_summaries: 0
- successful_recommendations: 3


Evaluation completed successfully!
