In [5]:
import requests
import feedparser
import pandas as pd
import time
import os

# Set up your API key (Get it from https://newsdata.io/)
NEWSDATA_API_KEY = "pub_75501231b3fc736d4fb28d6002ef36e77f526"

# File to store news
NEWS_CSV = "news_data.csv"

# Load existing news URLs to avoid duplicates
seen_urls = set()

if os.path.exists(NEWS_CSV):
    try:
        existing_news = pd.read_csv(NEWS_CSV)
        seen_urls = set(existing_news["url"].dropna().tolist())
    except Exception as e:
        print(f"⚠️ Error loading existing news data: {e}")

# Function to fetch news from NewsData.io
def fetch_newsdata_news(query="breaking news", country="us", language="en"):
    url = f"https://newsdata.io/api/1/news?apikey={NEWSDATA_API_KEY}&q={query}&country={country}&language={language}"
    response = requests.get(url)
    data = response.json()
    
    articles = []
    if data.get("results"):
        for article in data["results"]:
            url = article.get("link", "#")
            if url not in seen_urls:  # Only add new news
                seen_urls.add(url)
                articles.append({
                    "source": article.get("source_id", "Unknown"),
                    "title": article.get("title", "No title"),
                    "description": article.get("description", "No description"),
                    "url": url,
                    "published_at": article.get("pubDate", "Unknown")
                })
    return articles

# Function to fetch news from RSS Feeds
def fetch_rss_news(feed_url="https://rss.cnn.com/rss/edition.rss"):
    feed = feedparser.parse(feed_url)
    articles = []
    for entry in feed.entries:
        url = entry.link
        if url not in seen_urls:  # Only add new news
            seen_urls.add(url)
            articles.append({
                "source": "CNN RSS",
                "title": entry.title,
                "description": entry.summary,
                "url": url,
                "published_at": entry.published
            })
    return articles

# Combine news from both sources
def fetch_all_news():
    news_data = []
    
    # Fetch from NewsData.io
    news_data.extend(fetch_newsdata_news())

    # Fetch from RSS feed
    news_data.extend(fetch_rss_news())

    return news_data

# Save new news to CSV
def save_news_to_csv(news_data, filename=NEWS_CSV):
    if not news_data:
        print("✅ No new news updates.")
        return
    
    df = pd.DataFrame(news_data)
    
    # Append new data without duplicates
    df.to_csv(filename, mode="a", index=False, header=not os.path.exists(filename))
    print(f"✅ {len(news_data)} new articles added to {filename}")

# Run the script in a loop (Auto-refreshing)
if __name__ == "__main__":
    print("📢 Real-time news updater started...")
    
    while True:
        news = fetch_all_news()
        save_news_to_csv(news)

        print("🔄 Waiting for the next update... (10 sec)")
        time.sleep(10)  # Wait for 10 seconds before fetching again


📢 Real-time news updater started...
✅ 10 new articles added to news_data.csv
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for the next update... (10 sec)
✅ No new news updates.
🔄 Waiting for

KeyboardInterrupt: 