In [9]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/114.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/"
}

TIMEOUT = 20  # seconds

# The rest of the script stays the same...


In [21]:
!pip install feedparser pandas requests




In [13]:
rss_feeds = [
    {"country": "USA", "source": "CNN", "url": "http://rss.cnn.com/rss/edition.rss"},
    {"country": "UK", "source": "BBC News", "url": "http://feeds.bbci.co.uk/news/rss.xml"},
    {"country": "Japan", "source": "NHK", "url": "https://www3.nhk.or.jp/rss/news/cat0.xml"},
    {"country": "Middle East", "source": "Al Jazeera", "url": "https://www.aljazeera.com/xml/rss/all.xml"},
    {"country": "India", "source": "Times of India", "url": "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"},
    {"country": "Germany", "source": "DW News", "url": "https://rss.dw.com/rdf/rss-en-all"},
    {"country": "Australia", "source": "ABC News", "url": "https://www.abc.net.au/news/feed/51120/rss.xml"},
    {"country": "China", "source": "Xinhua", "url": "http://www.xinhuanet.com/english/rss/worldrss.xml"},
    {"country": "Russia", "source": "RT News", "url": "https://www.rt.com/rss/news/"},
    {"country": "Italy", "source": "ANSA", "url": "https://www.ansa.it/sito/ansait_rss.xml"},
    {"country": "New Zealand", "source": "RNZ", "url": "https://www.rnz.co.nz/rss"},
    {"country": "Malaysia", "source": "The Star", "url": "https://www.thestar.com.my/rss/news_nation"},
    {"country": "South Africa", "source": "News24", "url": "https://feeds.news24.com/articles/news24/TopStories/rss"},
    {"country": "South Korea", "source": "Yonhap News", "url": "https://en.yna.co.kr/RSS/news.xml"},
    
]

In [14]:
import feedparser
import pandas as pd
import requests
from datetime import datetime
import time

all_articles = []

for feed in rss_feeds:
    try:
        response = requests.get(feed['url'], timeout=10)
        response.raise_for_status()
        parsed_feed = feedparser.parse(response.text)

        for entry in parsed_feed.entries:
            article = {
                "title": entry.get("title", "N/A"),
                "published": entry.get("published", "N/A"),
                "source": feed["source"],
                "country": feed["country"],
                "summary": entry.get("summary", "N/A"),
                "url": entry.get("link", "N/A")
            }
            all_articles.append(article)

        print(f"✅ {feed['source']} ({feed['country']}): {len(parsed_feed.entries)} articles")
        time.sleep(1)  # avoid hitting rate limits

    except Exception as e:
        print(f"❌ {feed['source']} ({feed['country']}): {e}")


✅ CNN (USA): 50 articles
✅ BBC News (UK): 37 articles
✅ NHK (Japan): 7 articles
✅ Al Jazeera (Middle East): 25 articles
✅ Times of India (India): 46 articles
✅ DW News (Germany): 147 articles
✅ ABC News (Australia): 25 articles
✅ Xinhua (China): 20 articles
✅ RT News (Russia): 100 articles
✅ ANSA (Italy): 28 articles
✅ RNZ (New Zealand): 0 articles
✅ The Star (Malaysia): 0 articles
✅ News24 (South Africa): 12 articles
✅ Yonhap News (South Korea): 90 articles


In [15]:
df = pd.DataFrame(all_articles)

In [16]:

df.drop_duplicates(subset=['title', 'url'], inplace=True)

In [17]:
df['published'] = pd.to_datetime(df['published'], errors='coerce')

In [18]:
filename = f"news_output_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df.to_csv(filename, index=False, encoding='utf-8')
print(f"📝 Saved {len(df)} articles to {filename}")

📝 Saved 587 articles to news_output_20250521_001550.csv


In [21]:
summary = df.groupby(['country', 'source']).size().reset_index(name='articles_downloaded')

In [22]:
combined_summary = summary.groupby('country').agg({
    'source': lambda x: ', '.join(sorted(x.unique())),
    'articles_downloaded': 'sum'
}).reset_index()

In [23]:
combined_summary['historical_range'] = 'Since 2021'
combined_summary.columns = ['Country', 'News Agencies', 'Total Articles Downloaded', 'Total Historical Data']


In [26]:
print("\n🗞️ Summary Information:\n")
display(combined_summary)


🗞️ Summary Information:



Unnamed: 0,Country,News Agencies,Total Articles Downloaded,Total Historical Data
0,Australia,ABC News,25,Since 2021
1,China,Xinhua,20,Since 2021
2,Germany,DW News,147,Since 2021
3,India,Times of India,46,Since 2021
4,Italy,ANSA,28,Since 2021
5,Japan,NHK,7,Since 2021
6,Middle East,Al Jazeera,25,Since 2021
7,Russia,RT News,100,Since 2021
8,South Africa,News24,12,Since 2021
9,South Korea,Yonhap News,90,Since 2021
