In [1]:
import feedparser
from newspaper import Article


In [2]:
import json

In [3]:
# Logging records failures for later debugging without stopping notebook execution.
import logging
import concurrent.futures  # thread pool used further down to scrape articles in parallel
# Write INFO-and-above records to news_scraper.log, each prefixed with a timestamp and level.
logging.basicConfig(filename='news_scraper.log',level=logging.INFO, format= "%(asctime)s-%(levelname)s - %(message)s") 

In [4]:
# URL of the BBC News RSS feed that the following cells fetch and scrape.
rss_url = "http://feeds.bbci.co.uk/news/rss.xml"

In [5]:
def fetch_rss_feeds(rss_url):
    """Parse the RSS feed at ``rss_url`` and return feedparser's result.

    Problems are logged instead of raised so the notebook keeps running.

    Args:
        rss_url: Location of the feed, passed straight to ``feedparser.parse``.

    Returns:
        The object returned by ``feedparser.parse`` (its ``entries`` list may
        be empty), or ``None`` if an exception was raised while parsing.
    """
    try:
        feeds = feedparser.parse(rss_url)
        # feedparser usually signals malformed feeds and download failures via
        # the `bozo` flag rather than by raising, so check it explicitly;
        # otherwise those failures would never reach the log.
        if getattr(feeds, "bozo", False):
            problem = getattr(feeds, "bozo_exception", "unknown error")
            logging.warning(f"RSS feed reported a problem: {problem}")
        if not feeds.entries:
            logging.warning("No articles found in RSS feeds.")
        return feeds
    except Exception as e:
        logging.error(f"Error fetching rss feeds:{e}")
        return None

In [6]:
# Fetch the feed once; the cells below reuse `feeds`.
feeds = fetch_rss_feeds(rss_url)
if feeds is None:
    # fetch_rss_feeds returns None when parsing raised; report that here
    # instead of failing with an AttributeError on `feeds.entries`.
    print("Could not fetch the RSS feed; check the log for details.")
else:
    # List every entry's headline and link as a quick sanity check.
    for entry in feeds.entries:
        print(f"Title:{entry.title}")
        print(f"Link:{entry.link}")
        print()

Title:Civil Service told to slash running costs by 15%
Link:https://www.bbc.com/news/articles/cy5nzy403l0o

Title:Pope Francis to be discharged from hospital
Link:https://www.bbc.com/news/articles/crrdv84rg4do

Title:Trump envoy dismisses Starmer plan for Ukraine
Link:https://www.bbc.com/news/articles/c62zm4eqvp7o

Title:Grassroots anger tests Farage's grip on Reform UK
Link:https://www.bbc.com/news/articles/c8x4np7zkx9o

Title:Owners shocked as dogs seized for XL bully checks
Link:https://www.bbc.com/news/articles/cgj5wng9y9lo

Title:The man with a mind-reading chip in his brain - thanks to Elon Musk
Link:https://www.bbc.com/news/articles/cewk49j7j1po

Title:UK TV industry in crisis, says Wolf Hall director
Link:https://www.bbc.com/news/articles/c3w10816en3o

Title:A life spent waiting - and searching rows of unclaimed bodies
Link:https://www.bbc.com/news/articles/c15qyyzz89lo

Title:'My husband is a fighter pilot in Ukraine. Here's how I really feel about a ceasefire'
Link:https://ww

In [7]:
def scrape_article(url, max_chars=500):
    """Download and parse one article and return a small summary dict.

    Failures are logged and reported as ``None`` so that one bad link does not
    stop a batch run.

    Args:
        url: Address of the article to download.
        max_chars: Maximum number of characters of body text to keep.
            Defaults to 500; pass ``None`` to keep the full text.

    Returns:
        A dict with the keys ``title``, ``authors``, ``publish_date`` (the
        string form of the date, or ``"unknown"`` when none was found) and
        ``text``; ``None`` if downloading or parsing raised.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        published = article.publish_date
        return {
            "title": article.title,
            "authors": article.authors,
            "publish_date": str(published) if published else "unknown",
            # Slicing with None keeps the whole string, so max_chars=None
            # means "no truncation".
            "text": article.text[:max_chars],
        }
    except Exception as e:
        logging.error(f"Error scraping article {url}: {e}")
        return None


In [15]:
# Preview the first three feed entries by scraping each one and printing a short summary.
for entry in feeds.entries[:3]:
    article = scrape_article(entry.link)
    if not article:
        # scrape_article returns None on failure; record the link and move on.
        logging.error(f"failed to scrape article: {entry.link}")
        continue

    authors = article.get('authors') or []
    print("="*50)
    print(f"Title :{article.get('title','N/A')}")
    print(f"Authors : {','.join(authors) if authors else 'Unknown'}")
    print(f"Publish Date: {article.get('publish_date','unknown')}")
    # Only the first 200 characters are shown to keep the cell output short.
    print(f"Text: {article.get('text', 'No content available')[:200]}")
    print("="*50)
    print()

Title :Civil Service told by government to slash running costs by 15%
Authors : Unknown
Publish Date: unknown
Text: Civil Service told to slash running costs by 15%

The changes are part of the government's ongoing spending review, the BBC understands, with Chancellor Rachel Reeves set to deliver her Spring Stateme

Title :Pope Francis to be discharged from hospital on Sunday
Authors : Unknown
Publish Date: unknown
Text: Pope Francis to be discharged from hospital

Pope Francis (file image) has been battling double pneumonia for more than five weeks

Pope Francis was never intubated and always remained alert and orien

Title :Trump envoy Steve Witkoff dismisses Starmer plan for Ukraine
Authors : Unknown
Publish Date: unknown
Text: Trump envoy dismisses Starmer plan for Ukraine

8 hours ago Share Save James Landale • @BBCJLandale Diplomatic correspondent Reporting from Kyiv Share Save

Reuters Sir Keir met a group of military le



In [19]:
# Scrape every feed entry in parallel; the downloads are network-bound, so a
# small thread pool overlaps the waiting time.
articles_data = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # map() yields results in the order the links were submitted, so
    # articles_data follows the feed order on every run (as_completed would
    # order them by whichever download happened to finish first).
    links = [entry.link for entry in feeds.entries]
    for result in executor.map(scrape_article, links):
        # scrape_article returns None for articles it could not scrape; skip those.
        if result:
            articles_data.append(result)
    

In [23]:
def save_to_file(data, filename):
    """Write ``data`` to ``filename`` as indented JSON.

    Errors are logged rather than raised, so callers that need to know the
    outcome should check the return value.

    Args:
        data: Any JSON-serialisable object.
        filename: Path of the file to create or overwrite.

    Returns:
        ``True`` if the file was written, ``False`` if serialising or writing
        failed (the error is logged).
    """
    try:
        # Serialise before opening the file: if `data` is not
        # JSON-serialisable we fail here and an existing file is not
        # truncated or left half-written.
        payload = json.dumps(data, indent=4)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(payload)
    except Exception as e:
        logging.error(f"Error saving data to file: {e}")
        return False
    logging.info(f"Data saved successfully to {filename}")
    return True

In [27]:
# Keep the path in one variable so the confirmation names the file that is actually written.
output_file = 'articles.json'
save_to_file(articles_data, output_file)
# save_to_file logs errors instead of raising; if the file is missing, check the log.
print(f"Data saved to {output_file}")

Data saved to article.json
