In [2]:
import feedparser as fp
import dateutil.parser
import newspaper
from newspaper import Article
import logging
import pandas as pd
import json
from datetime import datetime

# Configure the root logger once for the whole script: records at INFO level
# and above are emitted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class Helper:
    """Namespace for small progress-reporting utilities."""

    @staticmethod
    def print_scrape_status(count):
        """Log, at INFO level, how many articles have been scraped so far.

        Args:
            count: Number of articles collected up to this point.
        """
        status_message = f'Scraped {count} articles so far...'
        logging.info(status_message)

class Scraper:
    """Collect the articles published on one day from a set of RSS feeds.

    Attributes:
        sources: Mapping of source name to a configuration dict. Each
            configuration must contain an ``'rss'`` key holding either a
            single feed URL or an iterable of feed URLs.
        news_date: The day to keep; ``str(news_date)`` is compared against
            each entry's publication date formatted as ``YYYY-MM-DD``.
    """

    def __init__(self, sources, news_date):
        self.sources = sources
        self.news_date = news_date

    def scrape(self):
        """Scrape every configured feed and return the matching articles.

        Entries without a usable ``published`` value, and articles that fail
        to download or parse, are logged and skipped. When at least one
        article is collected, the results are also written to
        ``scraped_articles.json`` and ``scraped_articles.csv`` in the current
        working directory.

        Returns:
            A list of dicts with the keys ``source``, ``url``, ``date``,
            ``time``, ``title``, ``body``, ``summary``, ``keywords`` and
            ``image_url``. The list is empty when nothing matched.

        Raises:
            Exception: Any unexpected failure (bad source configuration,
                feed parsing error, file-system error, ...), chained to the
                original exception.
        """
        try:
            # Loop-invariant: normalise the target day once.
            target_date = str(self.news_date)
            articles_list = []

            for source, source_config in self.sources.items():
                for url in self._feed_urls(source_config):
                    logging.info(f'Processing RSS feed: {url}')
                    for entry in fp.parse(url).entries:
                        article_date = self._parse_published(entry)
                        if article_date is None:
                            continue
                        logging.info(f'Found article with date: {article_date}')
                        if article_date.strftime('%Y-%m-%d') != target_date:
                            continue

                        article = self._fetch_article(source, entry, article_date)
                        if article is None:
                            continue
                        articles_list.append(article)
                        Helper.print_scrape_status(len(articles_list))

            if articles_list:
                self._report_and_save(articles_list)
            else:
                logging.warning('No articles were scraped. Check the sources and date provided.')

            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            # Explicit chaining keeps the root cause attached to the wrapper.
            raise Exception(f'Error in "Scraper.scrape()": {e}') from e

    @staticmethod
    def _feed_urls(source_config):
        """Return the feed URLs of one source configuration as a list."""
        feeds = source_config['rss']
        # A bare string would otherwise be iterated one character at a time.
        if isinstance(feeds, str):
            return [feeds]
        return list(feeds)

    @staticmethod
    def _parse_published(entry):
        """Return the entry's publication datetime, or None if unusable."""
        published = getattr(entry, 'published', None)
        if published is None:
            return None
        try:
            return dateutil.parser.parse(published)
        except (ValueError, OverflowError) as e:
            # One malformed date must not abort the whole scrape.
            logging.warning(f'Skipping entry with unparseable date {published!r}: {e}')
            return None

    @staticmethod
    def _fetch_article(source, entry, article_date):
        """Download and analyse one article; return its record or None."""
        try:
            logging.info(f'Processing article: {entry.link}')
            page = Article(entry.link)
            page.download()
            page.parse()
            page.nlp()
            return {
                'source': source,
                'url': entry.link,
                'date': article_date.strftime('%Y-%m-%d'),
                # Time of day as given by the feed (no conversion); %Z adds
                # the timezone name when the parsed datetime provides one.
                'time': article_date.strftime('%H:%M:%S %Z'),
                'title': page.title,
                'body': page.text,
                'summary': page.summary,
                'keywords': page.keywords,
                'image_url': page.top_image,
            }
        except Exception as e:
            # Deliberate best effort: a single broken article is logged and
            # skipped so the remaining entries are still processed.
            logging.error(f'Error downloading/parsing article: {e}')
            logging.info('Continuing...')
            return None

    @staticmethod
    def _report_and_save(articles_list):
        """Log a short sample of the results and persist them as JSON and CSV."""
        logging.info(f'Total articles scraped: {len(articles_list)}')
        # Log the first 3 articles as a sample.
        for i, article in enumerate(articles_list[:3]):
            logging.info(f'Sample article {i+1}: {json.dumps(article, indent=2)}')

        # Save articles to a JSON file for further inspection.
        with open('scraped_articles.json', 'w', encoding='utf-8') as f:
            json.dump(articles_list, f, indent=2)
        logging.info('Scraped articles saved to scraped_articles.json')

        # Convert the articles list to a DataFrame and save it as CSV.
        df = pd.DataFrame(articles_list)
        df.to_csv('scraped_articles.csv', index=False)
        logging.info('Scraped articles saved to scraped_articles.csv')

# Load sources from the JSON file
def load_sources(json_file):
    """Load the scraper's source configuration from a JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        Exception: Whatever ``open`` or ``json.load`` raised (for example
            ``FileNotFoundError`` or ``json.JSONDecodeError``); the error is
            logged and then re-raised unchanged.
    """
    try:
        # Read as UTF-8 explicitly so non-ASCII source names or URLs decode
        # the same way regardless of the platform's locale encoding.
        with open(json_file, 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception as e:
        logging.error(f'Error loading sources from {json_file}: {e}')
        raise

# Example usage: scrape every source listed in sources.json for one fixed day.
if __name__ == '__main__':
    # Input file with the feed configuration, and the day to collect.
    sources_file, news_date = 'sources.json', '2024-06-11'

    # Load the configuration, then run the scraper over it.
    sources = load_sources(sources_file)
    scraper = Scraper(sources, news_date)
    articles = scraper.scrape()


2024-06-24 14:59:59,631 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss


2024-06-24 15:00:00,047 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-24 15:00:00,048 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-24 15:00:00,049 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-24 15:00:00,050 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-24 15:00:00,050 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-24 15:00:00,051 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-24 15:00:00,052 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-24 15:00:00,052 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-24 15:00:00,053 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-24 15:00:00,054 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-24 15:00:00,054 - INFO - Found article with date: 2024-06-13 18:00:32+00:00
2024-06-24 15:00:00,055 - INFO - Found article with date: 2024-06-13 04:00:1