In [3]:
import feedparser as fp
import dateutil.parser
from newspaper import Article, Config
import logging
import pandas as pd
import json
from datetime import datetime, timedelta, timezone
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import os
import random
from textblob import TextBlob

# Set up logging configuration: records at INFO level and above are emitted,
# each formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Helper:
    """Stateless helpers for progress logging and article preparation."""

    @staticmethod
    def print_scrape_status(count):
        """Log how many articles have been collected so far."""
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        """Return only the rows of *news_df* that carry enough content.

        A row is kept when its ``title``, ``body`` and ``image_url`` are not
        the empty string, its title contains at least 3 whitespace runs and
        its body contains at least 20.
        """
        logging.info("Cleaning dataframe")
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count(r'\s+').ge(3)]
        news_df = news_df[news_df.body.str.count(r'\s+').ge(20)]
        return news_df

    @staticmethod
    def clean_articles(news_df):
        """De-duplicate articles and add a normalised ``clean_body`` column.

        Rows are de-duplicated on (title, source), then on body, then on url,
        and the index is reset. ``clean_body`` is derived from ``body`` by
        lower-casing, removing English stop words, punctuation and digits,
        removing the (lower-cased) source names, transliterating with
        ``unidecode``, tokenising and stemming.

        Returns:
            The de-duplicated dataframe with the extra ``clean_body`` column.
        """
        # Local import: ``re`` is only needed here, to escape source names.
        import re

        logging.info("Cleaning article bodies")
        for subset in (["title", "source"], ["body"], ["url"]):
            news_df = news_df.drop_duplicates(subset=subset).sort_index()
        news_df = news_df.reset_index(drop=True)

        stop_words = set(stopwords.words('english'))
        # Build the translation table once instead of once per article.
        punctuation_table = str.maketrans('', '', string.punctuation)

        def normalise(text):
            kept = ' '.join(word for word in text.split() if word not in stop_words)
            kept = kept.translate(punctuation_table)
            return ''.join(char for char in kept if not char.isdigit())

        news_df['clean_body'] = news_df['body'].str.lower().apply(normalise)

        # Source names are literal text, not patterns: escape them so a name
        # containing regex metacharacters (e.g. "C++ Weekly") cannot raise
        # re.error or match unrelated text.
        source_patterns = {
            re.escape(name.lower()): "" for name in set(news_df['source'])
        }
        if source_patterns:
            news_df['clean_body'] = news_df['clean_body'].replace(source_patterns, regex=True)
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)

        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(
            lambda text: ' '.join(stemmer.stem(token) for token in word_tokenize(text))
        )

        return news_df

    @staticmethod
    def shuffle_content(clusters_dict):
        """Shuffle the article list of every cluster in place (returns None)."""
        logging.info("Shuffling content within clusters")
        for cluster in clusters_dict.values():
            random.shuffle(cluster)

    @staticmethod
    def prettify_similar(clusters_dict):
        """Return one comma-separated string of article titles per cluster."""
        logging.info("Creating list of similar articles for display")
        return [
            ', '.join(article['title'] for article in cluster)
            for cluster in clusters_dict.values()
        ]

def sentiment_analysis(articles):
    """Score each article body with TextBlob and label the result.

    Args:
        articles: records convertible by ``pd.DataFrame`` that provide at
            least the ``url`` and ``body`` fields.

    Returns:
        A dataframe with the columns ``url``, ``sentiment`` (TextBlob
        polarity) and ``sentiment_category`` ('positive', 'neutral' or 'bad').
    """
    logging.info("Performing sentiment analysis")

    def polarity_of(text):
        return TextBlob(text).sentiment.polarity

    def label_for(score):
        # Guard-clause form: anything that is neither > 0 nor == 0 is 'bad'.
        if score > 0:
            return 'positive'
        if score == 0:
            return 'neutral'
        return 'bad'

    scored = pd.DataFrame(articles)
    scored['sentiment'] = scored['body'].apply(polarity_of)
    scored['sentiment_category'] = scored['sentiment'].apply(label_for)

    wanted_columns = ['url', 'sentiment', 'sentiment_category']
    return scored[wanted_columns]

def compute_tfidf(news_df):
    """Vectorise ``news_df['clean_body']`` with TF-IDF.

    Returns:
        The fitted TF-IDF weights as a dense NumPy array (one row per
        document in ``news_df``).
    """
    logging.info("Computing TF-IDF values")
    vectorizer = TfidfVectorizer()
    sparse_weights = vectorizer.fit_transform(news_df['clean_body'])
    # Densify: the downstream clustering step is handed a plain ndarray.
    return sparse_weights.toarray()

def find_featured_clusters(clusters):
    """Select the clusters whose articles come from more than one source.

    Args:
        clusters: mapping of cluster key -> list of article dicts, each with
            a ``source`` entry.

    Returns:
        A new dict holding only the clusters with at least two distinct
        sources; the article lists are the same objects as in *clusters*.
    """
    logging.info("Finding clusters with articles from multiple sources")
    return {
        cluster_key: members
        for cluster_key, members in clusters.items()
        if len({member["source"] for member in members}) > 1
    }

# Custom configuration for the newspaper library; this module-level object is
# passed as `config=` to every Article the Scraper creates.
config = Config()
config.fetch_images = False  # NOTE(review): presumably skips newspaper's image download step — confirm against the library docs
config.memoize_articles = False  # NOTE(review): presumably disables newspaper's own memoization of seen articles — confirm
config.request_timeout = 10  # NOTE(review): presumably the per-request timeout in seconds — confirm

class CacheManager:
    """JSON-file-backed cache of article data, keyed by article URL.

    The whole cache is held in ``self.cache`` (a dict) and is written back to
    ``self.cache_file`` every time an article is added.
    """

    def __init__(self, cache_file='article_cache.json'):
        """Remember *cache_file* and load its current contents, if any."""
        self.cache_file = cache_file
        self.load_cache()

    def load_cache(self):
        """Populate ``self.cache`` from the cache file.

        A missing file yields an empty cache. A file that cannot be decoded
        as a JSON object is logged and also yields an empty cache, so a
        damaged cache never prevents the program from starting.
        """
        logging.info("Loading cache")
        self.cache = {}
        if not os.path.exists(self.cache_file):
            return

        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            logging.warning(f'Ignoring unreadable cache file {self.cache_file}: {e}')
            return

        if isinstance(loaded, dict):
            self.cache = loaded
        else:
            logging.warning(f'Ignoring cache file {self.cache_file}: top-level JSON value is not an object')

    def save_cache(self):
        """Write ``self.cache`` to the cache file.

        The data is first written to a temporary sibling file and then moved
        over the real file, so a failure part-way through serialisation
        leaves the previous cache file intact instead of truncated.

        Raises:
            TypeError/ValueError: if the cache holds data ``json`` cannot encode.
            OSError: if the file cannot be written.
        """
        logging.info("Saving cache")
        tmp_file = f'{self.cache_file}.tmp'
        try:
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(self.cache, f, indent=4)
            os.replace(tmp_file, self.cache_file)
        except BaseException:
            # Do not leave a half-written temporary file behind.
            try:
                os.remove(tmp_file)
            except OSError:
                pass
            raise

    def get_article(self, url):
        """Return the cached data for *url*, or None when it is not cached."""
        return self.cache.get(url)

    def add_article(self, url, article_data):
        """Store *article_data* under *url* and persist the cache to disk."""
        logging.info(f'Adding article to cache: {url}')
        self.cache[url] = article_data
        self.save_cache()

class Scraper:
    """Collect recent articles from the RSS feeds of the configured sources.

    Attributes:
        sources: mapping of source name -> dict whose ``'rss'`` entry is an
            iterable of feed URLs.
        days: maximum article age, in days, for an entry to be collected.
        cache_manager: object providing ``get_article(url)`` and
            ``add_article(url, data)``.
    """

    def __init__(self, sources, days, cache_manager):
        self.sources = sources
        self.days = days
        self.cache_manager = cache_manager

    def scrape(self):
        """Walk every feed and return the list of collected article dicts.

        Entries without a usable publication date or link, entries older
        than ``self.days`` days, and articles that fail to download or parse
        are logged and skipped. Cached articles are reused as-is; freshly
        scraped ones are added to the cache.

        Raises:
            Exception: wrapping any unexpected error (the original error is
                chained as ``__cause__``).
        """
        try:
            articles_list = []
            now = datetime.now(timezone.utc)
            max_age = timedelta(days=self.days)
            for source, content in self.sources.items():
                logging.info(f'Source: {source}')
                logging.info(f'Content: {content}')
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    try:
                        feed = fp.parse(url)
                    except Exception as e:
                        logging.error(f'Error parsing RSS feed {url}: {e}')
                        continue

                    for entry in feed.entries:
                        article_date = self._parse_entry_date(entry)
                        if article_date is None or now - article_date > max_age:
                            continue

                        # An entry without a link cannot be fetched or cached;
                        # skip it instead of letting the AttributeError abort
                        # the whole scrape.
                        link = getattr(entry, 'link', None)
                        if not link:
                            logging.warning('Entry missing "link" attribute; skipping')
                            continue

                        cached_article = self.cache_manager.get_article(link)
                        if cached_article:
                            logging.info(f'Using cached article: {link}')
                            articles_list.append(cached_article)
                            Helper.print_scrape_status(len(articles_list))
                            continue

                        article = self._download_article(source, link, article_date)
                        if article is None:
                            continue

                        articles_list.append(article)
                        Helper.print_scrape_status(len(articles_list))
                        try:
                            self.cache_manager.add_article(link, article)
                        except Exception as e:
                            # A cache failure must not lose the scraped article.
                            logging.error(f'Error processing article: {e}')
                            logging.info('Continuing...')
            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}') from e

    @staticmethod
    def _parse_entry_date(entry):
        """Return the entry's ``published`` date converted to UTC, or None if unusable."""
        if not hasattr(entry, 'published'):
            logging.warning(f'Entry missing "published" attribute: {entry}')
            return None

        try:
            article_date = dateutil.parser.parse(entry.published)
            # NOTE: astimezone() treats a naive datetime as local time.
            article_date = article_date.astimezone(timezone.utc)
        except Exception as e:
            logging.error(f'Error parsing article date: {e}')
            return None

        logging.info(f'Found article with date: {article_date}')
        return article_date

    @staticmethod
    def _download_article(source, link, article_date):
        """Download, parse and score the article at *link*; return its dict or None on failure."""
        try:
            logging.info(f'Processing article: {link}')
            parsed = Article(link, config=config)
            parsed.download()
            parsed.parse()
            parsed.nlp()
        except Exception as e:
            logging.error(f'Error downloading/parsing article: {e}')
            logging.info('Continuing...')
            return None

        try:
            sentiment = TextBlob(parsed.text).sentiment.polarity
            if sentiment > 0:
                sentiment_category = 'positive'
            elif sentiment == 0:
                sentiment_category = 'neutral'
            else:
                sentiment_category = 'bad'

            return {
                'source': source,
                'url': link,
                'date': article_date.strftime('%Y-%m-%d'),
                'time': article_date.strftime('%H:%M:%S %Z'),
                'title': parsed.title,
                'body': parsed.text,
                'summary': parsed.summary,
                'keywords': parsed.keywords,
                'image_url': parsed.top_image,
                'sentiment': sentiment,
                'sentiment_category': sentiment_category,
            }
        except Exception as e:
            logging.error(f'Error processing article: {e}')
            logging.info('Continuing...')
            return None

if __name__ == '__main__':
    # Pipeline: scrape -> clean -> sentiment -> TF-IDF -> cluster -> re-cache.
    logging.info("Starting main script")
    with open('sources.json', 'r') as file:
        sources = json.load(file)

    days_to_scrape = 7

    cache_manager = CacheManager()

    scraper = Scraper(sources, days_to_scrape, cache_manager)
    try:
        articles = scraper.scrape()

        if not articles:
            logging.warning('No articles were scraped.')
        else:
            logging.info(f'{len(articles)} articles scraped.')
            news_df = pd.DataFrame(articles)
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)

            if len(news_df) < 2:
                # TF-IDF needs a non-empty vocabulary and clustering needs at
                # least two samples; report this instead of failing later.
                logging.warning(f'Only {len(news_df)} usable article(s) after cleaning; skipping clustering.')
            else:
                # Score the cleaned, de-duplicated rows rather than the raw
                # scrape: the raw list may hold the same URL several times
                # (one article in several feeds), and merging on 'url' would
                # then re-introduce the duplicates removed during cleaning.
                sentiment_df = sentiment_analysis(news_df.to_dict(orient='records'))

                news_df = news_df.drop(columns=['sentiment', 'sentiment_category'], errors='ignore')

                news_df = pd.merge(news_df, sentiment_df[['url', 'sentiment', 'sentiment_category']], on='url')

                tfidf_df = compute_tfidf(news_df)

                distance_threshold = 1
                ac = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None)
                # fit_predict() fits the model itself; a separate fit() call
                # beforehand would only repeat the same work.
                articles_labeled = ac.fit_predict(tfidf_df)

                news_df['cluster_id'] = articles_labeled

                clusters = {
                    str(label): news_df[news_df['cluster_id'] == label].to_dict(orient='records')
                    for label in np.unique(articles_labeled)
                }

                featured_clusters = find_featured_clusters(clusters)
                logging.info(f'Featured Clusters: {json.dumps(featured_clusters, indent=4)}')

                # Store the enriched records (clean body, cluster, sentiment,
                # keyword flag) back in the cache under their URL.
                for article in news_df.to_dict(orient='records'):
                    article['contains_keyword'] = any(keyword in article['body'] for keyword in article['keywords'])
                    cache_manager.add_article(article['url'], article)

    except Exception as e:
        # Top-level boundary: log with traceback so the failure can be diagnosed.
        logging.exception(f'An error occurred: {e}')


2024-07-03 07:31:46,094 - INFO - Starting main script
2024-07-03 07:31:46,096 - INFO - Loading cache
2024-07-03 07:31:46,154 - INFO - Source: CNN
2024-07-03 07:31:46,155 - INFO - Content: {'rss': ['http://rss.cnn.com/rss/cnn_latest.rss', 'http://rss.cnn.com/rss/money_latest.rss', 'http://rss.cnn.com/rss/edition_world.rss', 'http://rss.cnn.com/rss/edition.xml'], 'link': ['https://edition.cnn.com/']}
2024-07-03 07:31:46,156 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-07-03 07:31:46,619 - INFO - Found article with date: 2024-06-28 12:57:05+00:00
2024-07-03 07:31:46,620 - INFO - Using cached article: https://www.cnn.com/style/article/kate-moss-glastonbury-fashion-remember-when/index.html
2024-07-03 07:31:46,621 - INFO - Scraped 1 articles so far...
2024-07-03 07:31:46,623 - INFO - Found article with date: 2024-06-28 00:45:41+00:00
2024-07-03 07:31:46,624 - INFO - Using cached article: https://www.cnn.com/2023/08/08/us/uvalde-shooter-cousin-arrested-threats-nath