In [11]:
import feedparser as fp
import dateutil.parser
from newspaper import Article
import logging
import pandas as pd
import json
from datetime import datetime
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import flask
import webbrowser
import random


In [12]:

# Configure the root logger once for the whole notebook: show INFO and above,
# with each record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
    
class Helper:
    """Stateless helpers for progress logging, article cleaning and cluster reshaping."""

    @staticmethod
    def print_scrape_status(count):
        """Log how many articles have been scraped so far."""
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        """Return only the rows of ``news_df`` that are usable for clustering.

        A row is kept when its ``title``, ``body`` and ``image_url`` are all
        non-empty, its title contains at least 3 whitespace runs and its body
        contains at least 20 whitespace runs.

        An empty frame (for example one built from an empty article list,
        which has no columns at all) is returned unchanged.
        """
        if news_df.empty:
            return news_df

        keep = (
            (news_df.title != '')
            & (news_df.body != '')
            & (news_df.image_url != '')
            # Raw strings: '\s' in a plain literal is an invalid escape sequence.
            & news_df.title.str.count(r'\s+').ge(3)
            & news_df.body.str.count(r'\s+').ge(20)
        )
        return news_df[keep]

    @staticmethod
    def _log_clean_body_preview(news_df, header):
        """Log ``header`` followed by the first (up to) 10 ``clean_body`` values."""
        logging.info(header)
        for position, body in enumerate(news_df['clean_body'].head(10), start=1):
            logging.info(f'Article {position}: {body}')

    @staticmethod
    def clean_articles(news_df):
        """De-duplicate articles and add a normalised ``clean_body`` column.

        Duplicates are dropped by (title, source), then by body, then by url,
        and the index is reset. ``clean_body`` is derived from ``body`` by:
        lower-casing, removing English stop words, stripping punctuation and
        digits, deleting occurrences of the (lower-cased) source names,
        transliterating to ASCII, tokenising, stemming and re-joining the
        tokens with single spaces.

        Returns the de-duplicated frame with the extra ``clean_body`` column.
        An empty input is returned with an empty ``clean_body`` column added.
        """
        import re  # local import: only needed here, to escape source names

        if news_df.empty:
            return news_df.assign(clean_body=pd.Series(dtype=object))

        for key_columns in (["title", "source"], ["body"], ["url"]):
            news_df = news_df.drop_duplicates(subset=key_columns).sort_index()
        news_df = news_df.reset_index(drop=True)

        stop_words = set(stopwords.words('english'))
        strip_punctuation = str.maketrans('', '', string.punctuation)

        def strip_noise(text):
            # Stop words are removed before punctuation is stripped, so a word
            # with attached punctuation (e.g. "the,") is not treated as one.
            kept_words = ' '.join(word for word in text.split() if word not in stop_words)
            without_punctuation = kept_words.translate(strip_punctuation)
            return ''.join(char for char in without_punctuation if not char.isdigit())

        news_df['clean_body'] = news_df['body'].str.lower().apply(strip_noise)
        Helper._log_clean_body_preview(news_df, "Contents of 'clean_body' after removing digits:")

        # Source names are literal text, not patterns: escape them so that
        # regex metacharacters in a name (".", "+", "(" ...) match literally.
        source_patterns = {re.escape(name.lower()): "" for name in set(news_df['source'])}
        news_df['clean_body'] = news_df['clean_body'].replace(source_patterns, regex=True)

        stemmer = SnowballStemmer(language='english')

        def stem_text(text):
            tokens = word_tokenize(unidecode(text))
            return ' '.join(stemmer.stem(token) for token in tokens)

        news_df['clean_body'] = news_df['clean_body'].apply(stem_text)
        Helper._log_clean_body_preview(news_df, "Contents of 'clean_body' after cleaning:")

        return news_df

    @staticmethod
    def shuffle_content(clusters_dict):
        """Shuffle the article list of every cluster in place (returns ``None``)."""
        for cluster in clusters_dict.values():
            random.shuffle(cluster)

    @staticmethod
    def prettify_similar(clusters_dict):
        """Turn each cluster's list of articles into per-field lists.

        Returns ``{cluster_id: {'source': [...], 'title': [...], 'url': [...],
        'image_url': [...]}}`` with every list in the cluster's article order.
        """
        fields = ('source', 'title', 'url', 'image_url')
        return {
            cluster_id: {field: [article[field] for article in articles] for field in fields}
            for cluster_id, articles in clusters_dict.items()
        }


class Scraper:
    """Download the articles published on one date from a set of RSS feeds."""

    def __init__(self, sources, news_date):
        """Store the feed configuration and the date to collect.

        Args:
            sources: mapping of source name -> dict whose ``'rss'`` entry is
                an iterable of feed URLs.
            news_date: the wanted date; ``str(news_date)`` is compared with
                each entry's publication date formatted as ``YYYY-MM-DD``.
        """
        self.sources = sources
        self.news_date = news_date

    @staticmethod
    def _parse_published(entry):
        """Return the entry's publication datetime, or None if absent or unparseable."""
        if not hasattr(entry, 'published'):
            return None
        try:
            return dateutil.parser.parse(entry.published)
        except (ValueError, OverflowError, TypeError) as e:
            # One malformed date must not abort the whole scrape.
            logging.warning(f'Skipping entry with unparseable date {entry.published!r}: {e}')
            return None

    @staticmethod
    def _fetch_article(source, entry, article_date):
        """Download and parse one entry; return its record dict, or None on failure."""
        try:
            logging.info(f'Processing article: {entry.link}')
            page = Article(entry.link)
            page.download()
            page.parse()
            page.nlp()
        except Exception as e:
            # Best effort per article: any failure here only skips this entry.
            logging.error(f'Error downloading/parsing article: {e}')
            logging.info('Continuing...')
            return None

        try:
            return {
                'source': source,
                'url': entry.link,
                'date': article_date.strftime('%Y-%m-%d'),
                # %Z is the timezone name as parsed (empty for naive datetimes);
                # no timezone conversion is performed.
                'time': article_date.strftime('%H:%M:%S %Z'),
                'title': page.title,
                'body': page.text,
                'summary': page.summary,
                'keywords': page.keywords,
                'image_url': page.top_image,
            }
        except Exception as e:
            logging.error(f'Error processing article: {e}')
            logging.info('Continuing...')
            return None

    def scrape(self):
        """Collect every article whose publication date equals ``news_date``.

        Entries without a ``published`` field, with an unparseable date, or
        whose article cannot be downloaded/parsed are skipped.

        Returns:
            A list of dicts with keys ``source``, ``url``, ``date``, ``time``,
            ``title``, ``body``, ``summary``, ``keywords`` and ``image_url``.

        Raises:
            Exception: on any other failure (e.g. a source without an
                ``'rss'`` key); the original error is attached as the cause.
        """
        try:
            wanted_date = str(self.news_date)
            articles_list = []
            for source, feed_config in self.sources.items():
                for url in feed_config['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    for entry in fp.parse(url).entries:
                        article_date = self._parse_published(entry)
                        if article_date is None:
                            continue
                        logging.info(f'Found article with date: {article_date}')
                        if article_date.strftime('%Y-%m-%d') != wanted_date:
                            continue
                        article = self._fetch_article(source, entry, article_date)
                        if article is None:
                            continue
                        articles_list.append(article)
                        Helper.print_scrape_status(len(articles_list))
            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}') from e

def compute_tfidf(news_df):
    """Return the TF-IDF weights of the ``clean_body`` column as a dense NumPy array.

    A fresh ``TfidfVectorizer`` with default settings is fitted on
    ``news_df['clean_body']``; the resulting matrix is densified and
    converted to a plain ``ndarray`` before being returned.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(news_df['clean_body'])
    dense_weights = weights.todense()
    return np.asarray(dense_weights)

def find_featured_clusters(clusters):
    """Keep only the clusters whose articles come from more than one source.

    ``clusters`` maps a cluster id to a sequence of articles, each indexable
    by ``"source"``. The returned dict reuses the same ids and article
    sequences, in the original iteration order.
    """
    return {
        cluster_id: members
        for cluster_id, members in clusters.items()
        if len({member["source"] for member in members}) > 1
    }

class Processor:
    """Groups articles into clusters of similar content."""

    @staticmethod
    def find_clusters(news_df, tfidf_df, distance_threshold=1):
        """Cluster articles by their TF-IDF vectors and return the multi-article clusters.

        Args:
            news_df: DataFrame of articles; row ``i`` corresponds to row ``i``
                of ``tfidf_df``.
            tfidf_df: 2-D array-like of feature vectors, one row per article.
            distance_threshold: passed to ``AgglomerativeClustering`` (with
                ``n_clusters=None``).

        Returns:
            ``{rank: [row, ...]}`` where rank 0 is the largest cluster, rows
            are the ``news_df.iloc[i]`` Series of its members in ascending
            position, and only clusters with at least 2 articles are
            included. Equal-sized clusters are ordered by ascending label.
            Returns ``{}`` when there are fewer than 2 articles, since
            clustering needs at least two samples.
        """
        if np.shape(tfidf_df)[0] < 2:
            return {}

        # fit_predict fits the model and returns the labels in one pass;
        # fitting first and then calling fit_predict clusters the data twice.
        model = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None)
        labels = model.fit_predict(tfidf_df)

        sizes = np.bincount(labels)
        # Stable sort so that ties keep ascending label order.
        labels_by_size = np.argsort(-sizes, kind='stable')

        clusters = {}
        for rank, label in enumerate(labels_by_size):
            if sizes[label] < 2:
                break  # sizes are descending: everything after is a singleton
            member_positions = np.flatnonzero(labels == label).tolist()
            clusters[rank] = [news_df.iloc[position] for position in member_positions]
        return clusters

def build_html(clusters_dict, news_name, news_date, template_filename, output_filename):
    """Render the newsletter template to an HTML file and open it in a browser tab.

    Args:
        clusters_dict: mapping of cluster id -> list of articles. The lists
            are shuffled IN PLACE before rendering.
        news_name: newsletter title, exposed to the template as ``news_name``.
        news_date: newsletter date, exposed to the template as ``news_date``.
        template_filename: path of the Jinja template (read as UTF-8).
        output_filename: path the rendered HTML is written to (UTF-8).

    Returns:
        True once the file has been written and the browser asked to open it.
    """
    from pathlib import Path  # local import: only needed to build the file URI

    newsletter = flask.Flask('newsletter')

    # Randomise article order inside each cluster, then build the per-field
    # lists that the template receives as ``similar_articles``.
    Helper.shuffle_content(clusters_dict)
    similar_articles = Helper.prettify_similar(clusters_dict)

    # Read the template with the same encoding the output is written in,
    # instead of the platform's locale default.
    with open(template_filename, 'r', encoding='utf-8') as file:
        template_content = file.read()

    # render_template_string needs an active application context.
    with newsletter.app_context():
        rendered = flask.render_template_string(
            template_content,
            news_name=news_name,
            news_date=news_date,
            clusters_dict=clusters_dict,
            similar_articles=similar_articles,
        )

    with open(output_filename, 'w', encoding='utf-8') as output:
        output.write(rendered)

    # Passing a bare filename to webbrowser is not portable; hand it an
    # absolute file:// URI instead.
    webbrowser.open_new_tab(Path(output_filename).resolve().as_uri())

    return True

# Script entry point: scrape -> clean -> TF-IDF -> cluster -> render newsletter.
# Statements are kept at module level so the intermediate results
# (news_df, clusters, ...) stay available as globals after the cell runs.
if __name__ == "__main__":
    try:
        # Close the configuration file deterministically instead of leaking
        # the handle returned by a bare open().
        with open('sources.json', encoding='utf-8') as sources_file:
            sources = json.load(sources_file)
        news_date = datetime.strptime('2024-06-23', '%Y-%m-%d').date()
        scraper = Scraper(sources, news_date)
        articles_list = scraper.scrape()

        news_df = pd.DataFrame(articles_list)
        logging.info(f'Scraped {len(news_df)} articles.')

        if news_df.empty:
            # A frame built from no articles has no columns, so the cleaning
            # steps cannot run; report it plainly instead of failing later.
            logging.warning(f'No articles found for {news_date}; nothing to cluster or render.')
        else:
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)

            tfidf_df = compute_tfidf(news_df)

            processor = Processor()
            clusters = processor.find_clusters(news_df, tfidf_df)

            # Only clusters covered by more than one source are featured.
            featured_clusters = find_featured_clusters(clusters)

            for cluster_id, articles in featured_clusters.items():
                logging.info(f'Cluster {cluster_id}:')
                for article in articles:
                    logging.info(f"- {article['title']} ({article['source']})")

            # Render the HTML newsletter
            build_html(featured_clusters, 'Daily News', news_date, 'newsletter.html', 'output_newsletter.html')

    except Exception as e:
        # Top-level boundary: report the failure rather than crash the cell.
        logging.error(f'An error occurred during scraping, cleaning, TF-IDF computation, clustering, or HTML rendering: {e}')


  news_df = news_df[news_df.title.str.count('\s+').ge(3)]
  news_df = news_df[news_df.body.str.count('\s+').ge(20)]
2024-06-24 12:58:56,159 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-24 12:58:58,445 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-24 12:58:58,446 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-24 12:58:58,452 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-24 12:58:58,453 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-24 12:58:58,455 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-24 12:58:58,457 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-24 12:58:58,459 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-24 12:58:58,462 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-24 12:58:58,463 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-24 12:58:58,465 - INFO - Fou