In [2]:
from datetime import datetime, timedelta
import json
import logging
import pandas as pd
import flask
import webbrowser
import feedparser as fp
from newspaper import Article
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import dateutil.parser
import string

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Helper:
    @staticmethod
    def print_scrape_status(count):
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count('\s+').ge(3)]
        news_df = news_df[news_df.body.str.count('\s+').ge(20)]
        return news_df

    @staticmethod
    def clean_articles(news_df):
        news_df = news_df.drop_duplicates(subset=["title", "source"]).reset_index(drop=True)
        news_df['clean_body'] = news_df['body'].str.lower()
        stop_words = set(stopwords.words('english'))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: [stemmer.stem(y) for y in x])
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x]))
        return news_df

    @staticmethod
    def shuffle_content(clusters_dict):
        for cluster in clusters_dict.values():
            random.shuffle(cluster)

    @staticmethod
    def prettify_similar(clusters_dict):
        similar_articles = {}
        for cluster_id, articles in clusters_dict.items():
            similar_articles[cluster_id] = {
                'source': [article['source'] for article in articles],
                'title': [article['title'] for article in articles],
                'url': [article['url'] for article in articles],
                'image_url': [article['image_url'] for article in articles]
            }
        return similar_articles

class Scraper:
    def __init__(self, sources, news_date_start, news_date_end, keyword):
        self.sources = sources
        self.news_date_start = news_date_start
        self.news_date_end = news_date_end
        self.keyword = keyword.lower()

    def scrape(self):
        try:
            articles_list = []
            articles_count = 0
            for source, content in self.sources.items():
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    d = fp.parse(url)
                    for entry in d.entries:
                        if hasattr(entry, 'published'):
                            article_date = dateutil.parser.parse(getattr(entry, 'published'))
                            logging.info(f'Found article with date: {article_date}')
                            if self.news_date_start <= article_date.date() <= self.news_date_end:
                                try:
                                    logging.info(f'Processing article: {entry.link}')
                                    content = Article(entry.link)
                                    content.download()
                                    content.parse()
                                    content.nlp()
                                    title = content.title.lower()
                                    body = content.text.lower()
                                    if self.keyword in title or self.keyword in body:
                                        article = {
                                            'source': source,
                                            'url': entry.link,
                                            'date': article_date.strftime('%Y-%m-%d'),
                                            'time': article_date.strftime('%H:%M:%S %Z'),
                                            'title': content.title,
                                            'body': content.text,
                                            'summary': content.summary,
                                            'keywords': content.keywords,
                                            'image_url': content.top_image
                                        }
                                        articles_list.append(article)
                                        articles_count += 1
                                        Helper.print_scrape_status(articles_count)
                                except Exception as e:
                                    logging.error(f'Error processing article: {e}')
                                    logging.info('Continuing...')
            return articles_list

        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}')

def compute_tfidf(news_df):
    tfidf_matrix = TfidfVectorizer().fit_transform(news_df['clean_body'])
    tfidf_array = np.asarray(tfidf_matrix.todense())
    return tfidf_array

def find_featured_clusters(clusters):
    featured_clusters = {}
    for i, cluster in clusters.items():
        if len(set([article['source'] for article in cluster])) > 1:
            featured_clusters[i] = cluster
    return featured_clusters

def find_clusters(news_df, tfidf_df, distance_threshold=1):
    if len(news_df) < 2:
        logging.warning('Insufficient articles for clustering. Featuring all articles.')
        return {0: news_df.to_dict('records')}
    
    ac = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None).fit(tfidf_df)
    articles_labeled = ac.fit_predict(tfidf_df)
    
    cluster_count = {}
    for label in range(max(articles_labeled) + 1):
        cluster_count[label] = np.count_nonzero(articles_labeled == label)
    
    clusters = {}
    for n in range(max(cluster_count) + 1):
        indexes = np.argwhere(articles_labeled == n).flatten().tolist()
        clusters[n] = []
        for i in indexes:
            clusters[n].append(news_df.iloc[i].to_dict())
    
    return clusters

def create_summary_table(news_df):
    summary = pd.DataFrame(news_df['source'].value_counts()).reset_index()
    summary.columns = ['Source', 'Number of Articles']
    return summary

if __name__ == "__main__":
    try:
        sources = json.load(open('sources.json'))
        today = datetime.now().date()
        news_date_start = today - timedelta(days=7)  # Scraping articles from the past week
        news_date_end = today
        keyword = 'boeing'
        
        scraper = Scraper(sources, news_date_start, news_date_end, keyword)
        articles_list = scraper.scrape()
        news_df = pd.DataFrame(articles_list)
        
        logging.info(f'Scraped {len(news_df)} articles from {news_date_start} to {news_date_end}.')
        summary_table = create_summary_table(news_df)
        logging.info('\nSummary Table:\n' + summary_table.to_string(index=False))
        
        news_df = Helper.clean_dataframe(news_df)
        news_df = Helper.clean_articles(news_df)
        tfidf_df = compute_tfidf(news_df)

        # Check if enough articles for clustering
        if len(news_df) < 2:
            logging.warning('Insufficient articles for clustering. Featuring all articles.')
            featured_clusters = {0: news_df.to_dict('records')}
        else:
            # Perform clustering
            clusters = find_clusters(news_df, tfidf_df)
            featured_clusters = find_featured_clusters(clusters)

        for cluster_id, articles in featured_clusters.items():
            logging.info(f'Cluster {cluster_id}:')
            for article in articles:
                logging.info(f"- {article['title']} ({article['source']})")

        build_html(featured_clusters, 'Daily News', today, 'newsletter.html', 'output_newsletter.html')
    
    except Exception as e:
        logging.error(f'An error occurred during scraping, cleaning, TF-IDF computation, clustering, or HTML rendering: {e}')

  news_df = news_df[news_df.title.str.count('\s+').ge(3)]
  news_df = news_df[news_df.body.str.count('\s+').ge(20)]
2024-06-24 13:16:25,340 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-24 13:16:25,703 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-24 13:16:25,703 - INFO - Processing article: https://www.cnn.com/2023/07/09/health/what-happens-to-your-body-extreme-heat-xpn/index.html
2024-06-24 13:16:27,343 - ERROR - Error processing article: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/sheillapurwandiary/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/nltk_data'
    - '/Library/Framewo