In [1]:
import feedparser as fp
import dateutil.parser
from newspaper import Article, Config
import logging
import pandas as pd
import json
from datetime import datetime, timedelta, timezone
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import os
import flask
import webbrowser
import random

# Configure the root logger once for the whole script: INFO and above,
# each record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class Helper:
    """Stateless helpers for progress logging and for cleaning/formatting articles."""

    @staticmethod
    def print_scrape_status(count):
        """Log how many articles have been collected so far."""
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        """Drop rows that are empty or too short to be useful.

        A row is kept only if its ``title``, ``body`` and ``image_url`` are
        all non-empty strings, the title contains at least 3 runs of
        whitespace and the body contains at least 20.

        Args:
            news_df: DataFrame with ``title``, ``body`` and ``image_url`` columns.

        Returns:
            The filtered DataFrame (original index labels are preserved).
        """
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        # Raw strings: '\s' inside a normal literal is an invalid escape
        # sequence and makes the interpreter emit a warning for these lines.
        news_df = news_df[news_df.title.str.count(r'\s+').ge(3)]  # at least 3 whitespace runs in the title
        news_df = news_df[news_df.body.str.count(r'\s+').ge(20)]  # at least 20 whitespace runs in the body
        return news_df

    @staticmethod
    def clean_articles(news_df):
        """De-duplicate the articles and add a normalised ``clean_body`` column.

        ``clean_body`` is built from ``body`` by, in order: lower-casing,
        removing English stopwords, stripping ASCII punctuation, removing
        digits, removing the (lower-cased) source names, transliterating with
        ``unidecode``, tokenizing, Snowball-stemming and re-joining the tokens
        with single spaces.

        Args:
            news_df: DataFrame with ``title``, ``source``, ``body`` and ``url`` columns.

        Returns:
            A de-duplicated DataFrame with a fresh 0..n-1 index and the extra
            ``clean_body`` column.
        """
        # Local import: only this method needs ``re``.
        import re

        # Drop duplicates
        news_df = (news_df.drop_duplicates(subset=["title", "source"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["body"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["url"])).sort_index()
        news_df = news_df.reset_index(drop=True)

        # Make all letters lower case
        news_df['clean_body'] = news_df['body'].str.lower()

        # Filter out the stopwords and punctuation. The text is already
        # lower-cased, and the translation table is built once, not per row.
        stop_words = set(stopwords.words('english'))
        punctuation_table = str.maketrans('', '', string.punctuation)
        news_df['clean_body'] = news_df['clean_body'].apply(
            lambda text: ' '.join(word for word in text.split() if word not in stop_words)
        )
        news_df['clean_body'] = news_df['clean_body'].apply(
            lambda text: text.translate(punctuation_table)
        )

        # Remove digits
        news_df['clean_body'] = news_df['clean_body'].apply(
            lambda text: ''.join(ch for ch in text if not ch.isdigit())
        )

        # Log contents of clean_body after removing digits
        logging.info("Contents of 'clean_body' after removing digits:")
        for i, body in enumerate(news_df['clean_body'].head(10)):
            logging.info(f'Article {i + 1}: {body}')

        # Remove sources. The names are escaped so that regex metacharacters
        # in a source name (e.g. '.') are matched literally instead of being
        # interpreted as a pattern.
        sources_to_replace = {
            re.escape(name.lower()): "" for name in set(news_df['source']) if name
        }
        if sources_to_replace:
            news_df['clean_body'] = news_df['clean_body'].replace(sources_to_replace, regex=True)

        # Unidecode all characters
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)

        # Tokenize
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)

        # Stem words and join the stems back into one string per article
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(
            lambda tokens: ' '.join(stemmer.stem(token) for token in tokens)
        )

        # Debugging: log the first cleaned bodies
        logging.info("Contents of 'clean_body' after cleaning:")
        for i, body in enumerate(news_df['clean_body'].head(10)):
            logging.info(f'Article {i + 1}: {body}')

        return news_df

    @staticmethod
    def shuffle_content(clusters_dict):
        """Shuffle the article list of every cluster in place; returns ``None``."""
        for cluster in clusters_dict.values():
            random.shuffle(cluster)

    @staticmethod
    def prettify_similar(clusters_dict):
        """Return one ``', '``-joined string of article titles per cluster."""
        return [
            ', '.join(article['title'] for article in cluster)
            for cluster in clusters_dict.values()
        ]

def compute_tfidf(news_df):
    """Fit a default TfidfVectorizer on ``news_df['clean_body']`` and return
    the resulting TF-IDF weights as a dense NumPy array."""
    vectorizer = TfidfVectorizer()
    sparse_weights = vectorizer.fit_transform(news_df['clean_body'])
    # Densify directly to an ndarray
    return sparse_weights.toarray()

def find_featured_clusters(clusters):
    """Return the clusters whose articles come from more than one source.

    ``clusters`` maps a cluster label to a list of article dicts, each with a
    ``"source"`` key; the result keeps the matching label -> list entries.
    """
    return {
        label: members
        for label, members in clusters.items()
        if len({article["source"] for article in members}) > 1
    }

# Custom configuration for the newspaper library
config = Config()
# NOTE(review): machine-specific Windows path; adjust when running elsewhere.
custom_tmp_dir = 'C:\\Users\\fredd\\custom_newspaper_tmp'

# Ensure the directory exists. exist_ok=True replaces the separate
# exists()/makedirs() pair, which could fail if the directory appeared
# between the check and the creation.
os.makedirs(custom_tmp_dir, exist_ok=True)

config.fetch_images = False
config.memoize_articles = False
config.request_timeout = 10
config.directory = custom_tmp_dir

class CacheManager:
    """JSON-file-backed cache of scraped articles, keyed by article URL.

    The whole cache lives in ``self.cache`` (a dict) and is rewritten to
    ``cache_file`` every time an article is added.
    """

    def __init__(self, cache_file='article_cache.json'):
        self.cache_file = cache_file
        self.load_cache()

    def load_cache(self):
        """Load ``self.cache`` from disk.

        Starts with an empty cache when the file is missing, unreadable, not
        valid JSON, or does not contain a JSON object, so a damaged cache
        file never prevents a run.
        """
        self.cache = {}
        if not os.path.exists(self.cache_file):
            return
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
            logging.warning(f'Ignoring unreadable cache file {self.cache_file}: {e}')
            return
        if isinstance(loaded, dict):
            self.cache = loaded
        else:
            logging.warning(f'Ignoring cache file {self.cache_file}: expected a JSON object')

    def save_cache(self):
        """Write ``self.cache`` to disk.

        The data is written to a temporary sibling file and then moved over
        the real file, so an interrupted write cannot leave a truncated cache.
        """
        tmp_file = f'{self.cache_file}.tmp'
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(self.cache, f, indent=4)
        os.replace(tmp_file, self.cache_file)

    @staticmethod
    def _contains_keyword(article_data, keyword):
        """Return True if ``keyword`` occurs (case-insensitively) in the title or body."""
        needle = keyword.lower()
        title = article_data.get('title') or ''
        body = article_data.get('body') or ''
        return needle in title.lower() or needle in body.lower()

    def get_article(self, url, keyword):
        """Return the cached article for ``url``, or ``None`` if it is not cached.

        The returned dict is a shallow copy whose ``contains_keyword`` flag is
        recomputed for ``keyword``; the stored flag only reflects the keyword
        that was in use when the article was cached. Articles that do not
        match are still returned (with the flag set to False) so the caller
        can skip them without downloading them again.
        """
        cached_article = self.cache.get(url)
        if not cached_article:
            return None
        article = dict(cached_article)
        article['contains_keyword'] = self._contains_keyword(article, keyword)
        return article

    def add_article(self, url, article_data, keyword):
        """Store ``article_data`` under ``url`` and persist the cache.

        ``article_data['contains_keyword']`` is set (in place) to whether
        ``keyword`` occurs, case-insensitively, in the title or body.
        """
        article_data['contains_keyword'] = self._contains_keyword(article_data, keyword)
        self.cache[url] = article_data
        self.save_cache()

class Scraper:
    """Collect recent articles that mention a keyword from a set of RSS feeds.

    Args:
        sources: Mapping of source name -> dict with an ``'rss'`` list of feed URLs.
        days: Maximum article age, in days, counted back from now (UTC).
        keyword: Keyword to look for; matched case-insensitively.
        cache_manager: Object providing ``get_article(url, keyword)`` and
            ``add_article(url, article, keyword)``.
    """

    def __init__(self, sources, days, keyword, cache_manager):
        self.sources = sources
        self.days = days
        self.keyword = keyword.lower()
        self.cache_manager = cache_manager

    def scrape(self):
        """Walk every feed of every source and return the matching articles.

        Returns:
            A list of article dicts (freshly downloaded or taken from the
            cache) whose title or body contains the keyword.

        Raises:
            Exception: Wraps any unexpected error; the original exception is
                attached as ``__cause__``.
        """
        try:
            articles_list = []
            now = datetime.now(timezone.utc)
            max_age = timedelta(days=self.days)
            for source, source_info in self.sources.items():
                logging.info(f'Source: {source}')
                logging.info(f'Content: {source_info}')
                for feed_url in source_info['rss']:
                    logging.info(f'Processing RSS feed: {feed_url}')
                    try:
                        feed = fp.parse(feed_url)
                    except Exception as e:
                        logging.error(f'Error parsing RSS feed {feed_url}: {e}')
                        continue

                    for entry in feed.entries:
                        article = self._process_entry(source, entry, now, max_age)
                        if article is not None:
                            articles_list.append(article)
                            Helper.print_scrape_status(len(articles_list))

            return articles_list

        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            # Chain the original exception so its traceback is not lost.
            raise Exception(f'Error in "Scraper.scrape()": {e}') from e

    def _process_entry(self, source, entry, now, max_age):
        """Return the article for one feed entry, or ``None`` if it must be skipped."""
        if not hasattr(entry, 'published'):
            logging.warning(f'Entry missing "published" attribute: {entry}')
            return None

        # An entry without a link cannot be downloaded; skip it instead of
        # letting the AttributeError abort the whole scrape.
        link = getattr(entry, 'link', None)
        if not link:
            logging.warning(f'Entry missing "link" attribute: {entry}')
            return None

        try:
            article_date = dateutil.parser.parse(entry.published)
            article_date = article_date.astimezone(timezone.utc)  # Ensure article_date is offset-aware
            logging.info(f'Found article with date: {article_date}')
        except Exception as e:
            logging.error(f'Error parsing article date: {e}')
            return None

        if now - article_date > max_age:
            logging.info(f'Skipping article outside date range: {link}')
            return None

        cached_article = self.cache_manager.get_article(link, self.keyword)
        if cached_article:
            logging.info(f'Using cached article: {link}')
            if cached_article.get('contains_keyword'):
                return cached_article
            logging.info(f'Skipping cached article without keyword: {link}')
            return None

        try:
            return self._download_article(source, link, article_date)
        except Exception as e:
            logging.error(f'Error processing article: {e}')
            logging.info('Continuing...')
            return None

    def _download_article(self, source, link, article_date):
        """Download, parse and cache one article; return it only if it has the keyword."""
        logging.info(f'Processing article: {link}')
        # Named ``page`` so it does not shadow the per-source dict in scrape().
        page = Article(link, config=config)
        page.download()
        page.parse()
        page.nlp()

        # Check if keyword is in title or body
        contains_keyword = (self.keyword in page.title.lower() or self.keyword in page.text.lower())

        article = {
            'source': source,
            'url': link,
            'date': article_date.strftime('%Y-%m-%d'),
            'time': article_date.strftime('%H:%M:%S %Z'),
            'title': page.title,
            'body': page.text,
            'summary': page.summary,
            'keywords': page.keywords,
            'image_url': page.top_image,
            'contains_keyword': contains_keyword
        }

        # Every downloaded article is cached, matching or not.
        self.cache_manager.add_article(link, article, self.keyword)

        return article if contains_keyword else None

def compute_tfidf(news_df):
    """Return the TF-IDF weights of the ``clean_body`` column as a NumPy array."""
    bodies = news_df['clean_body']
    # Vectorize with default settings, then turn the sparse result into a plain ndarray
    return np.asarray(TfidfVectorizer().fit_transform(bodies).todense())

def find_featured_clusters(clusters):
    """Keep only the clusters that contain articles from at least two distinct sources."""
    multi_source = {}
    for cluster_id, articles in clusters.items():
        distinct_sources = {article["source"] for article in articles}
        if len(distinct_sources) <= 1:
            # Empty or single-source cluster: not featured
            continue
        multi_source[cluster_id] = articles
    return multi_source

def build_html(clusters_dict, news_name, news_date, template_string, output_filename):
    """Render the newsletter template to a file and open it in the browser.

    Args:
        clusters_dict: Clusters passed to the template as ``clusters_dict``.
        news_name: Passed to the template as ``news_name``.
        news_date: Passed to the template as ``news_date``.
        template_string: Template source rendered with ``flask.render_template_string``.
        output_filename: Path of the HTML file to write (UTF-8).

    Returns:
        True once the file has been written and the browser was asked to open it.
    """
    # Local import: only this function needs pathlib.
    from pathlib import Path

    # A throwaway Flask app provides the application context that
    # render_template_string needs.
    app = flask.Flask(__name__)

    with app.app_context():
        rendered = flask.render_template_string(template_string,
                                                news_name=news_name,
                                                news_date=news_date,
                                                clusters_dict=clusters_dict)

    with open(output_filename, 'w', encoding="utf-8") as output:
        output.write(rendered)

    # Hand the browser an absolute file:// URI; passing a bare (relative)
    # filename to webbrowser is documented as unsupported and non-portable.
    webbrowser.open_new_tab(Path(output_filename).resolve().as_uri())

    return True

# Script entry point: scrape -> clean -> TF-IDF -> cluster -> render newsletter.
if __name__ == '__main__':
    with open('sources.json', 'r', encoding='utf-8') as file:
        sources = json.load(file)

    news_date = datetime.now().strftime('%Y-%m-%d')
    days_back = 7  # Specify number of days back you want to scrape

    keyword = "Biden"  # Specify your keyword

    cache_manager = CacheManager()

    scraper = Scraper(sources, days_back, keyword, cache_manager)
    try:
        articles = scraper.scrape()

        if not articles:
            logging.warning('No articles were scraped.')
        else:
            news_df = pd.DataFrame(articles)
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)

            news_df.to_csv('cleaned_articles.csv', index=False)

            if len(news_df) < 2:
                # Clustering needs at least two samples; bail out with a
                # clear message instead of failing inside scikit-learn.
                logging.warning('Fewer than 2 articles left after cleaning; skipping clustering.')
            else:
                tfidf_df = compute_tfidf(news_df)

                distance_threshold = 1
                ac = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None)
                # fit_predict fits the model and returns the labels in one
                # pass; a separate .fit() beforehand only repeats the work.
                articles_labeled = ac.fit_predict(tfidf_df)

                # Group the article rows by cluster label; news_df was
                # re-indexed in clean_articles, so the label array lines up
                # with its rows and can be used as a boolean mask.
                clusters = {
                    n: news_df[articles_labeled == n].to_dict(orient='records')
                    for n in np.unique(articles_labeled)
                }

                featured_clusters = find_featured_clusters(clusters)

                template_filename = 'template.html'
                output_filename = 'newsletter.html'

                with open(template_filename, 'r', encoding='utf-8') as template_file:
                    template_string = template_file.read()

                build_html(featured_clusters, "Daily News", news_date, template_string, output_filename)

    except Exception as e:
        # logging.exception keeps the traceback, which the bare message loses.
        logging.exception(f'An error occurred: {e}')


  news_df = news_df[news_df.title.str.count('\s+').ge(3)]  # keep only titles having more than 3 spaces in the title
  news_df = news_df[news_df.body.str.count('\s+').ge(20)]  # keep only titles having more than 20 spaces in the body
2024-06-29 11:57:07,710 - INFO - Source: CNN
2024-06-29 11:57:07,712 - INFO - Content: {'rss': ['http://rss.cnn.com/rss/cnn_latest.rss', 'http://rss.cnn.com/rss/money_latest.rss', 'http://rss.cnn.com/rss/edition_world.rss', 'http://rss.cnn.com/rss/edition.xml'], 'link': ['https://edition.cnn.com/']}
2024-06-29 11:57:07,713 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-29 11:57:08,167 - INFO - Found article with date: 2024-06-28 12:57:05+00:00
2024-06-29 11:57:08,167 - INFO - Processing article: https://www.cnn.com/style/article/kate-moss-glastonbury-fashion-remember-when/index.html
2024-06-29 11:57:09,274 - INFO - Found article with date: 2024-06-28 00:45:41+00:00
2024-06-29 11:57:09,275 - INFO - Processing article: https://ww