In [11]:
import feedparser as fp
import dateutil.parser
import newspaper
from newspaper import Article
import logging
import pandas as pd
import json
from datetime import datetime

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Helper:
    @staticmethod
    def print_scrape_status(count):
        logging.info(f'Scraped {count} articles so far...')

class Scraper:
    def __init__(self, sources, news_date):
        self.sources = sources
        self.news_date = news_date

    def scrape(self):
        # Function that scrapes the content from the URLs in the source data
        try:
            articles_list = []
            for source, content in self.sources.items():
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    d = fp.parse(url)
                    for entry in d.entries:
                        if hasattr(entry, 'published'):
                            article_date = dateutil.parser.parse(getattr(entry, 'published'))
                            logging.info(f'Found article with date: {article_date}')
                            if article_date.strftime('%Y-%m-%d') == str(self.news_date):
                                try:
                                    logging.info(f'Processing article: {entry.link}')
                                    content = Article(entry.link)
                                    content.download()
                                    content.parse()
                                    content.nlp()
                                    try:
                                        article = {
                                            'source': source,
                                            'url': entry.link,
                                            'date': article_date.strftime('%Y-%m-%d'),
                                            'time': article_date.strftime('%H:%M:%S %Z'),  # hour, minute, timezone (converted)
                                            'title': content.title,
                                            'body': content.text,
                                            'summary': content.summary,
                                            'keywords': content.keywords,
                                            'image_url': content.top_image
                                        }
                                        articles_list.append(article)
                                        Helper.print_scrape_status(len(articles_list))
                                    except Exception as e:
                                        logging.error(f'Error processing article: {e}')
                                        logging.info('Continuing...')
                                except Exception as e:
                                    logging.error(f'Error downloading/parsing article: {e}')
                                    logging.info('Continuing...')
            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}')

# Usage example
if __name__ == '__main__':
    # Read sources from sources.json file
    try:
        with open('sources.json', 'r') as file:
            sources = json.load(file)
    except Exception as e:
        logging.error(f'Error reading sources.json: {e}')
        raise

    # Get current date
    news_date = datetime.now().strftime('%Y-%m-%d')
    logging.info(f'News date: {news_date}')
    
    scraper = Scraper(sources, news_date)
    try:
        articles = scraper.scrape()
        
        # Check if articles were scraped
        if not articles:
            logging.warning('No articles were scraped.')
        else:
            # Convert the articles list to a Pandas DataFrame
            news_df = pd.DataFrame(articles)
            
            # Display the DataFrame
            logging.info(f'Scraped {len(news_df)} articles.')
            logging.info(news_df)
            
            # Optionally save to a CSV file
            news_df.to_csv('news_articles.csv', index=False)
    except Exception as e:
        logging.error(f'An error occurred during scraping: {e}')


2024-06-24 14:34:42,504 - INFO - News date: 2024-06-24
2024-06-24 14:34:42,506 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-24 14:34:43,049 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-24 14:34:43,050 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-24 14:34:43,050 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-24 14:34:43,051 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-24 14:34:43,052 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-24 14:34:43,053 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-24 14:34:43,054 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-24 14:34:43,055 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-24 14:34:43,055 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-24 14:34:43,056 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-24 14

In [8]:
print(news_df)

NameError: name 'news_df' is not defined

In [29]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode

class Helper:
    @staticmethod
    def clean_dataframe(news_df):
        # Function that cleans the article dataframe
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count('\s+').ge(3)]  # Keep only titles having more than 3 spaces in the title
        news_df = news_df[news_df.body.str.count('\s+').ge(20)]  # Keep only bodies having more than 20 spaces
        return news_df

    @staticmethod
    def clean_articles(news_df):
        # Function that cleans all the bodies of the articles
        # Drop Duplicates
        news_df = news_df.drop_duplicates(subset=["title", "source"]).sort_index()
        news_df = news_df.drop_duplicates(subset=["body"]).sort_index()
        news_df = news_df.drop_duplicates(subset=["url"]).sort_index()
        news_df = news_df.reset_index(drop=True)
        
        # Make all letters lower case
        news_df['clean_body'] = news_df['body'].str.lower()
        
        # Remove stopwords, punctuation, and digits
        stop_words = set(stopwords.words('english'))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.digits)))
        
        # Remove sources
        sources_set = [x.lower() for x in set(news_df['source'])]
        sources_to_replace = dict.fromkeys(sources_set, "")
        news_df['clean_body'] = news_df['clean_body'].replace(sources_to_replace, regex=True)
        
        # Unidecode all characters
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)
        
        # Tokenize
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)
        
        # Stem words
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: [stemmer.stem(word) for word in x])
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x]))
        
        return news_df


In [30]:
if __name__ == '__main__':
    # Your existing code for scraping and creating news_df
    
    # Initialize Scraper and scrape articles
    scraper = Scraper(sources, news_date)
    try:
        articles = scraper.scrape()
        
        # Check if articles were scraped
        if not articles:
            logging.warning('No articles were scraped.')
        else:
            # Convert the articles list to a Pandas DataFrame
            news_df = pd.DataFrame(articles)
            
            # Clean the DataFrame using Helper methods
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)
            
            # Display the cleaned DataFrame
            logging.info(news_df)
            
            # Optionally save to a CSV file
            news_df.to_csv('cleaned_news_articles.csv', index=False)
    except Exception as e:
        logging.error(f'An error occurred during scraping: {e}')


2024-06-23 19:21:04,345 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-23 19:21:04,661 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-23 19:21:04,662 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-23 19:21:04,663 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-23 19:21:04,663 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-23 19:21:04,664 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-23 19:21:04,665 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-23 19:21:04,665 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-23 19:21:04,666 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-23 19:21:04,666 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-23 19:21:04,666 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-23 19:21:04,667 - INFO - Found article with date: 2024-06-13

In [31]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import logging
import pandas as pd
import json
from datetime import datetime

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Helper:
    @staticmethod
    def print_scrape_status(count):
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        # Function that cleans the article dataframe
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count('\s+').ge(3)]  # Keep only titles having more than 3 spaces in the title
        news_df = news_df[news_df.body.str.count('\s+').ge(20)]  # Keep only bodies having more than 20 spaces
        return news_df

    @staticmethod
    def clean_articles(news_df):
        # Function that cleans all the bodies of the articles
        # Drop Duplicates
        news_df = news_df.drop_duplicates(subset=["title", "source"]).sort_index()
        news_df = news_df.drop_duplicates(subset=["body"]).sort_index()
        news_df = news_df.drop_duplicates(subset=["url"]).sort_index()
        news_df = news_df.reset_index(drop=True)
        
        # Make all letters lower case
        news_df['clean_body'] = news_df['body'].str.lower()
        
        # Remove stopwords, punctuation, and digits
        stop_words = set(stopwords.words('english'))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.digits)))
        
        # Remove sources
        sources_set = [x.lower() for x in set(news_df['source'])]
        sources_to_replace = dict.fromkeys(sources_set, "")
        news_df['clean_body'] = news_df['clean_body'].replace(sources_to_replace, regex=True)
        
        # Unidecode all characters
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)
        
        # Tokenize
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)
        
        # Stem words
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: [stemmer.stem(word) for word in x])
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x]))
        
        return news_df
    
    @staticmethod
    def compute_tfidf(news_df):
        # Function that computes the TFIDF values for all words in the article bodies
        tfidf_df = TfidfVectorizer().fit_transform(news_df['clean_body']).todense()
        return tfidf_df


class Scraper:
    def __init__(self, sources, news_date):
        self.sources = sources
        self.news_date = news_date

    def scrape(self):
        # Function that scrapes the content from the URLs in the source data
        try:
            articles_list = []
            for source, content in self.sources.items():
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    d = fp.parse(url)
                    for entry in d.entries:
                        if hasattr(entry, 'published'):
                            article_date = dateutil.parser.parse(getattr(entry, 'published'))
                            logging.info(f'Found article with date: {article_date}')
                            if article_date.strftime('%Y-%m-%d') == str(self.news_date):
                                try:
                                    logging.info(f'Processing article: {entry.link}')
                                    content = Article(entry.link)
                                    content.download()
                                    content.parse()
                                    content.nlp()
                                    try:
                                        article = {
                                            'source': source,
                                            'url': entry.link,
                                            'date': article_date.strftime('%Y-%m-%d'),
                                            'time': article_date.strftime('%H:%M:%S %Z'),  # hour, minute, timezone (converted)
                                            'title': content.title,
                                            'body': content.text,
                                            'summary': content.summary,
                                            'keywords': content.keywords,
                                            'image_url': content.top_image
                                        }
                                        articles_list.append(article)
                                        Helper.print_scrape_status(len(articles_list))
                                    except Exception as e:
                                        logging.error(f'Error processing article: {e}')
                                        logging.info('Continuing...')
                                except Exception as e:
                                    logging.error(f'Error downloading/parsing article: {e}')
                                    logging.info('Continuing...')
            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}')


if __name__ == '__main__':
    # Read sources from sources.json file
    with open('sources.json', 'r') as file:
        sources = json.load(file)
    
    # Get current date
    news_date = datetime.now().strftime('%Y-%m-%d')
    
    # Initialize Scraper and scrape articles
    scraper = Scraper(sources, news_date)
    try:
        articles = scraper.scrape()
        
        # Check if articles were scraped
        if not articles:
            logging.warning('No articles were scraped.')
        else:
            # Convert the articles list to a Pandas DataFrame
            news_df = pd.DataFrame(articles)
            
            # Clean the DataFrame using Helper methods
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)
            
            # Compute TF-IDF
            tfidf_matrix = Helper.compute_tfidf(news_df)
            
            # Display the cleaned DataFrame
            logging.info(f'Cleaned DataFrame has {len(news_df)} articles.')
            logging.info(news_df.head())  # Display the first few rows
            
            # Optionally save to a CSV file
            news_df.to_csv('cleaned_news_articles.csv', index=False)
            
            # Verify assertions or checks
            assert 'clean_body' in news_df.columns, "Expected 'clean_body' column is missing."
            assert not news_df['clean_body'].str.contains('[0-9]').any(), "Digits should be removed from 'clean_body'."
            
            # Log TF-IDF results
            logging.info(f'TF-IDF Matrix shape: {tfidf_matrix.shape}')
            
            logging.info('Data cleaning and TF-IDF computation completed successfully.')
            
    except Exception as e:
        logging.error(f'An error occurred during scraping or cleaning: {e}')


2024-06-23 19:31:08,309 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-23 19:31:08,624 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-23 19:31:08,625 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-23 19:31:08,625 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-23 19:31:08,626 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-23 19:31:08,627 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-23 19:31:08,627 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-23 19:31:08,628 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-23 19:31:08,628 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-23 19:31:08,629 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-23 19:31:08,629 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-23 19:31:08,630 - INFO - Found article with date: 2024-06-13

In [32]:
print(tfidf_matrix)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.02853657 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [38]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

class Processer:
    @staticmethod
    def find_clusters(news_df, tfidf_matrix, distance_threshold=1):
        # Function that finds clusters of similar articles within a dataframe of articles
        ac = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None).fit(tfidf_matrix)
        articles_labeled = ac.labels_
        
        # Prepare clusters dictionary
        clusters = {}
        for label in np.unique(articles_labeled):
            cluster_indices = np.where(articles_labeled == label)[0]
            if len(cluster_indices) > 1:
                clusters[label] = news_df.iloc[cluster_indices].to_dict('records')
        
        return clusters
    
    @staticmethod
    def plot_dendrogram(tfidf_matrix):
        # Function to plot a dendrogram for hierarchical clustering
        # Convert tfidf_matrix to numpy array if it's a matrix
        if isinstance(tfidf_matrix, np.matrix):
            tfidf_matrix = np.asarray(tfidf_matrix)
        
        linkage_matrix = linkage(tfidf_matrix, 'ward')
        plt.figure(figsize=(15, 8))
        dendrogram(linkage_matrix, orientation='top', labels=None, distance_sort='descending', show_leaf_counts=True)
        plt.title('Hierarchical Clustering Dendrogram')
        plt.xlabel('Cluster Size')
        plt.ylabel('Distance')
        plt.tight_layout()
        plt.show()


In [39]:
if __name__ == '__main__':
    # Your existing code for scraping, cleaning, and computing TF-IDF
    
    # Initialize Scraper and scrape articles
    scraper = Scraper(sources, news_date)
    try:
        articles = scraper.scrape()
        
        # Check if articles were scraped
        if not articles:
            logging.warning('No articles were scraped.')
        else:
            # Convert the articles list to a Pandas DataFrame
            news_df = pd.DataFrame(articles)
            
            # Clean the DataFrame using Helper methods
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)
            
            # Compute TF-IDF
            tfidf_matrix = Helper.compute_tfidf(news_df)
            
            # Display the cleaned DataFrame
            logging.info(f'Cleaned DataFrame has {len(news_df)} articles.')
            logging.info(news_df.head())  # Display the first few rows
            
            # Optionally save to a CSV file
            news_df.to_csv('cleaned_news_articles.csv', index=False)
            
            # Verify assertions or checks
            assert 'clean_body' in news_df.columns, "Expected 'clean_body' column is missing."
            assert not news_df['clean_body'].str.contains('[0-9]').any(), "Digits should be removed from 'clean_body'."
            
            # Compute clusters
            processor = Processer()
            clusters = processor.find_clusters(news_df, tfidf_matrix)
            
            # Display clusters
            for cluster_id, articles in clusters.items():
                logging.info(f'Cluster {cluster_id}: {len(articles)} articles')
                for article in articles:
                    logging.info(f"- {article['title']} ({article['url']})")
            
            # Plot dendrogram
            logging.info('Plotting dendrogram...')
            processor.plot_dendrogram(tfidf_matrix)
            
            logging.info('Data cleaning, TF-IDF computation, clustering, and dendrogram plotting completed successfully.')
            
    except Exception as e:
        logging.error(f'An error occurred during scraping, cleaning, TF-IDF computation, clustering, or dendrogram plotting: {e}')


2024-06-23 19:45:56,344 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-23 19:45:56,613 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-23 19:45:56,613 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-23 19:45:56,614 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-23 19:45:56,614 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-23 19:45:56,614 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-23 19:45:56,615 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-23 19:45:56,616 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-23 19:45:56,616 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-23 19:45:56,617 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-23 19:45:56,617 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-23 19:45:56,618 - INFO - Found article with date: 2024-06-13

In [43]:
import feedparser as fp
import dateutil.parser
from newspaper import Article
import logging
import pandas as pd
import json
from datetime import datetime
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Helper:
    @staticmethod
    def print_scrape_status(count):
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        # Function that cleans the article dataframe
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count('\s+').ge(3)]  # keep only titles having more than 3 spaces in the title
        news_df = news_df[news_df.body.str.count('\s+').ge(20)]  # keep only titles having more than 20 spaces in the body
        return news_df
    
    @staticmethod
    def clean_articles(news_df):
        # Function that cleans all the bodies of the articles
        # Drop Duplicates
        news_df = (news_df.drop_duplicates(subset=["title", "source"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["body"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["url"])).sort_index()
        news_df = news_df.reset_index(drop=True)
        
        # Make all letters lower case
        news_df['clean_body'] = news_df['body'].str.lower()
        
        # Filter out the stopwords, punctuation and digits
        stop_words = set(stopwords.words('english'))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
        
        # Remove sources
        sources_set = [x.lower() for x in set(news_df['source'])]
        sources_to_replace = dict.fromkeys(sources_set, "")
        news_df['clean_body'] = (news_df['clean_body'].replace(sources_to_replace, regex=True))
        
        # Unidecode all characters
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)
        
        # Tokenize
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)
        
        # Stem words
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: [stemmer.stem(y) for y in x])
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x]))
        
        return news_df

class Scraper:
    def __init__(self, sources, news_date):
        self.sources = sources
        self.news_date = news_date
    
    def scrape(self):
        # Function that scrapes the content from the URLs in the source data
        try:
            articles_list = []
            for source, content in self.sources.items():
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    d = fp.parse(url)
                    for entry in d.entries:
                        if hasattr(entry, 'published'):
                            article_date = dateutil.parser.parse(getattr(entry, 'published'))
                            logging.info(f'Found article with date: {article_date}')
                            if article_date.strftime('%Y-%m-%d') == str(self.news_date):
                                try:
                                    logging.info(f'Processing article: {entry.link}')
                                    content = Article(entry.link)
                                    content.download()
                                    content.parse()
                                    content.nlp()
                                    try:
                                        article = {
                                            'source': source,
                                            'url': entry.link,
                                            'date': article_date.strftime('%Y-%m-%d'),
                                            'time': article_date.strftime('%H:%M:%S %Z'),  # hour, minute, timezone (converted)
                                            'title': content.title,
                                            'body': content.text,
                                            'summary': content.summary,
                                            'keywords': content.keywords,
                                            'image_url': content.top_image
                                        }
                                        articles_list.append(article)
                                        Helper.print_scrape_status(len(articles_list))
                                    except Exception as e:
                                        logging.error(f'Error processing article: {e}')
                                        logging.info('Continuing...')
                                except Exception as e:
                                    logging.error(f'Error downloading/parsing article: {e}')
                                    logging.info('Continuing...')
            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}')

if __name__ == '__main__':
    # Read sources from sources.json file
    with open('sources.json', 'r') as file:
        sources = json.load(file)
    
    # Get current date
    news_date = datetime.now().strftime('%Y-%m-%d')
    scraper = Scraper(sources, news_date)
    try:
        articles = scraper.scrape()
        
        # Check if articles were scraped
        if not articles:
            logging.warning('No articles were scraped.')
        else:
            # Convert the articles list to a Pandas DataFrame
            news_df = pd.DataFrame(articles)
            
            # Clean the DataFrame using Helper methods
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)
            
            # Display the cleaned DataFrame
            logging.info(f'Cleaned DataFrame has {len(news_df)} articles.')
            logging.info(news_df.head())  # Display the first few rows
            
            # Optionally save to a CSV file
            news_df.to_csv('cleaned_news_articles.csv', index=False)
            
            # Verify assertions or checks
            assert 'clean_body' in news_df.columns, "Expected 'clean_body' column is missing."
            assert not news_df['clean_body'].str.contains('[0-9]').any(), "Digits should be removed from 'clean_body'."
            
            logging.info('Data cleaning and TF-IDF computation completed successfully.')
            
    except Exception as e:
        logging.error(f'An error occurred during scraping, cleaning, or TF-IDF computation: {e}')


2024-06-23 20:08:08,982 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-23 20:08:09,457 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-23 20:08:09,458 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-23 20:08:09,459 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-23 20:08:09,460 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-23 20:08:09,461 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-23 20:08:09,462 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-23 20:08:09,462 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-23 20:08:09,464 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-23 20:08:09,465 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-23 20:08:09,466 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-23 20:08:09,466 - INFO - Found article with date: 2024-06-13

In [49]:
import feedparser as fp
import dateutil.parser
from newspaper import Article
import logging
import pandas as pd
import json
from datetime import datetime
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Helper:
    @staticmethod
    def print_scrape_status(count):
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        # Function that cleans the article dataframe
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count('\s+').ge(3)]  # keep only titles having more than 3 spaces in the title
        news_df = news_df[news_df.body.str.count('\s+').ge(20)]  # keep only titles having more than 20 spaces in the body
        return news_df
    
    @staticmethod
    def clean_articles(news_df):
        # Function that cleans all the bodies of the articles
        # Drop Duplicates
        news_df = (news_df.drop_duplicates(subset=["title", "source"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["body"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["url"])).sort_index()
        news_df = news_df.reset_index(drop=True)
        
        # Make all letters lower case
        news_df['clean_body'] = news_df['body'].str.lower()
        
        # Filter out the stopwords and punctuation
        stop_words = set(stopwords.words('english'))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        
        # Remove digits
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
        
        # Log contents of clean_body after removing digits
        logging.info("Contents of 'clean_body' after removing digits:")
        for i, body in enumerate(news_df['clean_body'].head(10)):
            logging.info(f'Article {i + 1}: {body}')
        
        # Remove sources
        sources_set = [x.lower() for x in set(news_df['source'])]
        sources_to_replace = dict.fromkeys(sources_set, "")
        news_df['clean_body'] = (news_df['clean_body'].replace(sources_to_replace, regex=True))
        
        # Unidecode all characters
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)
        
        # Tokenize
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)
        
        # Stem words
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: [stemmer.stem(y) for y in x])
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x]))
        
        # Debugging: Print clean_body column to check for digits
        logging.info("Contents of 'clean_body' after cleaning:")
        for i, body in enumerate(news_df['clean_body'].head(10)):
            logging.info(f'Article {i + 1}: {body}')
        
        return news_df

class Scraper:
    def __init__(self, sources, news_date):
        self.sources = sources
        self.news_date = news_date
    
    def scrape(self):
        # Function that scrapes the content from the URLs in the source data
        try:
            articles_list = []
            for source, content in self.sources.items():
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    d = fp.parse(url)
                    for entry in d.entries:
                        if hasattr(entry, 'published'):
                            article_date = dateutil.parser.parse(getattr(entry, 'published'))
                            logging.info(f'Found article with date: {article_date}')
                            if article_date.strftime('%Y-%m-%d') == str(self.news_date):
                                try:
                                    logging.info(f'Processing article: {entry.link}')
                                    content = Article(entry.link)
                                    content.download()
                                    content.parse()
                                    content.nlp()
                                    try:
                                        article = {
                                            'source': source,
                                            'url': entry.link,
                                            'date': article_date.strftime('%Y-%m-%d'),
                                            'time': article_date.strftime('%H:%M:%S %Z'),  # hour, minute, timezone (converted)
                                            'title': content.title,
                                            'body': content.text,
                                            'summary': content.summary,
                                            'keywords': content.keywords,
                                            'image_url': content.top_image
                                        }
                                        articles_list.append(article)
                                        Helper.print_scrape_status(len(articles_list))
                                    except Exception as e:
                                        logging.error(f'Error processing article: {e}')
                                        logging.info('Continuing...')
                                except Exception as e:
                                    logging.error(f'Error downloading/parsing article: {e}')
                                    logging.info('Continuing...')
            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}')

def compute_tfidf(news_df):
    # Function that computes the TFIDF values for all words in the article bodies
    tfidf_matrix = TfidfVectorizer().fit_transform(news_df['clean_body'])
    tfidf_array = np.asarray(tfidf_matrix.todense())  # Convert to numpy array
    return tfidf_array

def find_featured_clusters(clusters):
    # Function that finds clusters with articles from multiple sources
    featured_clusters = {}
    for i in clusters.keys():
        if len(set([j["source"] for j in clusters[i]])) > 1:
            featured_clusters[i] = clusters[i]
    return featured_clusters

if __name__ == '__main__':
    # Read sources from sources.json file
    with open('sources.json', 'r') as file:
        sources = json.load(file)
    
    # Get current date
    news_date = datetime.now().strftime('%Y-%m-%d')
    scraper = Scraper(sources, news_date)
    try:
        articles = scraper.scrape()
        
        # Check if articles were scraped
        if not articles:
            logging.warning('No articles were scraped.')
        else:
            # Convert the articles list to a Pandas DataFrame
            news_df = pd.DataFrame(articles)
            
            # Clean the DataFrame using Helper methods
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)
            
            # Display the cleaned DataFrame
            logging.info(f'Cleaned DataFrame has {len(news_df)} articles.')
            logging.info(news_df.head())  # Display the first few rows
            
            # Optionally save to a CSV file
            news_df.to_csv('cleaned_news_articles.csv', index=False)
            
            # Verify assertions or checks
            assert 'clean_body' in news_df.columns, "Expected 'clean_body' column is missing."
            if news_df['clean_body'].str.contains('[0-9]').any():
                logging.error("Digits should be removed from 'clean_body'.")
                raise ValueError("Digits should be removed from 'clean_body'.")
            
            # Compute TF-IDF
            tfidf_df = compute_tfidf(news_df)
            
            # Clustering
            distance_threshold = 1
            ac = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None).fit(tfidf_df)
            articles_labeled = ac.fit_predict(tfidf_df)
            cluster_count = {}
            for label in range(0, len(set(ac.labels_))):
                cluster_count[label] = np.count_nonzero(articles_labeled == label)
            clusters = {}
            for n in range(0, len(cluster_count), 1):
                indexes = np.argwhere(articles_labeled == max(cluster_count, key=cluster_count.get, default=None)).flatten('C').tolist()
                if len(indexes) < 2:
                    break
                else:
                    clusters[n] = []
                    for i in indexes:
                        clusters[n].append(news_df.iloc[i])
                    cluster_count.pop(max(cluster_count, key=cluster_count.get, default=None))
            
            # Find featured clusters
            featured_clusters = find_featured_clusters(clusters)
            logging.info(f'Found {len(featured_clusters)} featured clusters.')
            
            logging.info('Data cleaning, TF-IDF computation, clustering, and feature extraction completed successfully.')
            
    except Exception as e:
        logging.error(f'An error occurred during scraping, cleaning, TF-IDF computation, or clustering: {e}')


2024-06-23 22:52:56,519 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-23 22:52:57,117 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-23 22:52:57,118 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-23 22:52:57,119 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-23 22:52:57,119 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-23 22:52:57,120 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-23 22:52:57,121 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-23 22:52:57,123 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-23 22:52:57,124 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-23 22:52:57,125 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-23 22:52:57,126 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-23 22:52:57,127 - INFO - Found article with date: 2024-06-13

In [51]:
import feedparser as fp
import dateutil.parser
from newspaper import Article
import logging
import pandas as pd
import json
from datetime import datetime
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Helper:
    @staticmethod
    def print_scrape_status(count):
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        # Function that cleans the article dataframe
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count('\s+').ge(3)]  # keep only titles having more than 3 spaces in the title
        news_df = news_df[news_df.body.str.count('\s+').ge(20)]  # keep only titles having more than 20 spaces in the body
        return news_df
    
    @staticmethod
    def clean_articles(news_df):
        # Function that cleans all the bodies of the articles
        # Drop Duplicates
        news_df = (news_df.drop_duplicates(subset=["title", "source"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["body"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["url"])).sort_index()
        news_df = news_df.reset_index(drop=True)
        
        # Make all letters lower case
        news_df['clean_body'] = news_df['body'].str.lower()
        
        # Filter out the stopwords and punctuation
        stop_words = set(stopwords.words('english'))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        
        # Remove digits
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
        
        # Log contents of clean_body after removing digits
        logging.info("Contents of 'clean_body' after removing digits:")
        for i, body in enumerate(news_df['clean_body'].head(10)):
            logging.info(f'Article {i + 1}: {body}')
        
        # Remove sources
        sources_set = [x.lower() for x in set(news_df['source'])]
        sources_to_replace = dict.fromkeys(sources_set, "")
        news_df['clean_body'] = (news_df['clean_body'].replace(sources_to_replace, regex=True))
        
        # Unidecode all characters
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)
        
        # Tokenize
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)
        
        # Stem words
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: [stemmer.stem(y) for y in x])
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x]))
        
        # Debugging: Print clean_body column to check for digits
        logging.info("Contents of 'clean_body' after cleaning:")
        for i, body in enumerate(news_df['clean_body'].head(10)):
            logging.info(f'Article {i + 1}: {body}')
        
        return news_df

class Scraper:
    def __init__(self, sources, news_date):
        self.sources = sources
        self.news_date = news_date
    
    def scrape(self):
        # Function that scrapes the content from the URLs in the source data
        try:
            articles_list = []
            for source, content in self.sources.items():
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    d = fp.parse(url)
                    for entry in d.entries:
                        if hasattr(entry, 'published'):
                            article_date = dateutil.parser.parse(getattr(entry, 'published'))
                            logging.info(f'Found article with date: {article_date}')
                            if article_date.strftime('%Y-%m-%d') == str(self.news_date):
                                try:
                                    logging.info(f'Processing article: {entry.link}')
                                    content = Article(entry.link)
                                    content.download()
                                    content.parse()
                                    content.nlp()
                                    try:
                                        article = {
                                            'source': source,
                                            'url': entry.link,
                                            'date': article_date.strftime('%Y-%m-%d'),
                                            'time': article_date.strftime('%H:%M:%S %Z'),  # hour, minute, timezone (converted)
                                            'title': content.title,
                                            'body': content.text,
                                            'summary': content.summary,
                                            'keywords': content.keywords,
                                            'image_url': content.top_image
                                        }
                                        articles_list.append(article)
                                        Helper.print_scrape_status(len(articles_list))
                                    except Exception as e:
                                        logging.error(f'Error processing article: {e}')
                                        logging.info('Continuing...')
                                except Exception as e:
                                    logging.error(f'Error downloading/parsing article: {e}')
                                    logging.info('Continuing...')
            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}')

def compute_tfidf(news_df):
    # Function that computes the TFIDF values for all words in the article bodies
    tfidf_matrix = TfidfVectorizer().fit_transform(news_df['clean_body'])
    tfidf_array = np.asarray(tfidf_matrix.todense())  # Convert to numpy array
    return tfidf_array

def find_featured_clusters(clusters):
    # Function that finds clusters with articles from multiple sources
    featured_clusters = {}
    for i in clusters.keys():
        if len(set([j["source"] for j in clusters[i]])) > 1:
            featured_clusters[i] = clusters[i]
    return featured_clusters

if __name__ == '__main__':
    # Read sources from sources.json file
    with open('sources.json', 'r') as file:
        sources = json.load(file)
    
    # Get current date
    news_date = datetime.now().strftime('%Y-%m-%d')
    scraper = Scraper(sources, news_date)
    try:
        articles = scraper.scrape()
        
        # Check if articles were scraped
        if not articles:
            logging.warning('No articles were scraped.')
        else:
            # Convert the articles list to a Pandas DataFrame
            news_df = pd.DataFrame(articles)
            
            # Clean the DataFrame using Helper methods
            news_df = Helper.clean_dataframe(news_df)
            news_df = Helper.clean_articles(news_df)
            
            # Display the cleaned DataFrame
            logging.info(f'Cleaned DataFrame has {len(news_df)} articles.')
            logging.info(news_df.head())  # Display the first few rows
            
            # Optionally save to a CSV file
            news_df.to_csv('cleaned_news_articles.csv', index=False)
            
            # Verify assertions or checks
            assert 'clean_body' in news_df.columns, "Expected 'clean_body' column is missing."
            if news_df['clean_body'].str.contains('[0-9]').any():
                logging.error("Digits should be removed from 'clean_body'.")
                raise ValueError("Digits should be removed from 'clean_body'.")
            
            # Compute TF-IDF
            tfidf_df = compute_tfidf(news_df)
            
            # Clustering
            distance_threshold = 1
            ac = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None).fit(tfidf_df)
            articles_labeled = ac.fit_predict(tfidf_df)
            cluster_count = {}
            for label in range(0, len(set(ac.labels_))):
                cluster_count[label] = np.count_nonzero(articles_labeled == label)
            clusters = {}
            for n in range(0, len(cluster_count), 1):
                indexes = np.argwhere(articles_labeled == max(cluster_count, key=cluster_count.get, default=None)).flatten('C').tolist()
                if len(indexes) < 2:
                    break
                else:
                    clusters[n] = []
                    for i in indexes:
                        clusters[n].append(news_df.iloc[i])
                    cluster_count.pop(max(cluster_count, key=cluster_count.get, default=None))
            
            # Find featured clusters
            featured_clusters = find_featured_clusters(clusters)
            logging.info(f'Found {len(featured_clusters)} featured clusters.')
            
            # Display the clusters and their articles
            for cluster_id, articles in featured_clusters.items():
                logging.info(f'Cluster {cluster_id}:')
                for article in articles:
                    logging.info(f"- {article['title']} ({article['source']})")
    
    except Exception as e:
        logging.error(f'An error occurred during scraping, cleaning, TF-IDF computation, or clustering: {e}')


2024-06-24 05:45:17,168 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss


2024-06-24 05:45:17,465 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-24 05:45:17,467 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-24 05:45:17,468 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-24 05:45:17,469 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-24 05:45:17,469 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-24 05:45:17,470 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-24 05:45:17,471 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-24 05:45:17,472 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-24 05:45:17,474 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-24 05:45:17,474 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-24 05:45:17,475 - INFO - Found article with date: 2024-06-13 18:00:32+00:00
2024-06-24 05:45:17,475 - INFO - Found article with date: 2024-06-13 04:00:1

In [56]:
import feedparser as fp
import dateutil.parser
from newspaper import Article
import logging
import pandas as pd
import json
from datetime import datetime
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import flask
import webbrowser

# Set up logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    
import random

class Helper:
    @staticmethod
    def print_scrape_status(count):
        logging.info(f'Scraped {count} articles so far...')
    
    @staticmethod
    def clean_dataframe(news_df):
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count('\s+').ge(3)]
        news_df = news_df[news_df.body.str.count('\s+').ge(20)]
        return news_df
    
    @staticmethod
    def clean_articles(news_df):
        news_df = (news_df.drop_duplicates(subset=["title", "source"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["body"])).sort_index()
        news_df = (news_df.drop_duplicates(subset=["url"])).sort_index()
        news_df = news_df.reset_index(drop=True)
        
        news_df['clean_body'] = news_df['body'].str.lower()
        
        stop_words = set(stopwords.words('english'))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
        
        logging.info("Contents of 'clean_body' after removing digits:")
        for i, body in enumerate(news_df['clean_body'].head(10)):
            logging.info(f'Article {i + 1}: {body}')
        
        sources_set = [x.lower() for x in set(news_df['source'])]
        sources_to_replace = dict.fromkeys(sources_set, "")
        news_df['clean_body'] = (news_df['clean_body'].replace(sources_to_replace, regex=True))
        
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)
        
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: [stemmer.stem(y) for y in x])
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x]))
        
        logging.info("Contents of 'clean_body' after cleaning:")
        for i, body in enumerate(news_df['clean_body'].head(10)):
            logging.info(f'Article {i + 1}: {body}')
        
        return news_df
    
    @staticmethod
    def shuffle_content(clusters_dict):
        for cluster in clusters_dict.values():
            random.shuffle(cluster)
    
    @staticmethod
    def prettify_similar(clusters_dict):
        similar_articles = {}
        for cluster_id, articles in clusters_dict.items():
            similar_articles[cluster_id] = {
                'source': [article['source'] for article in articles],
                'title': [article['title'] for article in articles],
                'url': [article['url'] for article in articles],
                'image_url': [article['image_url'] for article in articles]
            }
        return similar_articles


class Scraper:
    def __init__(self, sources, news_date):
        self.sources = sources
        self.news_date = news_date
    
    def scrape(self):
        # Function that scrapes the content from the URLs in the source data
        try:
            articles_list = []
            for source, content in self.sources.items():
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    d = fp.parse(url)
                    for entry in d.entries:
                        if hasattr(entry, 'published'):
                            article_date = dateutil.parser.parse(getattr(entry, 'published'))
                            logging.info(f'Found article with date: {article_date}')
                            if article_date.strftime('%Y-%m-%d') == str(self.news_date):
                                try:
                                    logging.info(f'Processing article: {entry.link}')
                                    content = Article(entry.link)
                                    content.download()
                                    content.parse()
                                    content.nlp()
                                    try:
                                        article = {
                                            'source': source,
                                            'url': entry.link,
                                            'date': article_date.strftime('%Y-%m-%d'),
                                            'time': article_date.strftime('%H:%M:%S %Z'),  # hour, minute, timezone (converted)
                                            'title': content.title,
                                            'body': content.text,
                                            'summary': content.summary,
                                            'keywords': content.keywords,
                                            'image_url': content.top_image
                                        }
                                        articles_list.append(article)
                                        Helper.print_scrape_status(len(articles_list))
                                    except Exception as e:
                                        logging.error(f'Error processing article: {e}')
                                        logging.info('Continuing...')
                                except Exception as e:
                                    logging.error(f'Error downloading/parsing article: {e}')
                                    logging.info('Continuing...')
            return articles_list
        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}')

def compute_tfidf(news_df):
    # Function that computes the TFIDF values for all words in the article bodies
    tfidf_matrix = TfidfVectorizer().fit_transform(news_df['clean_body'])
    tfidf_array = np.asarray(tfidf_matrix.todense())  # Convert to numpy array
    return tfidf_array

def find_featured_clusters(clusters):
    # Function that finds clusters with articles from multiple sources
    featured_clusters = {}
    for i in clusters.keys():
        if len(set([j["source"] for j in clusters[i]])) > 1:
            featured_clusters[i] = clusters[i]
    return featured_clusters

class Processor:
    @staticmethod
    def find_clusters(news_df, tfidf_df, distance_threshold=1):
        # Function that finds clusters of similar articles within a dataframe of articles
        ac = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None).fit(tfidf_df)
        articles_labeled = ac.fit_predict(tfidf_df)
        cluster_count = {}
        for label in range(0, len(set(ac.labels_))):
            cluster_count[label] = np.count_nonzero(articles_labeled == label)
        clusters = {}
        for n in range(0, len(cluster_count), 1):
            indexes = np.argwhere(articles_labeled == max(cluster_count, key=cluster_count.get, default=None)).flatten('C').tolist()
            if len(indexes) < 2:
                break
            else:
                clusters[n] = []
                for i in indexes:
                    clusters[n].append(news_df.iloc[i])
                cluster_count.pop(max(cluster_count, key=cluster_count.get, default=None))
        return clusters

import flask
import webbrowser

def build_html(clusters_dict, news_name, news_date, template_filename, output_filename):
    # Initialize Flask app
    newsletter = flask.Flask('newsletter')
    
    # Shuffle and prettify the clusters' content
    Helper.shuffle_content(clusters_dict)
    similar_articles = Helper.prettify_similar(clusters_dict)  
    
    # Load the template file
    with open(template_filename, 'r') as file:
        template_content = file.read()
    
    # Use Flask to render the template with the data
    with newsletter.app_context():
        rendered = flask.render_template_string(template_content,
                                                news_name=news_name,
                                                news_date=news_date,
                                                clusters_dict=clusters_dict,
                                                similar_articles=similar_articles)
    
    # Write the rendered HTML to the output file
    with open(output_filename, 'w', encoding="utf-8") as output:
        output.write(rendered)
    
    # Open the rendered HTML file in a new browser tab
    webbrowser.open_new_tab(output_filename)
    
    return True

if __name__ == "__main__":
    try:
        sources = json.load(open('sources.json'))
        news_date = datetime.strptime('2024-06-23', '%Y-%m-%d').date()
        scraper = Scraper(sources, news_date)
        articles_list = scraper.scrape()

        news_df = pd.DataFrame(articles_list)
        logging.info(f'Scraped {len(news_df)} articles.')
        
        news_df = Helper.clean_dataframe(news_df)
        news_df = Helper.clean_articles(news_df)
        
        tfidf_df = compute_tfidf(news_df)
        
        processor = Processor()
        clusters = processor.find_clusters(news_df, tfidf_df)
        
        featured_clusters = find_featured_clusters(clusters)
        
        for cluster_id, articles in featured_clusters.items():
            logging.info(f'Cluster {cluster_id}:')
            for article in articles:
                logging.info(f"- {article['title']} ({article['source']})")
        
        # Render the HTML newsletter
        build_html(featured_clusters, 'Daily News', news_date, 'newsletter.html', 'output_newsletter.html')
    
    except Exception as e:
        logging.error(f'An error occurred during scraping, cleaning, TF-IDF computation, clustering, or HTML rendering: {e}')


2024-06-24 07:00:30,054 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss
2024-06-24 07:00:30,269 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-24 07:00:30,270 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-24 07:00:30,271 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-24 07:00:30,272 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-24 07:00:30,272 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-24 07:00:30,273 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-24 07:00:30,273 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-24 07:00:30,275 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-24 07:00:30,276 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-24 07:00:30,277 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-24 07:00:30,277 - INFO - Found article with date: 2024-06-13

In [None]:
#--------------------------------------------------------------------------------------------------------------------------------

In [80]:
from datetime import datetime, timedelta
import json
import logging
import pandas as pd
import flask
import webbrowser
import feedparser as fp
from newspaper import Article
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import AgglomerativeClustering

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class Helper:
    @staticmethod
    def print_scrape_status(count):
        logging.info(f'Scraped {count} articles so far...')

    @staticmethod
    def clean_dataframe(news_df):
        news_df = news_df[news_df.title != '']
        news_df = news_df[news_df.body != '']
        news_df = news_df[news_df.image_url != '']
        news_df = news_df[news_df.title.str.count('\s+').ge(3)]
        news_df = news_df[news_df.body.str.count('\s+').ge(20)]
        return news_df

    @staticmethod
    def clean_articles(news_df):
        news_df = news_df.drop_duplicates(subset=["title", "source"]).reset_index(drop=True)
        news_df['clean_body'] = news_df['body'].str.lower()
        stop_words = set(stopwords.words('english'))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
        news_df['clean_body'] = news_df['clean_body'].apply(unidecode)
        news_df['clean_body'] = news_df['clean_body'].apply(word_tokenize)
        stemmer = SnowballStemmer(language='english')
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: [stemmer.stem(y) for y in x])
        news_df['clean_body'] = news_df['clean_body'].apply(lambda x: ' '.join([word for word in x]))
        return news_df

    @staticmethod
    def shuffle_content(clusters_dict):
        for cluster in clusters_dict.values():
            random.shuffle(cluster)

    @staticmethod
    def prettify_similar(clusters_dict):
        similar_articles = {}
        for cluster_id, articles in clusters_dict.items():
            similar_articles[cluster_id] = {
                'source': [article['source'] for article in articles],
                'title': [article['title'] for article in articles],
                'url': [article['url'] for article in articles],
                'image_url': [article['image_url'] for article in articles]
            }
        return similar_articles

class Scraper:
    def __init__(self, sources, news_date_start, news_date_end, keyword):
        self.sources = sources
        self.news_date_start = news_date_start
        self.news_date_end = news_date_end
        self.keyword = keyword.lower()

    def scrape(self):
        try:
            articles_list = []
            articles_count = 0
            for source, content in self.sources.items():
                for url in content['rss']:
                    logging.info(f'Processing RSS feed: {url}')
                    d = fp.parse(url)
                    for entry in d.entries:
                        if hasattr(entry, 'published'):
                            article_date = dateutil.parser.parse(getattr(entry, 'published'))
                            logging.info(f'Found article with date: {article_date}')
                            if self.news_date_start <= article_date.date() <= self.news_date_end:
                                try:
                                    logging.info(f'Processing article: {entry.link}')
                                    content = Article(entry.link)
                                    content.download()
                                    content.parse()
                                    content.nlp()
                                    title = content.title.lower()
                                    body = content.text.lower()
                                    if self.keyword in title or self.keyword in body:
                                        article = {
                                            'source': source,
                                            'url': entry.link,
                                            'date': article_date.strftime('%Y-%m-%d'),
                                            'time': article_date.strftime('%H:%M:%S %Z'),
                                            'title': content.title,
                                            'body': content.text,
                                            'summary': content.summary,
                                            'keywords': content.keywords,
                                            'image_url': content.top_image
                                        }
                                        articles_list.append(article)
                                        articles_count += 1
                                        Helper.print_scrape_status(articles_count)
                                except Exception as e:
                                    logging.error(f'Error processing article: {e}')
                                    logging.info('Continuing...')
            return articles_list

        except Exception as e:
            logging.error(f'Error in "Scraper.scrape()": {e}')
            raise Exception(f'Error in "Scraper.scrape()": {e}')

def compute_tfidf(news_df):
    tfidf_matrix = TfidfVectorizer().fit_transform(news_df['clean_body'])
    tfidf_array = np.asarray(tfidf_matrix.todense())
    return tfidf_array

def find_featured_clusters(clusters):
    featured_clusters = {}
    for i, cluster in clusters.items():
        if len(set([article['source'] for article in cluster])) > 1:
            featured_clusters[i] = cluster
    return featured_clusters

def find_clusters(news_df, tfidf_df, distance_threshold=1):
    if len(news_df) < 2:
        logging.warning('Insufficient articles for clustering. Featuring all articles.')
        return {0: news_df.to_dict('records')}
    
    ac = AgglomerativeClustering(distance_threshold=distance_threshold, n_clusters=None).fit(tfidf_df)
    articles_labeled = ac.fit_predict(tfidf_df)
    
    cluster_count = {}
    for label in range(max(articles_labeled) + 1):
        cluster_count[label] = np.count_nonzero(articles_labeled == label)
    
    clusters = {}
    for n in range(max(cluster_count) + 1):
        indexes = np.argwhere(articles_labeled == n).flatten().tolist()
        clusters[n] = []
        for i in indexes:
            clusters[n].append(news_df.iloc[i].to_dict())
    
    return clusters

def create_summary_table(news_df):
    summary = pd.DataFrame(news_df['source'].value_counts()).reset_index()
    summary.columns = ['Source', 'Number of Articles']
    return summary

if __name__ == "__main__":
    try:
        sources = json.load(open('sources.json'))
        today = datetime.now().date()
        news_date_start = today - timedelta(days=1)  # Scraping articles from the past week
        news_date_end = today
        keyword = 'boeing'
        
        scraper = Scraper(sources, news_date_start, news_date_end, keyword)
        articles_list = scraper.scrape()
        news_df = pd.DataFrame(articles_list)
        
        logging.info(f'Scraped {len(news_df)} articles from {news_date_start} to {news_date_end}.')
        summary_table = create_summary_table(news_df)
        logging.info('\nSummary Table:\n' + summary_table.to_string(index=False))
        
        news_df = Helper.clean_dataframe(news_df)
        news_df = Helper.clean_articles(news_df)
        tfidf_df = compute_tfidf(news_df)

        # Check if enough articles for clustering
        if len(news_df) < 2:
            logging.warning('Insufficient articles for clustering. Featuring all articles.')
            featured_clusters = {0: news_df.to_dict('records')}
        else:
            # Perform clustering
            clusters = find_clusters(news_df, tfidf_df)
            featured_clusters = find_featured_clusters(clusters)

        for cluster_id, articles in featured_clusters.items():
            logging.info(f'Cluster {cluster_id}:')
            for article in articles:
                logging.info(f"- {article['title']} ({article['source']})")

        build_html(featured_clusters, 'Daily News', today, 'newsletter.html', 'output_newsletter.html')
    
    except Exception as e:
        logging.error(f'An error occurred during scraping, cleaning, TF-IDF computation, clustering, or HTML rendering: {e}')


2024-06-24 13:37:13,418 - INFO - Processing RSS feed: http://rss.cnn.com/rss/cnn_latest.rss.html
2024-06-24 13:37:13,770 - INFO - Found article with date: 2024-06-20 13:13:16+00:00
2024-06-24 13:37:13,771 - INFO - Found article with date: 2024-06-19 13:14:44+00:00
2024-06-24 13:37:13,772 - INFO - Found article with date: 2024-06-19 11:47:42+00:00
2024-06-24 13:37:13,773 - INFO - Found article with date: 2024-06-19 00:25:07+00:00
2024-06-24 13:37:13,773 - INFO - Found article with date: 2024-06-18 01:42:44+00:00
2024-06-24 13:37:13,774 - INFO - Found article with date: 2024-06-14 12:48:09+00:00
2024-06-24 13:37:13,775 - INFO - Found article with date: 2024-06-14 03:30:12+00:00
2024-06-24 13:37:13,775 - INFO - Found article with date: 2024-06-14 01:47:04+00:00
2024-06-24 13:37:13,776 - INFO - Found article with date: 2024-06-13 21:20:54+00:00
2024-06-24 13:37:13,777 - INFO - Found article with date: 2024-06-13 20:09:29+00:00
2024-06-24 13:37:13,779 - INFO - Found article with date: 2024-