# CNN News Aggregator
## BHT Workflows project

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import plotly.graph_objects as go
from textblob import TextBlob
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import string
import nltk
import re

In [2]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# data for word_tokenize and the 'stopwords' corpus read via
# stopwords.words('english') in preprocess_text.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/fiddle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/fiddle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Empty table that save_article() fills, one row per scraped article.
# Column order matters: save_article() assigns values positionally.
ARTICLE_COLUMNS = [
    'Title',
    'Topic',
    'Content',
    'Source',
    'Polarity score',
    'Subjectivity score',
]
df_articles = pd.DataFrame(columns=ARTICLE_COLUMNS)

In [4]:
def retrieve_webpage(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, freezing the whole scraping loop.

    Returns:
        A ``BeautifulSoup`` tree built with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout elapses.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

In [5]:
def read_article(article_html):
    """Extract the headline and body text from a parsed article page.

    Args:
        article_html: Parsed page exposing ``find`` and ``find_all``
            (a ``BeautifulSoup`` tree in this notebook).

    Returns:
        ``(title, texts)``: the text of the first ``<h1>`` with newlines and
        double spaces removed, and the concatenated text of every
        ``<p class="paragraph">`` with double spaces removed. ``title`` is an
        empty string when the page has no ``<h1>``, and ``texts`` is empty
        when no matching paragraph exists.
    """
    heading = article_html.find('h1')
    if heading is None:
        # find() returns None when the tag is absent; calling get_text() on
        # it would raise AttributeError and abort the caller's scraping loop.
        title = ""
    else:
        title = heading.get_text().replace("\n", "").replace("  ", "")

    paragraphs = article_html.find_all('p', class_='paragraph')
    texts = "".join(paragraph.get_text().replace("  ", "") for paragraph in paragraphs)
    return title, texts

In [6]:
def save_article(title, topic, content, src, pol_score, sbj_score):
    """Store one article as a new row of the global ``df_articles``.

    The values are written positionally, so they must follow the frame's
    column order. The row is stored under the label ``len(df_articles)``.
    """
    next_label = len(df_articles)
    df_articles.loc[next_label] = [title, topic, content, src, pol_score, sbj_score]

In [7]:
def preprocess_text(text):
    """Lower-case and tokenize ``text``, dropping stop words and punctuation.

    Args:
        text: Raw article text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words
        # Drop a token only when every character is punctuation. The previous
        # `token not in string.punctuation` was a substring test against the
        # punctuation string, so multi-character tokens such as "``", "''",
        # "..." or "--" were kept.
        and not all(char in punctuation_chars for char in token)
    ]

    return ' '.join(tokens)

In [8]:
def analyze_sentiment(text):
    """Return ``(polarity, subjectivity)`` for ``text``.

    Polarity blends TextBlob's polarity with a keyword tally. Each
    space-separated word is lower-cased and looked up in hand-picked
    vocabularies: "disturbing" and "negative" words lower the tally,
    "positive" and "optimistic" words raise it, and a "neutral" word shrinks
    the running tally by a tenth. The blend is squashed with ``tanh``, so the
    polarity lies in [-1, 1]. Subjectivity is TextBlob's value, unmodified.
    """
    blob = TextBlob(text)

    # (step, vocabulary): how far a single occurrence moves the tally.
    weighted_vocabularies = (
        # disturbing
        (-0.1, (
            "apocalypse", "disaster", "crisis", "catastrophe", "threat",
            "terror", "violence", "fear", "destruction", "chaos", "danger",
            "terrible", "atrocious", "calamity", "tragedy", "carnage", 'murder',
            "war", "menace", "critical",
        )),
        # negative
        (-0.05, (
            "despair", "suffering", "misery", "anguish", "pain",
            "grief", "sadness", "melancholy", "depression", "anxiety",
            "issue", "problem", "bad", "vulnerable", "sad", "concern",
            "concerning", "worried",
        )),
        # positive
        (0.05, (
            "hope", "comfort", "relief", "contentment", "satisfaction",
            "joy", "happiness", "love", "excitement", "enthusiasm", "good",
            "agreement", "effort", "happy", "hopeful", "proud",
        )),
        # optimistic
        (0.1, (
            "potential", "prosperity", "success", "wonderful", "perfect",
            "achievement", "growth", "fulfillment", "bliss", "euphoria", "peace",
            "extasis", "successful", "greatness", "incredible",
        )),
    )

    # Neutral words have no fixed step; they damp the running tally instead.
    neutral_words = {
        "routine", "commonplace", "ordinary", "standard", "typical",
        "usual", "unremarkable", "mundane", "average", "common", "boring",
    }

    # Flatten the vocabularies into one word -> step lookup table.
    step_for = {}
    for step, vocabulary in weighted_vocabularies:
        for keyword in vocabulary:
            step_for.setdefault(keyword, step)

    ignorable = string.punctuation + "”“’–"

    custom_score = 0
    for raw_word in text.split(" "):
        # Skip punctuation-only pieces (and the empty strings produced by
        # consecutive spaces) before matching keywords.
        if raw_word in ignorable:
            continue
        word = raw_word.lower()
        if word in step_for:
            custom_score += step_for[word]
        elif word in neutral_words:
            custom_score -= custom_score / 10

    sentiment = blob.sentiment
    # TextBlob's polarity is weighted by 0.25 against the keyword tally, and
    # the factor 10 steepens the tanh squashing.
    sentiment_polarity = np.tanh(10 * (0.25 * sentiment.polarity + custom_score))
    sentiment_subjectivity = sentiment.subjectivity

    return sentiment_polarity, sentiment_subjectivity

In [9]:
# Matches a "20YY/MM/DD/" date prefix in an article path; it is stripped from
# links below so that the first remaining path segment can serve as the topic.
link_date_regex = r"20[0-9][0-9](\/[0-9][0-9]){2}\/"

# Front page to scrape; article links are appended to this base URL.
url = 'https://edition.cnn.com/'
soup = retrieve_webpage(url)

In [10]:
# Lead story of the front page.
headline = soup.find('a', class_="container_lead-package__title-url")

# Per-topic article counter; the article loop in the next cell keeps adding to it.
# It is created before any check so it exists even when there is no lead story.
topics = {}

# soup.find() returns None when the page has no lead-package anchor, and the
# anchor may carry no href; in either case skip the lead story instead of
# failing on None.
href = headline.get_attribute_list("href")[0] if headline is not None else None

if href:
    link = href[1:]  # drop the leading "/" because `url` already ends with one
    topic = re.sub(link_date_regex, "", link).split("/")[0]
    full_link = f'{url}{link}'

    if full_link not in df_articles['Source'].values:
        topics[topic] = 1
        title, content = read_article(retrieve_webpage(full_link))
        polarity_score, subjectivity_score = analyze_sentiment(preprocess_text(content))
        save_article(title, topic, content, full_link, polarity_score, subjectivity_score)

In [11]:
# Every remaining article link on the front page.
articles = soup.find_all('a', class_='container__link--type-article')

for article in articles:
    href = article.get_attribute_list("href")[0]
    # Only site-relative links ("/2024/07/04/politics/...") can be combined
    # with `url`. A missing href would raise TypeError on slicing, and an
    # absolute one would be mangled into a bogus address, so skip both.
    if not href or not href.startswith("/"):
        continue

    link = href[1:]  # drop the leading "/" because `url` already ends with one
    topic = re.sub(link_date_regex, "", link).split("/")[0]
    full_link = f'{url}{link}'

    # Articles already stored (e.g. linked twice on the page) are not fetched again.
    if full_link in df_articles['Source'].values:
        continue

    topics[topic] = topics.get(topic, 0) + 1
    title, content = read_article(retrieve_webpage(full_link))
    polarity_score, subjectivity_score = analyze_sentiment(preprocess_text(content))
    save_article(title, topic, content, full_link, polarity_score, subjectivity_score)


In [12]:
# Five articles with the lowest polarity score, most negative first.
df_articles.sort_values(by="Polarity score").head(5)

Unnamed: 0,Title,Topic,Content,Source,Polarity score,Subjectivity score
10,Biden tries but fails to turn back rising tide...,politics,\nNo president has ever needed a public holida...,https://edition.cnn.com/2024/07/04/politics/bi...,-1.0,0.421136
18,Fears mount over election-linked violence in F...,europe,\nConcerns are growing about political violenc...,https://edition.cnn.com/2024/07/04/europe/fear...,-1.0,0.412701
42,The ‘world’s largest mammal migration’ happens...,travel,"\nFrom an aerial view, golden-brown specks cov...",https://edition.cnn.com/travel/south-sudan-mig...,-0.999994,0.429573
63,Here’s what voters had to say following the fi...,politics,\nLarry Malinconico went to bed worried about ...,https://edition.cnn.com/2024/06/29/politics/vo...,-0.999988,0.453788
34,This creative capital city deserves a lot more...,travel,\nThis dynamic city on the James River deserve...,https://edition.cnn.com/travel/richmond-virgin...,-0.999956,0.392109


In [13]:
# Count articles per topic directly from df_articles. The `topics` dict built
# while scraping is rebound to an array by the later per-topic charts and is
# reset when the headline cell is re-run, so relying on it makes this cell
# fail or plot nothing on re-execution.
topic_counts = {}
for topic_name in df_articles['Topic']:
    topic_counts[topic_name] = topic_counts.get(topic_name, 0) + 1

# Most frequent topic first; ties keep their order of first appearance.
sorted_topics = dict(sorted(topic_counts.items(), key=lambda item: item[1], reverse=True))

words = list(sorted_topics.keys())
counts = list(sorted_topics.values())

fig = go.Figure(data=[go.Bar(x=words, y=counts, text=counts, textposition='outside')])

fig.update_layout(
    title="Today's topics on CNN front page",
    xaxis_title='Topics',
    yaxis_title='Occurrences',
    template='plotly_dark'
)

fig.show()

In [14]:
# Mean polarity score per topic, in order of first appearance.
# The topic labels get their own name: rebinding `topics` here would replace
# the per-topic counter dict built while scraping with an array and break any
# re-run of the cell that reads it via `topics.items()`.
topic_names = df_articles['Topic'].unique()
average_sentiment_scores = [
    df_articles[df_articles["Topic"] == topic_name]["Polarity score"].mean()
    for topic_name in topic_names
]

fig = go.Figure(data=[go.Bar(
    x=topic_names,
    y=average_sentiment_scores,
    text=["%.3f" % value for value in average_sentiment_scores],
    textposition='outside',
)])
fig.update_layout(
    title="Average sentiment polarity score per topic",
    xaxis_title='Topics',
    yaxis_title='Mean value',
    template='plotly_dark'
)

fig.show()

In [15]:
# Mean subjectivity score per topic, in order of first appearance.
# The topic labels get their own name: rebinding `topics` here would replace
# the per-topic counter dict built while scraping with an array and break any
# re-run of the cell that reads it via `topics.items()`.
topic_names = df_articles['Topic'].unique()
average_sentiment_scores = [
    df_articles[df_articles["Topic"] == topic_name]["Subjectivity score"].mean()
    for topic_name in topic_names
]

fig = go.Figure(data=[go.Bar(
    x=topic_names,
    y=average_sentiment_scores,
    text=["%.3f" % value for value in average_sentiment_scores],
    textposition='outside',
)])

fig.update_layout(
    title="Average sentiment subjectivity score per topic",
    xaxis_title='Topics',
    yaxis_title='Mean value',
    template='plotly_dark'
)

fig.show()