In [34]:
# import packages
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from textblob import TextBlob

In [3]:
# load the article addresses; the csv is read without a header row and
# its single column is labelled "url"
list_good = pd.read_csv("./url_list.csv", header=None, names=["url"])

In [4]:
# function to read articles and calculate sentiment
def read_it(url):
    '''
    Download one CBC COVID article and score its text with TextBlob.

    input must be a url for a CBC COVID article
    custom function based on the format of the read in csv file

    Parameters
    ----------
    url : str
        Address of the article page to fetch.

    Returns
    -------
    list
        [date, url, content, sentiment, polarity] where
        * date      - value of the ``datetime`` attribute of the first
                      <time> tag on the page
        * url       - the url that was passed in
        * content   - all kept paragraphs joined with single spaces
                      (this is the full text, including anything after
                      "READ MORE")
        * sentiment - first value of TextBlob's ``.sentiment`` tuple
        * polarity  - second value of TextBlob's ``.sentiment`` tuple

        NOTE: TextBlob documents ``.sentiment`` as
        ``(polarity, subjectivity)``, so the slot historically called
        "sentiment" holds the polarity score and the slot called
        "polarity" holds the subjectivity score. The positions are kept
        as they were so existing callers and saved files stay compatible.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    TypeError
        If the page has no <time> tag to read the date from.
    '''
    # fetch the page; the timeout keeps one unresponsive server from
    # hanging the whole crawl, and an error status is reported instead of
    # scraping an error page as if it were an article
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    bs_content = BeautifulSoup(r.content, 'html')

    # read the date
    date_article = bs_content.time['datetime']

    # compile the paragraphs into a single string, skipping any paragraph
    # that contains an <em> tag (a header note related to the article);
    # joining once after the loop also means a page with no usable
    # paragraphs yields an empty string instead of an undefined variable
    list_paragraphs = [
        p.get_text() for p in bs_content.find_all('p') if p.em is None
    ]
    final_article = " ".join(list_paragraphs)

    # only the text above "READ MORE" is scored; find() returns -1 when
    # the marker is absent, and slicing with -1 would silently drop the
    # last character, so in that case the whole article is scored
    end_location = final_article.find("READ MORE")
    if end_location == -1:
        scored_text = final_article
    else:
        scored_text = final_article[:end_location]

    # TextBlob returns (polarity, subjectivity) in that order
    article_polarity, article_subjectivity = TextBlob(scored_text).sentiment

    return [date_article, url, final_article, article_polarity, article_subjectivity]

In [5]:
# scrape every address in the url column, keeping one result list per article
articles_read = [read_it(url) for url in list_good['url']]

In [28]:
# assemble the scraped rows into a table ordered by article timestamp,
# renumbering the index after the sort
# NOTE(review): read_it unpacks TextBlob's .sentiment, which TextBlob
# documents as (polarity, subjectivity) - so the 'sentiment' column likely
# holds polarity and the 'polarity' column subjectivity; confirm before
# relying on the labels
compiled = (
    pd.DataFrame(
        articles_read,
        columns=['datetime', 'url', 'content', 'sentiment', 'polarity'],
    )
    .sort_values(by='datetime', ignore_index=True)
)

In [29]:
# display the compiled article table (the notebook shows a truncated view)
compiled

Unnamed: 0,datetime,url,content,sentiment,polarity
0,2020-03-11T21:38:53.177Z,https://www.cbc.ca/news/canada/british-columbi...,Provincial health officer Dr. Bonnie Henry ann...,0.109919,0.418379
1,2020-03-13T00:52:12.325Z,https://www.cbc.ca/news/canada/british-columbi...,Provincial health authorities in B.C. are reco...,0.150404,0.430814
2,2020-03-13T23:57:07.711Z,https://www.cbc.ca/news/canada/british-columbi...,"B.C. has detected 11 new cases of COVID-19, br...",0.142584,0.411611
3,2020-03-15T16:25:02.304Z,https://www.cbc.ca/news/canada/british-columbi...,Provincial health officer Dr. Bonnie Henry and...,0.145793,0.419290
4,2020-03-16T02:52:05.705Z,https://www.cbc.ca/news/canada/british-columbi...,The province had 27 coronavirus cases last Sun...,0.149991,0.439431
...,...,...,...,...,...
86,2020-06-26T22:48:12.330Z,https://www.cbc.ca/news/canada/british-columbi...,Provincial Health Officer Dr. Bonnie Henry and...,0.135660,0.407425
87,2020-06-29T23:17:45.598Z,https://www.cbc.ca/news/canada/british-columbi...,"A total of 2,904 cases of COVID-19 have been c...",0.057821,0.426577
88,2020-07-01T00:14:14.976Z,https://www.cbc.ca/news/canada/british-columbi...,Residents living in long-term care and assiste...,0.219659,0.508251
89,2020-07-02T23:21:27.848Z,https://www.cbc.ca/news/canada/british-columbi...,Provincial Health Officer Dr. Bonnie Henry ann...,0.026509,0.465242


In [14]:
# write the compiled table to disk with a header row and without the
# dataframe index column
compiled.to_csv(path_or_buf="./articles.csv", header=True, index=False)