In [45]:
#  CSC6740 Term Project
#  Georgia State University - Fall 2020
#  William Keith Dobson
#  Version 1.0
#  News story Sentiment analyzer 
#  Loads scraped RSS news feed pickle files then downloads news article text for stories that include the user 
#  specified search term(s) in its title.  It then cleans each story text and computes sentiment polarity and 
#  subjectivity of the reporting which is stored using the same filename with a "sent-" prefix added

# Library dependencies
import requests
#from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import pickle
import datetime 
import re
import string

filelist = [ \
            "news_all_2020-11-06.p", \
            "news_all_2020-11-05.p", \
            "news_all_2020-11-04.p", \
            "news_all_2020-11-03.p", \
            "news_all_2020-11-02.p", \
            "news_all_2020-11-01.p", \
            "news_all_2020-10-31.p", \
            "news_all_2020-10-30.p", \
            "news_all_2020-10-29.p", \
            "news_all_2020-10-28.p", \
            "news_all_2020-10-27.p", \
            "news_all_2020-10-26.p", \
            "news_all_2020-10-25.p", \
            "news_all_2020-10-24.p", \
            "news_all_2020-10-22.p", \
            "news_all_2020-10-21.p", \
           ]

# Function to display news items library
#  uses flag parameter to limit output 
#  0 = just news sources
#  1 = news source + title
#  2 = news source + title + description
#  3 = news source + title + description + text
def displayNews(news_items, flag):
    for item in news_items:
        print("----"+item['key']+"----")
        if(flag >= 1):
            print("[TITLE] "+item["key"]+" - "+item["title"])
        if(flag >= 2):
            print("[DESC] "+item["description"]) 
        if(flag >= 3 and len(item["text"]) > 100):
            print("[TEXT] "+item["text"])
    return

import copy

# Function to combine the descriptions and text from multiple stories from the same source
# into single text strings.
def combineNews(news_items):
    cnt = 0
    prev_item = copy.deepcopy(news_items[0])
    new_item = copy.deepcopy(news_items[0])
    cnews_items = []
    for item in news_items:
        new_item['key'] = item['key']
        new_item['polarity'] = item['polarity']            
        new_item['subjectivity'] = item['subjectivity'] 
 
        if item['key'] == prev_item['key']:
            new_item['title'] = new_item['title'] + ' '+item['title'].strip()
            new_item['description'] = new_item['description'] + ' '+item['description'].strip() 
            new_item['text'] = new_item['text'] + ' '+item['text'].strip() 
        else:
            new_item['title'] = item['title'].strip()
            new_item['description'] = item['description'].strip() 
            new_item['text'] = item['text'].strip() 
            if cnt > 0 :
                cnews_items.append(prev_item)
            
        prev_item = copy.deepcopy(new_item)
        cnt += 1       
    cnews_items.append(prev_item)  # get last combined item
    return cnews_items

# Function to convert text string to lower case then remove punctuation and numbers
def cleanPunctNum(text):
    #Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('[^A-Za-z ]+', ' ', text)   # remove any other special characters
    return text


from newspaper import Article    # this is a better tool for capturing article text bodies than beautiful soup

# Function to scrape the story text from a news webpage using newspaper library 
def scrapeText(link):
    try: 
        test_article = Article(link, language="en", timeout = 10)
        test_article.download()
        test_article.parse()
    except:
        pass
    
    text = test_article.text
    return text

# Function to pickle news_items list    
def saveNewsItems(news_items, fname):    
    # save the news items
    pickle.dump(news_items, open(fname, "wb"))
    print("==== File:   " + fname + " ... Pickled with "+str(len(news_items))+" stories written.")
    return


In [57]:

# Get user search terms list
inputstring = input("Enter RSS filenames to mine (* = use hardcoded list): ")
fnames = inputstring.split()

if fnames[0] != '*':
    filelist = fnames

# Get user search terms list
inputstring = input("Enter second level title search terms separated by spaces (* for all): ")
searchtermlist = inputstring.split()


Enter RSS filenames to mine (* = use hardcoded list):  news_all_2020-11-06.p
Enter second level title search terms separated by spaces (* for all):  election


In [58]:
#  Main Program Body

print("Be patient scraping article bodies takes time...")
save_items = []

# Load pickled files and mine data
for fname in filelist:
    news_items = {}
    filehandler = open(fname, 'rb') 
    news_items = pickle.load(filehandler)
    print("====== File: "+fname+" opened for mining ====== ")
    # Now combine story headlines and descriptions from all stories from each source into 
    # a single text string for analysis
    #cnews_items = combineNews(news_items)
    #displayNews(cnews_items, 2)
    
    ftmp = fname.split('.p')
    filedate = ftmp[0][-10:]

    # Clean all news items.  Note that only the attributes needed are put into the new data objects.
    cleaned_cnews_items = []
    # save_items = []
    for item in news_items:
        for sterm in searchtermlist:
            tmpstr = item['title'].lower()
                
            if sterm == "*" or tmpstr.find(sterm.lower()) > 0:
                if len(item['text']) < 10:      # if article body text not already retrieved get it
                    item['text'] = scrapeText(item['link'])
                    
                item['date'] = filedate                           # added for later merging of stories
                    
                cleaned_item = {}
                cleaned_item['key'] = item['key']
                cleaned_item['date'] = filedate
                cleaned_item['title'] = item['title']
                cleaned_item['link'] = item['link']
                cleaned_item['description'] = item['description']
                cleaned_item['text'] = cleanPunctNum(item['text']).strip() 
                cleaned_item['polarity'] = item['polarity']
                cleaned_item['subjectivity'] = item['subjectivity']
                
                if len(item['text']) >= 700 :                 # only save stories with "enough" accessible text for mining
                    save_items.append(item)                   # save copy of the raw uncleaned items that match search criterion
                    cleaned_cnews_items.append(cleaned_item)  # save cleaned text for sentiment mining
                    print("item: "+item['key']+" added len = "+str(len(item['text'])))
                break
        
    # Put cleaned data into a pandas dataframe which works better with sklearn and TextBlob tools.
    pd.set_option('max_colwidth',150)

    cnews_df = pd.DataFrame.from_dict(cleaned_cnews_items) # build cleaned news data frame
    cnews_df = cnews_df.sort_index()
    
    # Create document-term matrix using CountVectorizer that excludes common English stop words
    cv = CountVectorizer(stop_words='english')
    cnews_cv = cv.fit_transform(cnews_df.text)
    cnews_dtm = pd.DataFrame(cnews_cv.toarray(), columns=cv.get_feature_names())
    cnews_dtm.index = cnews_df['key'] #.index
    #print(cnews_dtm) # display word freq matrix
    
    # Create lambda functions to find the polarity and subjectivity for combined news descriptions from each source
    pol = lambda x: TextBlob(x).sentiment.polarity
    sub = lambda x: TextBlob(x).sentiment.subjectivity
    # now mine for polarity and subjectivity
    cnews_df['polarity'] = cnews_df['text'].apply(pol)
    cnews_df['subjectivity'] = cnews_df['text'].apply(sub)
    #print(cnews_df)   # show results
    
    # save sentiment results with article cleaned text
    cnews_df.to_pickle("sentdf-"+fname)  #save mined sentiment for later
    print("------ File: "+"sentdf-"+fname+" saved ------")
    
ftmp = fname.split('_')     # create save file name with original search and secondary search terms                  
sfname = "snews_"+ftmp[1]
                   
for sterm in searchtermlist:
    if sterm != '*':
        sfname = sfname+"_"+sterm

p = cnews_df.loc[:,'polarity']     # extract mined values from pandas data frame
s = cnews_df.loc[:,'subjectivity']

# copy mining valuess into saved xml structure and save for later analysis
i = 0
for item in save_items:
    item['polarity'] = str(p[i])
    item['subjectivity'] = str(s[i])
    i += 1
    #  print(item['polarity']+" "+item['subjectivity']) #debug
    
        
saveNewsItems(save_items, sfname+"_"+filedate+".p")  # save list of all stories found in filelist  


Be patient scraping article bodies takes time...
item: cnbc added len = 3042
item: cnbc added len = 2055
item: cnbc added len = 1738
item: cnn added len = 8057
item: cnn added len = 3107
item: cnn added len = 4912
item: cnn added len = 7491
item: washtimes added len = 1542
item: washtimes added len = 835
item: washtimes added len = 1444
item: bbc added len = 5962
item: bbc added len = 8315
item: bbc added len = 10062
item: bbc added len = 2924
item: bbc added len = 5482
item: bbc added len = 6432
item: bbc added len = 11316
item: nyt added len = 2558
item: nyt added len = 1990
item: nyt added len = 1035
item: nyt added len = 1165
item: nyt added len = 1122
item: nyt added len = 1038
item: nyt added len = 1576
item: npr added len = 1468
item: washpost added len = 4695
item: washpost added len = 3324
item: washpost added len = 5521
item: washpost added len = 5489
item: nbcnews added len = 3788
item: nbcnews added len = 1849
item: wsj added len = 3788
item: wsj added len = 1849
item: cbsn

In [59]:
cnews_df

Unnamed: 0,key,date,title,link,description,text,polarity,subjectivity
0,cnbc,2020-11-06,"After sharp bounce, market may take a 'breather' on lingering election uncertainty, virus outbreak",https://www.cnbc.com/2020/11/06/after-sharp-bounce-market-may-take-a-breather-on-lingering-election-uncertainty-virus-outbreak.html,"Stocks may trade more cautiously in the week ahead, as investors watch election developments.",spencer platt getty images news getty imagesafter an initially exuberant election reaction stocks may trade more cautiously in the week ahead as...,0.125965,0.406467
1,cnbc,2020-11-06,Asia investors should focus on the region's 'robust' data instead of the U.S. election: Aberdeen,https://www.cnbc.com/2020/11/05/asia-investors-should-focus-on-the-regions-robust-data-aberdeen.html,Aberdeen Standard Investments' Kenneth Akintewe says investors in Asia should focus on the strong data emerging from the region instead of the U.S...,president donald j trump dances at the end of the rally just before he leavessingapore asia investors should focus on the strong data coming out...,0.185391,0.406771
2,cnbc,2020-11-06,"Dow surges more than 500 points, heads for best week since April as post-election rally continues",https://www.cnbc.com/2020/11/04/election-stock-market-futures-open-to-close-news.html,Stocks jumped on Thursday on hopes the winner of the U.S. presidential election would soon be determined.,stocks jumped on thursday on hopes the winner of the us presidential and congressional elections would soon be determined with shares of major tec...,0.147593,0.461975
3,cnn,2020-11-06,Opinion: I'm a Covid-19 survivor. This election hurts,http://rss.cnn.com/~r/rss/cnn_topstories/~3/XR4oS10P2XA/index.html,"Tracking Covid-19 cases in the US<div class=""feedflare"">\n<a href=""http://rss.cnn.com/~ff/rss/cnn_topstories?a=XR4oS10P2XA:J4hFlg-whJY:yIl2AUoC8zA...",elizabeth yuko phd is a bioethicist and writer whose work has appeared in the new york times rolling stone the washington post the atlantic and el...,0.075377,0.486991
4,cnn,2020-11-06,Twitter suspends fake AP account that falsely called the election for Biden,http://rss.cnn.com/~r/rss/cnn_topstories/~3/AnH_GmNEEVo/h_80e159365762afc04fd301b710ba0659,"<div class=""feedflare"">\n<a href=""http://rss.cnn.com/~ff/rss/cnn_topstories?a=AnH_GmNEEVo:1LsoyQ01yYM:yIl2AUoC8zA""><img src=""http://feeds.feedburn...",three of the top hashtags used in twitter posts about the election on thursday promoted unsubstantiated allegations of voter fraud according to ...,0.126758,0.450374
5,cnn,2020-11-06,"In this tight election, this much is clear",http://rss.cnn.com/~r/rss/cnn_topstories/~3/SoBGo0d-Lxw/index.html,Anyone with a healthy skepticism of polling -- and there's been plenty to draw skepticism -- should have rejected the notion that the election wou...,douglas heye is the exdeputy chief of staff to former house majority leader eric cantor a gop strategist and a cnn political commentator follow hi...,0.118583,0.514653
6,cnn,2020-11-06,What's wrong with US elections,http://rss.cnn.com/~r/rss/cnn_topstories/~3/luuPZ5IJMAw/index.html,"Americans' assumptions about elections in the United States do not match the current or historical reality. With few exceptions, American election...",jeremi suri holds the mack brown distinguished chair for leadership in global affairs at the university of texas at austin where he is a professor...,0.030164,0.453341
7,washtimes,2020-11-06,"Heiko Maas, German foreign minister, on U.S. election drama: 'Keep a cool head'",https://www.washingtontimes.com/news/2020/nov/6/heiko-maas-german-foreign-minister-election-one-ma/?utm_source=RSS_Feed&utm_medium=RSS,"<p>German foreign minister Heiko Maas on Friday urged both U.S. presidential campaigns to ""keep a cool head"" as the race to the White House betwee...",german foreign minister heiko maas on friday urged both us presidential campaigns to keep a cool head as the race to the white house between presi...,0.160227,0.374242
8,washtimes,2020-11-06,Tennessee: Record 3 million votes cast in general election,https://www.washingtontimes.com/news/2020/nov/5/tennessee-record-3-million-votes-cast-in-general-e/?utm_source=RSS_Feed&utm_medium=RSS,"<p>NASHVILLE, Tenn. (AP) - State officials say Tennessee has easily broken the 2008 record for voter turnout with more than 3 million ballots cast...",nashville tenn ap state officials say tennessee has easily broken the record for voter turnout with more than million ballots cast in this year...,0.071212,0.365152
9,washtimes,2020-11-06,Kremlin: Unclear election results in the U.S. could negatively impact global economy,https://www.washingtontimes.com/news/2020/nov/5/kremlin-unclear-election-results-us-could-negative/?utm_source=RSS_Feed&utm_medium=RSS,"<p>Skepticism in the U.S. election could have a negative impact on the global economy, Kremlin spokesperson Dmitry Peskov said Thursday.</p> <p>Hi...",skepticism in the us election could have a negative impact on the global economy kremlin spokesperson dmitry peskov said thursdayhis comments come...,-0.04386,0.424561


In [5]:
cnews_dtm.transpose()

key,cnn,cnn.1,cnn.2,cnn.3,cnn.4,cnn.5,cnn.6,bbc,bbc.1,bbc.2,...,nbcnews,nbcnews.1,nbcnews.2,wsj,wsj.1,wsj.2,wsj.3,foxnews,cbsnews,cbsnews.1
aaron,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abandoned,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abc,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
ability,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
able,0,0,1,0,0,1,0,0,0,0,...,0,1,0,1,0,1,0,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zehbrauskas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zero,0,0,0,0,0,0,0,0,2,1,...,0,0,0,0,0,0,0,0,0,0
zhanon,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zone,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
save_items[2]['date']

'2020-11-05'

In [22]:
cnews_df.loc[1, 'polarity']

0.07452956061651714

In [27]:
cnews_df.loc[:,"polarity"].size

12

In [32]:
z = cnews_df.loc[:,'polarity']
for y in z:
    print(y)


0.07307407407407407
0.07452956061651714
0.06610239036325993
0.09343443593443593
0.034623341852257496
0.0853650137741047
0.06827836637047163
0.10740790655884995
0.08537131882202303
0.08537131882202303
0.09800664451827244
0.10895009346713892


In [34]:
save_items[:]['polarity']

TypeError: list indices must be integers or slices, not str

In [35]:
i = 0
for item in save_items:
    item['polarity'] = str(z[i])
    i += 1
    print(item['polarity'])

0.07307407407407407
0.07452956061651714
0.06610239036325993
0.09343443593443593
0.034623341852257496
0.0853650137741047
0.06827836637047163
0.10740790655884995
0.08537131882202303
0.08537131882202303
0.09800664451827244
0.10895009346713892


In [37]:
z = cnews_df.loc[:,'subjectivity']

i = 0
for item in save_items:
    item['subjectivity'] = str(z[i])
    i += 1
    print(item['polarity']+" "+item['subjectivity'])
    


0.07307407407407407 0.4212222222222222
0.07452956061651714 0.5183735909822865
0.06610239036325993 0.33066497270845097
0.09343443593443593 0.46415704665704655
0.034623341852257496 0.40255654653245
0.0853650137741047 0.28240358126721765
0.06827836637047163 0.47867433735854786
0.10740790655884995 0.43248876909254264
0.08537131882202303 0.397800438997622
0.08537131882202303 0.397800438997622
0.09800664451827244 0.353266888150609
0.10895009346713892 0.37922938803620615
