In [1]:
#  CSC6740 Term Project
#  Georgia State University - Fall 2020
#  William Keith Dobson

#  News story Sentiment analyzer 
#  Loads scraped news feed pickle files, scrapes and cleans the article text of the stories whose titles match the search terms,
#  then computes the sentiment polarity and subjectivity of each story, which is stored using the same filename with a "sentdf-t" prefix added

# Library dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import pickle
import datetime 
import re
import string

# Hardcoded list of pickled news-feed files to mine, newest first.
# Line continuations are implicit inside the brackets, so no backslashes are needed.
filelist = [
    "news_all_2020-11-02.p",
    "news_all_2020-11-01.p",
    "news_all_2020-10-31.p",
    "news_all_2020-10-30.p",
    "news_all_2020-10-29.p",
    "news_all_2020-10-28.p",
    "news_all_2020-10-27.p",
    "news_all_2020-10-26.p",
    "news_all_2020-10-25.p",
    "news_all_2020-10-24.p",
    "news_all_2020-10-22.p",
    "news_all_2020-10-21.p",
]

def displayNews(news_items, flag):
    """Print the news items library, with `flag` limiting how much is shown.

    flag values:
      0 = just news sources
      1 = news source + title
      2 = news source + title + description
      3 = news source + title + description + text
          (the text is printed only when it is longer than 100 characters)

    Each item is a dict; 'key' is always read, while 'title', 'description'
    and 'text' are read only when the flag level asks for them.
    """
    for story in news_items:
        source = story['key']
        print("----" + source + "----")
        if flag < 1:
            continue            # sources only
        print("[TITLE] " + source + " - " + story["title"])
        if flag < 2:
            continue            # no description requested
        print("[DESC] " + story["description"])
        if flag < 3:
            continue            # no article body requested
        body = story["text"]
        if len(body) > 100:     # skip very short / placeholder bodies
            print("[TEXT] " + body)
    return

import copy

def combineNews(news_items):
    """Combine consecutive stories from the same source into one item per source.

    The titles, descriptions and texts of adjacent items that share the same
    'key' are stripped and joined with single spaces. Only adjacent items are
    merged, so the input should already be grouped by 'key'.

    Parameters:
        news_items: list of dicts with at least the keys 'key', 'title',
            'description', 'text', 'polarity' and 'subjectivity'.
            The input list and its dicts are not modified.

    Returns:
        A new list with one combined dict per run of equal keys, in input
        order. 'polarity' and 'subjectivity' are taken from the last story of
        the run; any other fields (e.g. 'link') come from the first story of
        the run. An empty input returns an empty list.
    """
    text_fields = ('title', 'description', 'text')
    cnews_items = []
    current = None   # combined item for the source currently being accumulated

    for item in news_items:
        if current is not None and item['key'] == current['key']:
            # same source as the previous story: append to the running strings
            for field in text_fields:
                current[field] = current[field] + ' ' + item[field].strip()
        else:
            # new source: flush the finished item and start a fresh one.
            # Starting from a copy of *this* item (instead of a pre-seeded copy
            # of the first story) avoids counting the first story twice.
            if current is not None:
                cnews_items.append(current)
            current = copy.deepcopy(item)
            for field in text_fields:
                current[field] = item[field].strip()

        current['polarity'] = item['polarity']
        current['subjectivity'] = item['subjectivity']

    if current is not None:
        cnews_items.append(current)  # get last combined item
    return cnews_items

def cleanPunctNum(text):
    """Convert a text string to lower case, then remove punctuation and numbers.

    Steps, in order:
      1. lower-case the text
      2. remove text in square brackets (brackets included)
      3. remove ASCII punctuation (so "don't" becomes "dont")
      4. remove words containing digits
      5. remove curly quotes and ellipsis characters
      6. replace newlines with a space, so the last word of one line is not
         fused to the first word of the next
      7. replace any remaining run of characters other than letters, digits
         and spaces with a single space

    Whitespace is not collapsed, so removed words can leave double spaces.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings keep regex escapes such as \[ and \w from being parsed as
    # (invalid) Python string escapes.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub('[^A-Za-z0-9 ]+', ' ', text)   # remove any other special characters
    return text


from newspaper import Article    # this is a better tool for capturing article text bodies than beautiful soup

def scrapeText(link):
    """Scrape the story text from a news webpage using the newspaper library.

    Builds an English-language Article for `link`, downloads and parses it,
    and returns the article's `text` attribute. Errors raised by the
    newspaper library are not caught here.
    """
    article = Article(link, language="en")
    article.download()
    article.parse()
    return article.text

def saveNewsItems(news_items, fname):
    """Pickle the news_items list to a file and print a confirmation line.

    Parameters:
        news_items: list of news items to store (anything picklable).
        fname: path of the file to write; an existing file is overwritten.

    Returns:
        None.
    """
    # save the news items; the with-block guarantees the file is flushed and
    # closed even if pickling fails part-way
    with open(fname, "wb") as outfile:
        pickle.dump(news_items, outfile)
    print("==== File:   " + fname + " ... Pickled with "+str(len(news_items))+" stories written.")
    return


In [13]:

# Get the list of pickled RSS files to mine
inputstring = input("Enter RSS filenames to mine (* = use hardcoded list): ")
fnames = inputstring.split()

# '*' keeps the hardcoded list. An empty entry is treated the same way
# instead of raising IndexError on fnames[0].
# NOTE: this overwrites filelist, so re-run the cell that defines the
# hardcoded list to get it back after entering explicit filenames.
if fnames and fnames[0] != '*':
    filelist = fnames

# Get user search terms list
inputstring = input("Enter search terms separated by spaces (* for all): ")
searchtermlist = inputstring.split()


Enter RSS filenames to mine (* = use hardcoded list):  news_all_2020-11-03.p
Enter search terms separated by spaces (* for all):  russia


In [14]:
print("Be patient scraping article bodies takes time...")

# Display option only; it does not depend on the file being mined, so set it once.
pd.set_option('max_colwidth',150)

# Functions to find the polarity and subjectivity of one text string with TextBlob
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

# Load pickled files and mine data
for fname in filelist:
    # NOTE(review): pickle.load can execute arbitrary code while loading; the
    # filenames may come from user input, so only mine files from a trusted source.
    with open(fname, 'rb') as filehandler:
        news_items = pickle.load(filehandler)
    print("====== File: "+fname+" opened for mining ====== ")
    # Now combine story headlines and descriptions from all stories from each source into
    # a single text string for analysis
    #cnews_items = combineNews(news_items)
    #displayNews(cnews_items, 2)

    # Clean all news items.  Note that only the attributes needed are put into the new data objects.
    cleaned_cnews_items = []
    save_items = []
    for item in news_items:
        title_lower = item['title'].lower()
        for sterm in searchtermlist:
            # Substring test: a title that *starts* with the term must match too
            # (str.find() returns 0 for that case, so "find(...) > 0" missed it).
            if sterm == "*" or sterm.lower() in title_lower:
                if item['key'] != 'telegraph':
                    item['text'] = scrapeText(item['link'])       # scrape article text body referenced by link
                else:
                    item['text'] = " -none- "                     # telegraph bodies are not scraped

                cleaned_item = {}
                cleaned_item['key'] = item['key']
                cleaned_item['title'] = item['title']
                cleaned_item['link'] = item['link']
                cleaned_item['description'] = item['description']
                cleaned_item['text'] = cleanPunctNum(item['text']).strip()
                cleaned_item['polarity'] = item['polarity']
                cleaned_item['subjectivity'] = item['subjectivity']

                if len(item['text']) >= 700 :                 # only save stories with "enough" accessible text
                    save_items.append(item)                   # save copy of the raw uncleaned items that match search criterion
                    cleaned_cnews_items.append(cleaned_item)  # save cleaned text for sentiment mining
                    print("item: "+item['key']+" added len = "+str(len(item['text'])))
                break                                         # one matching term is enough; go to the next story

    fname = "t"+fname   # 't' prefix for story text added
    # Replace the first 'all' in the filename with the search term(s). All terms
    # are inserted in a single replacement (joined with '_'): replacing once per
    # term raised IndexError from the second term on, because no 'all' was left.
    name_terms = [sterm for sterm in searchtermlist if sterm != '*']
    if name_terms:
        fname = fname.replace('all', '_'.join(name_terms), 1)

    saveNewsItems(save_items, fname)

    # Nothing matched in this file: an empty frame has no 'text' column and
    # CountVectorizer cannot fit on zero documents, so skip the sentiment step.
    if not cleaned_cnews_items:
        print("------ File: "+fname+" has no matching stories, sentiment mining skipped ------")
        continue

    # Put cleaned data into a pandas dataframe which works better with sklearn and TextBlob tools.
    cnews_df = pd.DataFrame.from_dict(cleaned_cnews_items) # build cleaned news data frame
    cnews_df = cnews_df.sort_index()

    # Create document-term matrix using CountVectorizer that excludes common English stop words
    cv = CountVectorizer(stop_words='english')
    cnews_cv = cv.fit_transform(cnews_df.text)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name on older installations.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    cnews_dtm = pd.DataFrame(cnews_cv.toarray(), columns=feature_names)
    cnews_dtm.index = cnews_df['key'] #.index
    #print(cnews_dtm) # display word freq matrix

    # now mine for polarity and subjectivity of each story's cleaned text
    cnews_df['polarity'] = cnews_df['text'].apply(pol)
    cnews_df['subjectivity'] = cnews_df['text'].apply(sub)
    #print(cnews_df)   # show results

    # save sentiment results
    cnews_df.to_pickle("sentdf-"+fname)  #save mined sentiment for later
    print("------ File: "+"sentdf-"+fname+" saved ------")

Be patient scraping article bodies takes time...
item: washpost added len = 2331
item: cbsnews added len = 3760
==== File:   tnews_russia_2020-11-03.p ... Pickled with 2 stories written.
------ File: sentdf-tnews_russia_2020-11-03.p saved ------


In [14]:
# Display the sentiment dataframe; it holds the stories of the last file processed by the mining loop
cnews_df

Unnamed: 0,key,title,link,description,text,polarity,subjectivity
0,cnbc,"A Biden win could be bullish for the Chinese yuan, analysts say",https://www.cnbc.com/2020/10/21/joe-biden-win-impact-on-trade-war-tariffs-chinese-yuan.html,"A win for Democratic presidential nominee Joe Biden in the U.S. election next month could prove to be bullish for the Chinese yuan, analysts said.",democratic presidential nominee joe biden arrives to speak at a drivein campaign rally at riverside high school on october in durham north carol...,0.134059,0.306882
1,washtimes,'So much work to do': How Biden is planning for transition,https://www.washingtontimes.com/news/2020/oct/21/so-much-work-to-do-how-biden-is-planning-for-trans/?utm_source=RSS_Feed&utm_medium=RSS,"<p>WASHINGTON (AP) - If Joe Biden defeats President Donald Trump next month, he'll quickly face a new challenge: standing up a new administration ...",washington ap if joe biden defeats president donald trump next month hell quickly face a new challenge standing up a new administration to lead a...,0.099283,0.379117
2,bbc,US election 2020: Fact-checking Trump and Biden on the campaign trail,https://www.bbc.co.uk/news/election-us-2020-54512170,Both candidates are travelling to key battleground states in the closing stages of the campaign.,us election factchecking trump and biden on the campaign trail by reality check teambbc news published duration october related topics us electi...,0.034623,0.402557
3,nyt,Trump Campaign’s $63 Million Dwarfed by Biden’s $177 Million,https://www.nytimes.com/2020/10/20/us/politics/trump-money-biden.html,"New financial filings showed the extent of the president’s cash troubles, as he is now badly outmatched by Joe Biden.",recent campaign filings show how much more the trump campaigns joint operation with the republican party is spending to raise its money than mr bi...,0.122937,0.434108
4,nyt,Could Biden Win the Election? Some Democrats Can’t Help Whispering,https://www.nytimes.com/2020/10/21/us/politics/biden-election-landslide.html,Democrats are still haunted by the ghosts of 2016. But some are allowing themselves to contemplate a Biden victory big enough to reorder the natio...,macon ga president trump held a rally in georgia on friday days before the november general election it wasnt a good sign for himthat mr trump ...,0.228,0.460667
5,nyt,"Obama, Biden’s Not-So-Secret Weapon, Will Return to the Campaign Trail",https://www.nytimes.com/live/2020/10/21/us/trump-biden-election,"Former President Barack Obama will make his case for Joe Biden today in Pennsylvania, where President Trump held a rally Tuesday night. Here’s the...",supreme court rules against curbside voting in alabama the fbi says iran and russia have taken actions to interfere in the election president bara...,0.025734,0.392913
6,nyt,Social Media and the Hunter Biden Report,https://www.nytimes.com/2020/10/21/podcasts/the-daily/hunter-biden-new-york-post-twitter-facebook.html,"In trying to insulate their platforms from the spread of dubious information, Facebook, Twitter and YouTube have ignited a different kind of fires...",michael barbarofrom the new york times im michael barbaro this is the daily today the nations biggest social media companies are determined to avo...,0.098202,0.44078
7,nyt,"She’s Evangelical, ‘Pro-Life’ and Voting for Biden",https://www.nytimes.com/2020/10/21/opinion/evangelicals-election-biden.html,"Billy Graham’s granddaughter says, “This president doesn’t represent our faith.”",in one sense duford is an outlier about eight of white evangelicals voted for donald trump in and polling suggests that the great majority will ...,0.11443,0.44228
8,npr,"In Race's Final Days, Biden Campaign Has Big Cash Advantage Over Trump's",https://www.npr.org/2020/10/21/926108447/in-races-final-days-biden-campaign-has-big-cash-advantage-over-trump-s?utm_medium=RSS&utm_campaign=news,"The president's campaign committee finished September with $63.1 million cash on hand, compared with the Biden team's $177.3 million, according to...",in races final days biden campaign has big cash advantage over trumpsenlarge this image toggle caption saul loebafp via getty images saul loebafp ...,0.085212,0.487052
9,npr,Money Tracker: How Much Trump And Biden Have Raised In The 2020 Election,https://www.npr.org/2020/05/20/858347477/money-tracker-how-much-trump-and-biden-have-raised-in-the-2020-election?utm_medium=RSS&utm_campaign=news,"See the latest campaign finance figures for President Trump and his challenger, former Vice President Joe Biden.",money tracker how much trump and biden have raised in the electionloadingupdated on october at pm etwhich presidential candidate has the fundra...,0.197024,0.524405


In [None]:
# Display the document-term matrix transposed: one row per term, one column per story (labelled by news source key)
cnews_dtm.transpose()