In [32]:
#  CSC6740 Term Project
#  Georgia State University - Fall 2020
#  William Keith Dobson

# Add description

# Library dependencies
import requests
from bs4 import BeautifulSoup
import pickle
import datetime 
import re

# Create dictionary of news rss feeds
# Some of the rss feeds were removed due to embedded href= links in text tags
# until a satisfactory filtering method is found.

rssurls = {'telegraph':'https://www.dailytelegraph.com.au/news/world/rss',\
           'cnbc':'https://www.cnbc.com/id/100727362/device/rss/rss.html',\
           'cnn':'http://rss.cnn.com/rss/cnn_topstories.rss',\
#           'guardian':'http://www.theguardian.com/world/usa/rss',\
#           'aljazeera':'http://www.aljazeera.com/xml/rss/all.xml',\
#           'csmonitor':'https://rss.csmonitor.com/feeds/world',\
#           'cbn':'https://www1.cbn.com/app_feeds/rss/news/rss.php?section=world&mobile=false&q=cbnnews/world/feed',\
           'washtimes':'http://www.washingtontimes.com/rss/headlines/news/world',\
           'bbc':'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml',\
           'nyt':'http://feeds.nytimes.com/nyt/rss/HomePage',\
           'npr':'http://www.npr.org/rss/rss.php?id=1001',\
           'washpost':'http://feeds.washingtonpost.com/rss/world',\
#           'rtnews':'https://www.rt.com/rss/news/',\
           'nbcnews':'http://feeds.nbcnews.com/nbcnews/public/news',\
           'wsj':'http://feeds.nbcnews.com/nbcnews/public/news',\
           'foxnews':'http://feeds.foxnews.com/foxnews/latest?format=xml',\
           'cbsnews':'https://www.cbsnews.com/latest/rss/world'}




In [97]:
# Function to scrape titles and descriptions from a list of rss feeds using a search term list to filter
# results then returns a list of news items.  Note that search algorithm uses OR logic so if any of the 
# search terms are present the story is added to the list

def scrapeFeeds(rssurls, searchtermlist):
    news_items = []   # list for storing all news items    
    for key, url in rssurls.items():
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, 'xml')
        items = soup.findAll('item')
    
        #print(soup.prettify())   #debug code
    
        # Extract title, description, and link for each news item
        for item in items:
            news_item = {}
            news_item['title'] = item.title.text.strip()
            news_item['description'] = re.sub(r'^https?:\/\/.*[\r\n]*', '', item.description.text, flags=re.MULTILINE).strip() #item.description.text
            news_item['link'] = item.link.text
            news_item['key'] = str(key)  # add key for sorting later
            news_item['polarity'] = str(0)    # add place holders for data mining
            news_item['subjectivity'] = str(0)
            
            for sterm in searchtermlist:
                tmpstr = item.title.text.lower()
                
                if sterm == "*" or tmpstr.find(sterm.lower()) > 0:
                    news_items.append(news_item)
                    break
            
        print("======== "+str(key)+" news items read = "+str(len(items))+" ") #debug report
    
    return news_items

# Function to pickle news_items list    
def saveNewsItems(news_items, fname):    
    # save the news items
    pickle.dump(news_items, open(fname, "wb"))
    print("==== File:   " + fname + " ... Pickled with "+str(len(news_items))+" stories written.")
    return

# Function to display news items library
#  uses flag parameter to limit output 
#  0 = just news sources
#  1 = news source + title
#  2 = news source + title + description
def displayNews(news_items, flag):
    for item in news_items:
        print("----"+item['key']+"----")
        if(flag >= 1):
            print(item["key"]+" - "+item["title"])
        if(flag >= 2):
            print(item["description"]) 
    return

import copy

# Function to combine the description text from multiple stories from the same source
# into single text strings.
def combineNews(news_items):
    cnt = 0
    prev_item = copy.deepcopy(news_items[0])
    new_item = copy.deepcopy(news_items[0])
    cnews_items = []
    for item in news_items:
        new_item['key'] = item['key']
        new_item['polarity'] = item['polarity']            
        new_item['subjectivity'] = item['subjectivity'] 
 
        if item['key'] == prev_item['key']:
            new_item['title'] = new_item['title'] + ' '+item['title'].strip()
            new_item['description'] = new_item['description'] + ' '+item['description'].strip() 
        else:
            new_item['title'] = item['title'].strip()
            new_item['description'] = item['description'].strip() 
            if cnt > 0 :
                cnews_items.append(prev_item)
            
        prev_item = copy.deepcopy(new_item)
        cnt += 1
        
    cnews_items.append(prev_item)  # get last combined item
    return cnews_items

In [98]:
# Get user search terms list
inputstring = input("Enter search terms separated by spaces (* for all): ")
searchtermlist = inputstring.split()


Enter search terms separated by spaces (* for all):  trump


In [99]:
# Main code body

news_items = []   # list for storing all news items

news_items = scrapeFeeds(rssurls, searchtermlist)
    
# use date and search terms to generate unique filename for pickle file
now = datetime.datetime.now()
date_string = now.strftime('%Y-%m-%d')
fname = "news"
for sterm in searchtermlist:
    if sterm != '*':
        fname = fname+"_"+sterm
    else:
        fname = fname+"_all"
        
fname = fname+"_"+date_string+".p"
saveNewsItems(news_items, fname)    

   
    

==== File:   news_trump_2020-10-29.p ... Pickled with 46 stories written.


In [100]:
# Now combine story headlines and descriptions from all stories from each source into 
# a single text string for analysis
cnews_items = combineNews(news_items)
# debug
#cnews_items = news_items
#displayNews(cnews_items, 2)

In [101]:
import re
import string

# Function to convert text string to lower case then remove punctuation and numbers
def cleanPunctNum(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('[^A-Za-z0-9 ]+', ' ', text)   # remove any other special characters

    return text

# Clean all combined news items.  Note that only the attributes needed are put into the new data objects.
cleaned_items = []
for item in cnews_items:
    cleaned_item = {}
    cleaned_item['key'] = item['key']
    cleaned_item['text'] = cleanPunctNum(item['description']).strip()
#    cleaned_item['text'] = cleanPunctNum(item['title']+' '+item['description'])
    cleaned_item['polarity'] = item['polarity']
    cleaned_item['subjectivity'] = item['subjectivity']
    cleaned_items.append(cleaned_item)
                                

In [102]:
# Put cleaned and combined data into a pandas dataframe which works better with sklearn and TextBlob tools.
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(cleaned_items) #.transpose()
#data_df.columns = ['text']
data_df = data_df.sort_index()
data_df

Unnamed: 0,key,text,polarity,subjectivity
0,cnbc,transatlantic ties have fractured since president donald trump arrived at the white house but the eu hopes things may improve with the upcoming el...,0,0
1,cnn,shortly after joining the white house as president donald trumps pandemic adviser dr scott atlas launched a quiet effort that seemed counterintuit...,0,0
2,washtimes,ptraverse city mich ap a man charged in an alleged conspiracy to kidnap michigans governor also made threatening online comments about president ...,0,0
3,bbc,the president says mr biden will cancel family gatherings while the democrat pledges to follow science will trump s promise to build a wall along ...,0,0
4,nyt,trailing in the polls president trump and his campaign are pursuing a threepronged strategy that would effectively suppress the mailin vote in the...,0,0
5,npr,president trump promised to end americas opioid crisis on his watch overdose deaths flattened in then surged again to record levels,0,0
6,washpost,by custom most world leaders do not weigh in on us presidential elections wary of alienating one side or damaging strategic interests by appearing...,0,0
7,nbcnews,unbeknownst to hhs cbp had been conducting a family separation pilot that led to hundreds of separations said the report trumps female surrogate...,0,0
8,wsj,unbeknownst to hhs cbp had been conducting a family separation pilot that led to hundreds of separations said the report trumps female surrogate...,0,0
9,foxnews,president trump is the last bastion to stopping the radical left talk show host dave rubin said thursday,0,0


In [103]:
# Create document-term matrix using CountVectorizer that excludes common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_df.text)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_df['key'] #.index
data_dtm

Unnamed: 0_level_0,accepted,accusations,accustomed,achievement,activity,administration,adviser,agent,aides,air,...,won,work,workforce,workplaces,world,worsendiv,wrongdiv,year,years,young
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cnbc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cnn,1,0,1,1,0,1,1,0,0,1,...,0,1,1,1,0,1,1,1,0,0
washtimes,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
bbc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
nyt,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
npr,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
washpost,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
nbcnews,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wsj,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
foxnews,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
# Create lambda functions to find the polarity and subjectivity for combined news descriptions from each source
# 
from textblob import TextBlob

pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity

data_df['polarity'] = data_df['text'].apply(pol)
data_df['subjectivity'] = data_df['text'].apply(sub)
data_df

Unnamed: 0,key,text,polarity,subjectivity
0,cnbc,transatlantic ties have fractured since president donald trump arrived at the white house but the eu hopes things may improve with the upcoming el...,0.0,0.0
1,cnn,shortly after joining the white house as president donald trumps pandemic adviser dr scott atlas launched a quiet effort that seemed counterintuit...,0.016667,0.32619
2,washtimes,ptraverse city mich ap a man charged in an alleged conspiracy to kidnap michigans governor also made threatening online comments about president ...,0.110714,0.296429
3,bbc,the president says mr biden will cancel family gatherings while the democrat pledges to follow science will trump s promise to build a wall along ...,0.202778,0.421667
4,nyt,trailing in the polls president trump and his campaign are pursuing a threepronged strategy that would effectively suppress the mailin vote in the...,0.233636,0.435455
5,npr,president trump promised to end americas opioid crisis on his watch overdose deaths flattened in then surged again to record levels,0.0,0.0
6,washpost,by custom most world leaders do not weigh in on us presidential elections wary of alienating one side or damaging strategic interests by appearing...,-0.1,0.5
7,nbcnews,unbeknownst to hhs cbp had been conducting a family separation pilot that led to hundreds of separations said the report trumps female surrogate...,-0.1,0.183333
8,wsj,unbeknownst to hhs cbp had been conducting a family separation pilot that led to hundreds of separations said the report trumps female surrogate...,-0.1,0.183333
9,foxnews,president trump is the last bastion to stopping the radical left talk show host dave rubin said thursday,0.0,0.033333


In [105]:
# save sentiment results
data_df.to_pickle("Sent-"+fname)  #save mined sentiment for later