In [25]:
#  CSC6740 Term Project
#  Georgia State University - Fall 2020
#  William Keith Dobson

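# Scrapes news stories from a set of RSS feeds, keeps the stories whose titles
# match the user's search terms, downloads the full story text, and pickles
# the resulting list of news items for later text mining.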

# Library dependencies
import requests
from bs4 import BeautifulSoup
import pickle
import datetime 
import re


# Create dictionary of news rss feeds.
# Some of the rss feeds are commented out because their text tags contain
# embedded href= links; they stay disabled until a satisfactory filtering
# method is found.

# Maps a short source key to the URL of that source's RSS feed.  The key is
# stored on every scraped story so stories can be grouped by source later.
# Commented-out entries are feeds that are currently disabled.
rssurls = {
    'telegraph': 'https://www.dailytelegraph.com.au/news/world/rss',
    'cnbc': 'https://www.cnbc.com/id/100727362/device/rss/rss.html',
    'cnn': 'http://rss.cnn.com/rss/cnn_topstories.rss',
#    'guardian': 'http://www.theguardian.com/world/usa/rss',
#    'aljazeera': 'http://www.aljazeera.com/xml/rss/all.xml',
#    'csmonitor': 'https://rss.csmonitor.com/feeds/world',
#    'cbn': 'https://www1.cbn.com/app_feeds/rss/news/rss.php?section=world&mobile=false&q=cbnnews/world/feed',
    'washtimes': 'http://www.washingtontimes.com/rss/headlines/news/world',
    'bbc': 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml',
    'nyt': 'http://feeds.nytimes.com/nyt/rss/HomePage',
    'npr': 'http://www.npr.org/rss/rss.php?id=1001',
    'washpost': 'http://feeds.washingtonpost.com/rss/world',
#    'rtnews': 'https://www.rt.com/rss/news/',
    'nbcnews': 'http://feeds.nbcnews.com/nbcnews/public/news',
    # This entry used to repeat the nbcnews URL, which scraped the NBC feed a
    # second time and labelled those stories 'wsj'.
    # NOTE(review): confirm this is the WSJ feed you want (world news).
    'wsj': 'https://feeds.a.dj.com/rss/RSSWorldNews.xml',
    'foxnews': 'http://feeds.foxnews.com/foxnews/latest?format=xml',
    'cbsnews': 'https://www.cbsnews.com/latest/rss/world',
}


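# Function to scrape the title, description, and link of each story from a dictionary of rss feeds,
# using a search term list to filter the results, then return a list of news items.  Search terms are
# matched against the story title using OR logic, so a story is selected if any one of the terms is
# present ("*" selects every story).  Only stories with at least 700 characters of scraped text are kept.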

def _tagText(tag):
    """Return the text of a parsed feed tag, or '' when the tag is missing."""
    return tag.text if tag is not None else ""


def scrapeFeeds(rssurls, searchtermlist, mintextlen=700, timeout=30):
    """Scrape RSS feeds and return the stories whose title matches a search term.

    Parameters:
        rssurls        -- dict mapping a source key (e.g. 'cnn') to its RSS feed URL
        searchtermlist -- list of search terms; each is matched case-insensitively
                          as a substring of the story title.  OR logic: one match
                          is enough.  The term "*" selects every story.
        mintextlen     -- minimum length of the scraped story text for a story to
                          be kept (default 700)
        timeout        -- seconds to wait for each feed request (default 30)

    Returns:
        list of dicts with the keys 'title', 'description', 'link', 'key',
        'polarity', 'subjectivity' and 'text'.

    A feed that cannot be fetched is reported and skipped, so the remaining
    feeds are still scraped.
    """
    news_items = []   # list for storing all news items

    # Lowercase the terms once instead of once per story.
    terms = [sterm.lower() for sterm in searchtermlist]
    match_all = "*" in terms

    for key, url in rssurls.items():
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()
        except requests.RequestException as err:
            print("======== "+str(key)+" feed could not be read: "+str(err))
            continue

        soup = BeautifulSoup(resp.content, 'xml')
        items = soup.find_all('item')

        for item in items:
            title = _tagText(item.title)
            link = _tagText(item.link)
            if not link:
                continue   # nothing to download the story text from

            # Substring test with "in": a term at the very start of the title
            # (index 0) must count as a match too.
            lowered = title.lower()
            if not (match_all or any(term in lowered for term in terms)):
                continue

            # Extract title, description, and link for the news item
            news_item = {}
            news_item['title'] = title.strip()
            # Drop lines of the description that start with a bare URL.
            news_item['description'] = re.sub(r'^https?:\/\/.*[\r\n]*', '', _tagText(item.description), flags=re.MULTILINE).strip()
            news_item['link'] = link
            news_item['key'] = str(key)  # add key for sorting later
            news_item['polarity'] = str(0)    # add place holders for data mining
            news_item['subjectivity'] = str(0)
            news_item['text'] = scrapeText(link)

            if len(news_item['text']) >= mintextlen:   # only save stories with accessible text
                news_items.append(news_item)
                print("item: "+news_item['key']+" added len = "+str(len(news_item['text'])))

        print("======== "+str(key)+" news items read = "+str(len(items))+" ") #debug report

    return news_items


from newspaper import Article 

# Function to scrape the story text from a news webpage using newspaper library 
def scrapeText(link):
    """Download a news webpage and return its story text.

    Uses the newspaper library's Article class (English language setting).

    Parameters:
        link -- URL of the story page

    Returns:
        The extracted story text, or an empty string when the article could
        not be downloaded or parsed.  The failure is printed so it stays
        visible, and one unreachable story does not raise out of this function.
    """
    # Local import: the file-level newspaper import only brings in Article.
    from newspaper import ArticleException

    article = Article(link, language="en")
    try:
        article.download()
        article.parse()
    except ArticleException as err:
        print("scrapeText: could not read "+str(link)+" ("+str(err)+")")
        return ""

    return article.text



# Function to pickle news_items list    
def saveNewsItems(news_items, fname):
    """Pickle the news items list to a file and print a short confirmation.

    Parameters:
        news_items -- list of news item dicts to save
        fname      -- path of the pickle file to write (overwritten if present)

    Returns:
        None
    """
    # The with-block closes the file even if pickling fails, so the data is
    # flushed to disk and the handle is not leaked.
    with open(fname, "wb") as outfile:
        pickle.dump(news_items, outfile)
    print("==== File:   " + fname + " ... Pickled with "+str(len(news_items))+" stories written.")
    return

# Function to display the news items list.
#  The flag parameter limits how much of each item is printed:
#  0 = just news sources
#  1 = news source + title
#  2 = news source + title + description
#  3 = news source + title + description + text (text is printed only when longer than 100 characters)
def displayNews(news_items, flag):
    """Print each news item at the level of detail selected by flag.

    Every item prints a source banner; flag >= 1 adds the title, flag >= 2
    adds the description, and flag >= 3 adds the story text when that text
    is longer than 100 characters.
    """
    for story in news_items:
        source = story['key']
        print("----"+source+"----")

        if flag < 1:
            continue
        print("[TITLE] "+source+" - "+story["title"])

        if flag < 2:
            continue
        print("[DESC] "+story["description"])

        if flag < 3:
            continue
        body = story["text"]
        if len(body) > 100:
            print("[TEXT] "+body)
    return

import copy

# Function to combine the titles and descriptions of consecutive stories from
# the same source into single text strings.
def combineNews(news_items):
    """Merge consecutive stories from the same source into one combined item.

    Stories are grouped by their 'key' value.  Only adjacent stories with the
    same key are merged, so the input is expected to be ordered by source.

    Parameters:
        news_items -- list of news item dicts, each with at least the keys
                      'key', 'title', 'description', 'polarity', 'subjectivity'

    Returns:
        A new list with one dict per run of same-source stories.  'title' and
        'description' are the stripped values of the run joined by single
        spaces; 'polarity' and 'subjectivity' come from the last story of the
        run; every other field comes from the first story of the run.
        An empty input gives an empty list.  The input items are not modified.
    """
    cnews_items = []
    combined = None   # combined item for the run currently being merged

    for item in news_items:
        title = item['title'].strip()
        description = item['description'].strip()

        if combined is not None and item['key'] == combined['key']:
            combined['title'] = combined['title'] + ' ' + title
            combined['description'] = combined['description'] + ' ' + description
        else:
            # A new source starts: begin from a copy of its first story so the
            # first story is counted exactly once and the caller's dict is
            # left untouched.
            combined = copy.deepcopy(item)
            combined['title'] = title
            combined['description'] = description
            cnews_items.append(combined)

        combined['polarity'] = item['polarity']
        combined['subjectivity'] = item['subjectivity']

    return cnews_items

In [26]:
# Get user search terms list.
# The line typed by the user is split on whitespace, so every word becomes
# one search term.  An empty line gives an empty list.  Per the prompt text,
# a "*" term means "all stories".
inputstring = input("Enter search terms separated by spaces (* for all): ")
searchtermlist = inputstring.split()


Enter search terms separated by spaces (* for all):  trump


In [27]:
# Main code body

# Scrape every configured feed, keeping the stories selected by the search terms
news_items = []   # list for storing all news items
news_items = scrapeFeeds(rssurls, searchtermlist)

# Build the pickle filename from the search terms and the current date:
#   x + news + one "_<term>" per search term ("*" becomes "all") + _YYYY-MM-DD + .p
now = datetime.datetime.now()
date_string = now.strftime('%Y-%m-%d')
fname = "news"
for sterm in searchtermlist:
    fname += "_" + ("all" if sterm == '*' else sterm)

fname = "x{}_{}.p".format(fname, date_string)
saveNewsItems(news_items, fname)

  

item: telegraph added len = 332
item: cnbc added len = 243
item: cnbc added len = 2110
item: cnn added len = 224
item: cnn added len = 14321
item: cnn added len = 286
item: cnn added len = 299
item: cnn added len = 1969
item: cnn added len = 642
item: cnn added len = 6480
item: cnn added len = 8364
item: cnn added len = 5780
item: cnn added len = 6752
item: washtimes added len = 17631
item: washtimes added len = 1904
item: bbc added len = 363
item: bbc added len = 2924
item: bbc added len = 249
item: bbc added len = 504
item: bbc added len = 5484
item: bbc added len = 297
item: bbc added len = 504
item: bbc added len = 625
item: bbc added len = 536
item: bbc added len = 6432
item: bbc added len = 200
item: bbc added len = 278
item: bbc added len = 241
item: bbc added len = 235
item: bbc added len = 417
item: bbc added len = 297
item: nyt added len = 1755
item: nyt added len = 2564
item: nyt added len = 1706
item: nyt added len = 54253
item: nyt added len = 1832
item: nyt added len = 17

In [28]:
displayNews(news_items, 3)

----telegraph----
[TITLE] telegraph - Biden Focuses on Pennsylvania, Trump Hits Five Battleground States
[DESC] On the last Sunday before Election Day, Joe Biden homed in on Pennsylvania, while President Trump hit five battleground states including Florida and Georgia. A caravan of Trump supporters slowed traffic in New York City. Photos: Drew Angerer/Getty Images; Joe Raedle/Getty Images
[TEXT] media_play

2016 vs. 2020: The Evolution of Russian Meddling in U.S. Elections

U.S. authorities and tech companies have reported several instances of Russian cyberattacks and interference attempts ahead of the 2020 election. Here’s their account of how Russian hackers and trolls have expanded their 2016 tool kit with new tactics.
----cnbc----
[TITLE] cnbc - The 2020 election race remains tight between Trump and Biden
[DESC] NBC's Steve Kornacki joins Shep Smith to talk about what would happen if polls are as inaccurate as the last presidential election in 2016. He also talks about data on earl