In [6]:
#  CSC6740 Term Project
#  Georgia State University - Fall 2020
#  William Keith Dobson
#  Version 1.0
#  Program to scrape and store top news story headlines, descriptions, and links from a list of RSS news feeds

# Library dependencies
import requests
from bs4 import BeautifulSoup
import pickle
import datetime 
import re

# Create dictionary of news rss feeds
# Some of the rss feeds were removed due to embedded href= links in text tags
# until a satisfactory filtering method is found.

rssurls = {\
#          'telegraph':'https://www.dailytelegraph.com.au/news/world/rss',\
           'cnbc':'https://www.cnbc.com/id/100727362/device/rss/rss.html',\
           'cnn':'http://rss.cnn.com/rss/cnn_topstories.rss',\
#           'guardian':'http://www.theguardian.com/world/usa/rss',\
#           'aljazeera':'http://www.aljazeera.com/xml/rss/all.xml',\
#           'csmonitor':'https://rss.csmonitor.com/feeds/world',\
#           'cbn':'https://www1.cbn.com/app_feeds/rss/news/rss.php?section=world&mobile=false&q=cbnnews/world/feed',\
           'washtimes':'http://www.washingtontimes.com/rss/headlines/news/world',\
           'bbc':'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml',\
           'nyt':'http://feeds.nytimes.com/nyt/rss/HomePage',\
           'npr':'http://www.npr.org/rss/rss.php?id=1001',\
           'washpost':'http://feeds.washingtonpost.com/rss/world',\
#           'rtnews':'https://www.rt.com/rss/news/',\
           'nbcnews':'http://feeds.nbcnews.com/nbcnews/public/news',\
           'wsj':'http://feeds.nbcnews.com/nbcnews/public/news',\
           'foxnews':'http://feeds.foxnews.com/foxnews/latest?format=xml',\
           'cbsnews':'https://www.cbsnews.com/latest/rss/world'
          }


# Function to scrape titles and descriptions from a list of rss feeds using a search term list to filter
# results then returns a list of news items.  Note that search algorithm uses OR logic so if any of the 
# search terms are present the story is added to the list

def scrapeFeeds(rssurls, searchtermlist):
    now = datetime.datetime.now()
    date_string = now.strftime('%Y-%m-%d')

    news_items = []   # list for storing all news items    
    for key, url in rssurls.items():
        resp = requests.get(url)
        soup = BeautifulSoup(resp.content, 'xml')
        items = soup.findAll('item')

        #print(soup.prettify())   #debug code
    
        # Extract title, description, and link for each news item
        for item in items:
            news_item = {}
            news_item['title'] = item.title.text.strip()
            news_item['description'] = re.sub(r'^https?:\/\/.*[\r\n]*', '', item.description.text, flags=re.MULTILINE).strip() #item.description.text
            news_item['date'] = date_string
            news_item['link'] = item.link.text
            news_item['key'] = str(key)  # news src key for sorting later
            news_item['sterm'] = 'none'
            news_item['polarity'] = str(0)    # add place holders for data mining
            news_item['subjectivity'] = str(0)
            news_item['text'] = "none"
            
            for sterm in searchtermlist:
                tmpstr = item.title.text.lower()+" "+item.description.lower()
               
                if sterm != "*" and tmpstr.find(sterm.lower()) > 0:
                    news_item['text'] = scrapeText(item.link.text)
                    news_item['sterm'] = sterm
                    news_items.append(news_item)
                    break
                if sterm == "*":
                    news_items.append(news_item)
                    break
                    
            
        print("======== "+str(key)+" news items read = "+str(len(items))+" ") #debug report
    return news_items

from newspaper import Article    # this is a better tool for capturing article text bodies than beautiful soup

# Function to scrape the story text from a news webpage using newspaper library 
def scrapeText(link):
    try: 
        test_article = Article(link, language="en", timeout = 20)
        test_article.download()
        test_article.parse()
        text = test_article.text
    except:
        text = "*** scrapeText exception: "+link
        print(text)
        pass
    
    return text

# Function to pickle news_items list    
def saveNewsItems(news_items, fname):    
    # save the news items
    pickle.dump(news_items, open(fname, "wb"))
    print("==== File:   " + fname + " ... Pickled with "+str(len(news_items))+" stories written.")
    return

# Function to display news items library
#  uses flag parameter to limit output 
#  0 = just news sources
#  1 = news source + title
#  2 = news source + title + description
#  3 = news source + title + description + text
def displayNews(news_items, flag):
    for item in news_items:
        print("----"+item['key']+"----")
        if(flag >= 1):
            print("[TITLE] "+item["key"]+" - "+item["title"])
        if(flag >= 2):
            print("[DESC] "+item['description']) 
        if(flag >= 3):
            print("[TEXT] "+item['text'])   
    return



In [9]:
# Get user search terms list
print("Enter pre-filter search terms separated by spaces (* for all, Warning this option may be time consuming)")
inputstring = input("Enter search terms: ")
searchtermlist = inputstring.split()




Enter search terms:  covid


In [10]:
# Main code body
print("    Note: Web scraping takes time so stand by...")
news_items = []   # list for storing all news items

news_items = scrapeFeeds(rssurls, searchtermlist)
    
# use date and search terms to generate unique filename for pickle file
now = datetime.datetime.now()
date_string = now.strftime('%Y-%m-%d')

for sterm in searchtermlist:
    if sterm != '*':
        fname = "news/news_"+sterm+"_"  # path/filename for searched items with article text body scraped
        
    else:
        fname = "rss/rss_all"+"_"       # path/filename for just base rss feeds

fname = fname+date_string+".p"        
saveNewsItems(news_items, fname)    # save RSS feed or searched news articles into a file for the current day

  

    Note: Web scraping takes time so stand by...
==== File:   news/news_covid_2020-11-21.p ... Pickled with 34 stories written.


In [26]:
news_items

[{'title': 'Op-Ed: Why the U.S. election still could represent a triumph of American democracy',
  'description': "Joe Biden will become the 46th president of the United States. He'll do so with the largest number of votes ever cast for any American presidential candidate.",
  'date': '2020-11-09',
  'link': 'https://www.cnbc.com/2020/11/07/op-ed-why-the-us-election-still-could-represent-a-triumph-of-american-democracy.html',
  'key': 'cnbc',
  'sterm': 'election',
  'polarity': '0',
  'subjectivity': '0',
  'text': 'People react as media announce that Democratic U.S. presidential nominee Joe Biden has won the 2020 U.S. presidential election, in Los Angeles, California, U.S., November 7, 2020. Patrick T. Fallon | Reuters\n\nIt is now likely that former Vice President Joe Biden will become the 46th president of the United States. He\'ll do so with the largest number of votes ever cast for any American presidential candidate in history after an electoral-turnout rate that was the highest