In [5]:
#  CSC6740 Term Project
#  Georgia State University - Fall 2020
#  William Keith Dobson

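# Scrapes a set of news RSS feeds, keeps the stories whose titles match the user's
# search terms, and pickles the resulting list of news items for later data mining.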

# Library dependencies
import requests
from bs4 import BeautifulSoup
import pickle
import datetime 
import re

# Create dictionary of news rss feeds
# Some of the rss feeds were removed due to embedded href= links in text tags
# until a satisfactory filtering method is found.

# Feed name -> RSS feed URL for every news source that gets scraped.
# Commented-out entries are feeds that are kept for reference but currently disabled.
rssurls = {
    'telegraph': 'https://www.dailytelegraph.com.au/news/world/rss',
    'cnbc': 'https://www.cnbc.com/id/100727362/device/rss/rss.html',
    'cnn': 'http://rss.cnn.com/rss/cnn_topstories.rss',
    # 'guardian': 'http://www.theguardian.com/world/usa/rss',
    # 'aljazeera': 'http://www.aljazeera.com/xml/rss/all.xml',
    # 'csmonitor': 'https://rss.csmonitor.com/feeds/world',
    # 'cbn': 'https://www1.cbn.com/app_feeds/rss/news/rss.php?section=world&mobile=false&q=cbnnews/world/feed',
    'washtimes': 'http://www.washingtontimes.com/rss/headlines/news/world',
    'bbc': 'http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml',
    'nyt': 'http://feeds.nytimes.com/nyt/rss/HomePage',
    'npr': 'http://www.npr.org/rss/rss.php?id=1001',
    'washpost': 'http://feeds.washingtonpost.com/rss/world',
    # 'rtnews': 'https://www.rt.com/rss/news/',
    'nbcnews': 'http://feeds.nbcnews.com/nbcnews/public/news',
    # Must be the Wall Street Journal's own feed: pointing this entry at the nbcnews
    # URL would scrape the NBC stories a second time and label them 'wsj'.
    'wsj': 'https://feeds.a.dj.com/rss/RSSWorldNews.xml',
    'foxnews': 'http://feeds.foxnews.com/foxnews/latest?format=xml',
    'cbsnews': 'https://www.cbsnews.com/latest/rss/world',
}


# Function to scrape titles, descriptions, and links from a dictionary of rss feeds, using a
# search term list to filter the results, then return a list of news items.  Note that the search
# looks at story titles only and uses OR logic, so if any of the search terms is present in the
# title the story is added to the list.

def _feedTagText(tag):
    # Text of a parsed feed tag, or '' when the item has no such tag.
    return tag.text if tag is not None else ''


def scrapeFeeds(rssurls, searchtermlist):
    """Download each RSS feed and collect the stories whose title matches a search term.

    Args:
        rssurls: dict mapping a source name (e.g. 'cnn') to that source's RSS feed URL.
        searchtermlist: list of search strings.  Matching is a case-insensitive
            substring test against the story title only and uses OR logic; the
            term "*" accepts every story.  Empty strings are ignored.

    Returns:
        A list with one dict per matching story, holding the keys 'title',
        'description', 'link', 'key' (the source name), and the placeholder
        strings 'polarity' and 'subjectivity' (both '0').  A tag that is missing
        from a feed item yields an empty string for that field.

    Raises:
        requests.exceptions.RequestException: if a feed cannot be fetched, including
            when it does not respond within the timeout.
    """
    # "*" anywhere in the list accepts every story; the remaining terms are
    # lower-cased once here instead of once per story.
    match_all = "*" in searchtermlist
    terms = [sterm.lower() for sterm in searchtermlist if sterm and sterm != "*"]

    news_items = []   # list for storing all news items
    for key, url in rssurls.items():
        # The timeout keeps one unresponsive feed from hanging the whole run.
        resp = requests.get(url, timeout=30)
        soup = BeautifulSoup(resp.content, 'xml')
        items = soup.find_all('item')

        # Extract title, description, and link for each matching news item
        for item in items:
            title = _feedTagText(item.title)
            if not match_all:
                lowered_title = title.lower()
                # A title that *starts* with the term (match at index 0) must count too,
                # so test membership rather than comparing a find() position with 0.
                if not any(term in lowered_title for term in terms):
                    continue

            news_item = {}
            news_item['title'] = title.strip()
            # Drop any line of the description that is just a URL.
            news_item['description'] = re.sub(
                r'^https?:\/\/.*[\r\n]*', '', _feedTagText(item.description), flags=re.MULTILINE
            ).strip()
            news_item['link'] = _feedTagText(item.link)
            news_item['key'] = str(key)  # add key for sorting later
            news_item['polarity'] = str(0)    # add place holders for data mining
            news_item['subjectivity'] = str(0)
            news_items.append(news_item)

        print("======== "+str(key)+" news items read = "+str(len(items))+" ") #debug report

    return news_items

# Function to pickle news_items list    
def saveNewsItems(news_items, fname):
    """Pickle the list of news items to the file `fname` and print a short report.

    Args:
        news_items: list of news items to serialize.
        fname: path of the pickle file to write; an existing file is overwritten.

    Returns:
        None.
    """
    # save the news items; the context manager closes (and flushes) the file
    # even if pickling fails part-way.
    with open(fname, "wb") as pickle_file:
        pickle.dump(news_items, pickle_file)
    print("==== File:   " + fname + " ... Pickled with "+str(len(news_items))+" stories written.")
    return

# Function to display news items library
#  uses flag parameter to limit output 
#  0 = just news sources
#  1 = news source + title
#  2 = news source + title + description
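#  3 = news source + title + description + the description again with a "[DESC] " prefix
#      (the news items built by scrapeFeeds carry no separate article-text field)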
def displayNews(news_items, flag):
    """Print the news items, with `flag` controlling how much of each one is shown.

    Every item always gets a "----<key>----" header line.  On top of that:
      flag >= 1 adds "<key> - <title>",
      flag >= 2 adds the description,
      flag >= 3 adds the description once more, prefixed with "[DESC] ".

    Args:
        news_items: iterable of dicts with 'key', 'title' and 'description' strings.
        flag: verbosity level as described above.

    Returns:
        None.
    """
    for story in news_items:
        source = story['key']
        print("----" + source + "----")
        if flag >= 1:
            print(story["key"] + " - " + story["title"])
            if flag >= 2:
                print(story["description"])
                if flag >= 3:
                    print("[DESC] " + story['description'])
    return

import copy

# Function to combine the description text from multiple stories from the same source
# into single text strings.
def combineNews(news_items):
    """Merge each run of consecutive stories from the same source into one item.

    Stories are grouped by their 'key' value.  Only adjacent stories are merged, so
    the input is expected to be ordered by source (as a feed-by-feed scrape produces).

    Args:
        news_items: list of news item dicts with at least the keys 'key', 'title',
            'description', 'polarity' and 'subjectivity'.  The list and its dicts
            are not modified.

    Returns:
        A list with one dict per run of same-key stories.  'title' and 'description'
        are the stripped texts of the run joined with single spaces, 'polarity' and
        'subjectivity' come from the last story of the run, and every other field
        (e.g. 'link') comes from the first story of the run.  An empty input gives
        an empty list.
    """
    cnews_items = []
    combined = None   # item being accumulated for the current source
    for item in news_items:
        if combined is not None and item['key'] == combined['key']:
            # Same source as the previous story: append its text to the running item.
            combined['title'] = combined['title'] + ' ' + item['title'].strip()
            combined['description'] = combined['description'] + ' ' + item['description'].strip()
        else:
            # New source: emit the finished item and start a fresh one from this
            # story, so each story's text is counted exactly once.
            if combined is not None:
                cnews_items.append(combined)
            combined = copy.deepcopy(item)
            combined['title'] = item['title'].strip()
            combined['description'] = item['description'].strip()

        combined['polarity'] = item['polarity']
        combined['subjectivity'] = item['subjectivity']

    if combined is not None:
        cnews_items.append(combined)  # get last combined item
    return cnews_items

In [2]:
# Get user search terms list
# The answer is split on whitespace (split() with no arguments also drops empty
# entries), giving the list of terms that is later passed to scrapeFeeds.
# Entering "*" selects every story.
inputstring = input("Enter search terms separated by spaces (* for all): ")
searchtermlist = inputstring.split()


Enter search terms separated by spaces (* for all):  *


In [3]:
# Main code body: scrape every configured feed, then pickle the matching stories.

news_items = []   # list for storing all news items (reset before scraping)
news_items = scrapeFeeds(rssurls, searchtermlist)

# The pickle file is named after the search terms and today's date,
# e.g. news_all_2020-11-03.p when the only term is "*".
now = datetime.datetime.now()
date_string = now.strftime('%Y-%m-%d')
fname = "news"
for sterm in searchtermlist:
    fname += "_" + ("all" if sterm == '*' else sterm)
fname += "_" + date_string + ".p"

saveNewsItems(news_items, fname)

  

==== File:   news_all_2020-11-03.p ... Pickled with 419 stories written.


In [6]:
# Print every scraped story at the most verbose level (flag 3).
# NOTE(review): the recorded output below stops with KeyError: 'text', but the
# displayNews definition in this notebook never reads a 'text' key, so that output
# appears to come from an earlier version of the function -- re-run this cell.
displayNews(news_items, 3)

----telegraph----
telegraph - Melania ‘only person not wearing a mask’
Welcome to our live coverage of the US presidential election.


KeyError: 'text'

In [45]:
# Experiment: fetch one Daily Telegraph article page directly and collect its <p> tags.
# NOTE(review): per the recorded output, get_text() returns only a subscription stub
# and some CSS, so this page does not seem to expose the article body to a plain GET.
nurl = 'https://www.dailytelegraph.com.au/news/world/us-election-2020-donald-trump-v-joe-biden-as-america-votes/news-story/c1e804872a9c378245242ee575390541'
resp = requests.get(nurl)
soup = BeautifulSoup(resp.content, 'html.parser')
# All <p> elements found in the parsed page.
article = soup.findAll('p')
# Last expression of the cell: shows the page's visible text as the cell output.
soup.get_text()

'\n\n\nDailytelegraph.com.au | Subscribe to The Daily Telegraph for exclusive stories\n\n\n\n\n\n\n\n\n\r\n    html, body {\r\n        margin: 0;\r\n        padding: 0;\r\n\t\toverflow-x: hidden!important;\r\n    }\r\n    .grecaptcha-badge {\r\n        display: none;\r\n    }\r\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'

In [46]:
# Number of <p> tags collected from the article page (the recorded output is 0).
len(article)

0

In [48]:
# Experiment: retry the same article URL with the third-party `newspaper` package,
# which downloads the page and parses out the article text.
# NOTE(review): the recorded output below is blank, so no article text appears to
# have been extracted for this URL -- confirm on a fresh run.
from newspaper import Article 
nurl = 'https://www.dailytelegraph.com.au/news/world/us-election-2020-donald-trump-v-joe-biden-as-america-votes/news-story/c1e804872a9c378245242ee575390541'
test_article = Article(nurl, language="en")
test_article.download()
test_article.parse()
print(test_article.text)




In [14]:
# Inspect a single scraped item to check its fields
# (title, description, link, key, polarity, subjectivity).
news_items[1]

{'title': 'Biden’s lack of presence no match for Trump fever',
 'description': 'For a little while there over the weekend, Joe Biden seemed to kickstart back to life.',
 'link': 'https://www.dailytelegraph.com.au/news/world/analysis-bidens-lack-of-presence-no-match-for-trump-fever/news-story/caaf087fa417c1d2a92be3584b68421f',
 'key': 'telegraph',
 'polarity': '0',
 'subjectivity': '0'}