In [168]:
####################################Imports and Initial Feed Rip###################################################



#Requires PySpark, installation information can be found here: https://www.dataquest.io/blog/installing-pyspark/
#import PySpark as ps 
import feedparser as fp, pandas as pd, datetime
from bs4 import BeautifulSoup 
from functools import reduce

#Download and parse the Huffington Post RSS feed, then keep only the list of
#news entries that the parsed result stores under the 'items' key
feed = fp.parse(
    'http://feeds.huffingtonpost.com/c/35496/f/677097/index.rss'
)
feed_items = feed['items']

###################################################################################################################

In [169]:
#################################Create the Cleaned News Items DataFrame###########################################

#Load the raw feed entries into a DataFrame (one row per news item)
raw_news_dataframe = pd.DataFrame(data=feed_items)

#Clean the DataFrame
def dataframe_cleaner(dataframe):
    """Return a cleaned copy of the news items with HTML stripped from summaries.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw news items; must contain the columns 'author', 'id', 'title'
        and 'summary'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only those four columns, where each 'summary'
        has been parsed with BeautifulSoup ('lxml' parser) and replaced by
        its plain text.
    """
    #Work on the frame that was passed in (not a notebook global) and take an
    #explicit copy so the caller's data is never written to
    being_cleaned_df = dataframe[['author','id','title','summary']].copy()

    #Assign the whole column at once: avoids chained-indexing writes and does
    #not depend on the index labels being 0..n-1
    being_cleaned_df['summary'] = [
        BeautifulSoup(raw_summary,'lxml').get_text()
        for raw_summary in being_cleaned_df['summary']
    ]

    return being_cleaned_df
    
#Build the cleaned news DataFrame and preview its first row
news_df = dataframe_cleaner(dataframe=raw_news_dataframe)
news_df.head(n=1)

###################################################################################################################

Unnamed: 0,author,id,title,summary
0,Natalia Brzezinski,http://www.huffingtonpost.com/natalia-lopatniu...,A Brilliant Mind: Founding Tech Companies with...,There's wide global awareness today that we ne...


In [170]:
###############################Import, Clean and DataFrame the Sentiment Dictionary################################

#Import the Sentiment Dictionary
#The context manager guarantees the file handle is closed once the text has been read
with open('sentiment_dictionary.tff','r') as f:
    raw_sentiment_dictionary = f.read()

#Cleanse the Dictionary and create a Raw Sentiment DataFrame
def dictionary_to_dataframe(dictionary):
    """Split the raw dictionary text into a DataFrame of unparsed tokens.

    Each line of `dictionary` is cut at the first comma, and the part before
    it is split on single spaces. Tokens 0, 2, 3, 4 and 5 of every line are
    kept and labelled 'type', 'word', 'position', 'stemmed' and 'polarity'.
    Lines with fewer tokens are padded with missing values by pandas.
    """
    token_rows = []
    for line in dictionary.split('\n'):
        before_comma = line.split(',')[0]
        token_rows.append(before_comma.split(" "))

    token_frame = pd.DataFrame(token_rows)

    #Token 1 is not used, so it is left out of the selection
    selected = token_frame[[0,2,3,4,5]]
    selected.columns = ['type','word','position','stemmed','polarity']

    return selected


#Create a Totally Cleaned Sentiment Dataframe
def sentiment_dataframe_maker(file):
    """Build the fully cleaned, numeric sentiment DataFrame.

    Parameters
    ----------
    file : str
        Raw text of the sentiment dictionary; it is handed to
        `dictionary_to_dataframe` for tokenising.

    Returns
    -------
    pandas.DataFrame
        Columns 'type', 'word', 'position', 'stemmed', 'polarity' where
        'type' is 0.5 ('weaksubj') or 1 ('strongsubj') and 'polarity' is
        -1 ('negative'), 0 ('neutral') or 1 ('positive'). Rows with any other
        type or polarity (e.g. 'both', 'm') are dropped. The index is reset
        to 0..n-1.
    """
    #Call another function to turn the file into a raw dataframe; copy after
    #dropna so the column assignments below write to an independent frame
    raw_df = dictionary_to_dataframe(file).dropna().copy()

    #Clean up the strings within the cells: keep only the text after the last '='
    for column in ['type','word','position','stemmed','polarity']:
        raw_df[column] = raw_df[column].map(lambda cell: cell.split('=')[-1])

    #Convert Type and Polarity to numerical values.
    #Series.map turns every label missing from the mapping into NaN, which the
    #dropna below removes. This avoids Series.replace(key, None), whose handling
    #of a None value differs between pandas versions (some fill from the
    #previous row instead of inserting a missing value).
    type_weights = {'weaksubj': .5, 'strongsubj': 1}
    polarity_values = {'negative': -1, 'positive': 1, 'neutral': 0}

    raw_df['type'] = raw_df['type'].map(type_weights)
    raw_df['polarity'] = raw_df['polarity'].map(polarity_values)

    #Reset the index so that label-based lookups with 0..n-1 remain valid after
    #rows have been removed
    raw_df = raw_df.dropna().reset_index(drop=True)
    raw_df['polarity'] = raw_df['polarity'].astype(int)

    return raw_df
    

#Build the cleaned sentiment DataFrame from the raw dictionary text and preview its first row
sentiment_df = sentiment_dataframe_maker(file=raw_sentiment_dictionary)
sentiment_df.head(n=1)

###################################################################################################################

Unnamed: 0,type,word,position,stemmed,polarity
0,0.5,abandoned,adj,n,-1


In [175]:
##########################Perform Sentiment Analysis on the Title and Body of the Articles#########################
def _text_sentiment(text, lexicon=None):
    """Sum the sentiment scores of the distinct space-separated tokens in `text`.

    A token's score is the sum of type * polarity over every lexicon row whose
    'word' equals the lower-cased token. Tokens are de-duplicated before
    lower-casing, so 'Good' and 'good' are counted separately while a repeated
    'good' is counted once.
    """
    if lexicon is None:
        #Default to the notebook-level sentiment dictionary
        lexicon = sentiment_df

    #Build one word -> total score table instead of scanning every lexicon row
    #for every token. Grouping by the raw values makes this purely positional,
    #so it also works when the lexicon index has gaps.
    weighted = pd.to_numeric(lexicon['type']) * pd.to_numeric(lexicon['polarity'])
    score_by_word = weighted.groupby(lexicon['word'].values).sum()

    unique_tokens = dict.fromkeys(text.split(' '))
    return sum(score_by_word.get(token.lower(), 0) for token in unique_tokens)


def title_analysis(title, lexicon=None):
    """Return the sentiment score of an article title.

    Parameters
    ----------
    title : str
        Title text; it is tokenised by splitting on single spaces.
    lexicon : pandas.DataFrame, optional
        Frame with 'word', 'type' and 'polarity' columns. Defaults to the
        notebook-level `sentiment_df`.

    Returns
    -------
    number
        Sum of type * polarity over all lexicon matches; 0 when nothing matches.
    """
    return _text_sentiment(title, lexicon)


def summary_analysis(summary, lexicon=None):
    """Return the sentiment score of an article summary.

    Parameters
    ----------
    summary : str
        Summary text; it is tokenised by splitting on single spaces.
    lexicon : pandas.DataFrame, optional
        Frame with 'word', 'type' and 'polarity' columns. Defaults to the
        notebook-level `sentiment_df`.

    Returns
    -------
    number
        Sum of type * polarity over all lexicon matches; 0 when nothing matches.
    """
    return _text_sentiment(summary, lexicon)

def sentiment_analysis(dataframe):
    """Return a copy of `dataframe` with title and summary sentiment columns added.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        News items with 'title' and 'summary' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with two extra columns: 'title_sentiment' (from
        `title_analysis`) and 'summary_sentiment' (from `summary_analysis`).
    """
    #Copy first: a plain assignment would only alias the caller's frame, and the
    #new columns would then silently appear on it as well
    analyzing_df = dataframe.copy()
    analyzing_df['title_sentiment'] = [title_analysis(title) for title in analyzing_df['title']]
    analyzing_df['summary_sentiment'] = [summary_analysis(summary) for summary in analyzing_df['summary']]
    return analyzing_df

#Record the wall-clock time before and after the analysis so the cost of the run is visible
start_time = str(datetime.datetime.now().time())
analyzed_articles_df = sentiment_analysis(news_df)
finish_time = str(datetime.datetime.now().time())
print(start_time,finish_time)

analyzed_articles_df.head()

###################################################################################################################

Start Time: 15:58:12.326582


Unnamed: 0,author,id,title,summary,title_sentiment,summary_sentiment
0,Natalia Brzezinski,http://www.huffingtonpost.com/natalia-lopatniu...,A Brilliant Mind: Founding Tech Companies with...,There's wide global awareness today that we ne...,2.5,58.5
1,Casey Williams,http://www.huffingtonpost.com/2016/03/25/low-i...,"For Poor Americans, Getting Online Is About Mo...",Internet access could soon get a whole lot ch...,0.5,5.5
2,Charles Edge,http://www.huffingtonpost.com/charles-edge/the...,The Importance Of Having A Good Content Strate...,Search engine optimization (SEO) involves stra...,2.5,36.0
3,Kirill Chekanov,http://www.huffingtonpost.com/kirill-chekanov/...,The Wearable Generation,"Hey HuffPost, my name is Kirill, I'm a 20 y.o....",0.0,7.5
4,Laura Dambrosio,http://www.huffingtonpost.com/laura-dambrosio/...,Machine Learning as a Service: How Data Scienc...,Machine learning is an enigma to most. For dec...,0.5,39.0
