# Sentiment Analysis
This file is used for performing sentiment analysis, both on an article as a whole, and on sentences that contain the main topic(s) related to that article for topic-based sentiment.

# Main Pipeline Function

In [1]:
def main_pipeline_sentiment_analysis(urls):
    """
    Build the article-level sentiment dataframe for a list of URLs.

    Every URL is scraped with scrapeData() and the scraped text is handed to
    sentimentAnalysis(), which accumulates one entry per article in a shared
    dictionary. After all URLs have been processed, the dictionary is converted
    with dictionaryToDataFrame() and then passed through
    drop_failed_webscraping_rows() (which, per its name, removes the rows whose
    scraping failed) before the dataframe is returned.
    """
    # Accumulator shared by every sentimentAnalysis() call; it starts empty so
    # that the first call creates the column lists.
    results = {}

    for index, article_url in enumerate(urls):
        # Crude progress indicator: report on every tenth article.
        if index % 10 == 0:
            print(str(index), " Articles Completed")

        # Scrape the article, then fold its sentiment data into the accumulator.
        article_text = scrapeData(article_url)
        results = sentimentAnalysis(article_text, results, article_url)

    # Convert the accumulated columns to a dataframe and clean it up.
    return drop_failed_webscraping_rows(dictionaryToDataFrame(results))

# Article Level Sentiment Analysis
This function performs sentiment analysis on the text of a single document. The analysis is done with TextBlob, accessed through a spaCy pipeline.

In [None]:
def _sentiment_label(score):
    """
    Map a polarity score to one of the five sentiment labels.

    The thresholds are evaluated from the most positive bucket downwards so
    that every score receives a label: exactly 0.143 falls into
    "Neutral Positive" and exactly -0.062 falls into "Negative".
    """
    if score > 0.143:
        return "Positive"
    if score > 0.043:
        return "Neutral Positive"
    if score >= 0.033:
        return "Neutral"
    if score > -0.062:
        return "Neutral Negative"
    return "Negative"


def sentimentAnalysis(text, dictionary, url):
    """
    Run sentiment analysis on one article and record the result.

    Parameters:
        text: the scraped article text. A value starting with "PARERROR"
            signals that scraping failed; no analysis is run in that case.
        dictionary: the accumulator of column lists. When it is empty, a new
            dictionary with the expected columns is created.
        url: the URL of the article.

    For a successfully scraped article, the polarity (sentiment) and
    subjectivity scores are read from the TextBlob results that the `nlp`
    pipeline (defined elsewhere in this file) exposes at `doc._.blob`, rounded
    to two decimals, and labelled Positive / Neutral Positive / Neutral /
    Neutral Negative / Negative. The words TextBlob assessed as positive or
    negative are stored as comma-separated strings, de-duplicated and sorted so
    the output is reproducible between runs.

    For a failed scrape, the error text is stored as the label, both scores
    are 0.0 and the word lists are empty.

    Returns the dictionary with exactly one new entry in every column.
    """
    if not dictionary:
        dictionary = {
            "URL": [],
            "Sentiment Score": [],
            "Sentiment Label": [],
            "Subjectivity Score": [],
            "Positive Words": [],
            "Negative Words": [],
            "Text": []
            }

    positive_words = set()
    negative_words = set()

    if text.startswith("PARERROR"):
        # Scraping failed: keep the error text as the label so the row can be
        # recognised later.
        sent_label = text
        sentiment = 0.0
        subjectivity = 0.0
    else:
        # Everything is computed before any column is touched, so an exception
        # raised here cannot leave the columns with different lengths.
        blob = nlp(text)._.blob
        sentiment = round(blob.polarity, 2)
        subjectivity = round(blob.subjectivity, 2)
        sent_label = _sentiment_label(sentiment)

        # Each assessment starts with (words, polarity, ...); the first word
        # is filed under the sign of its polarity, and zero is ignored.
        for assessment in blob.sentiment_assessments.assessments:
            if assessment[1] > 0:
                positive_words.add(assessment[0][0])
            elif assessment[1] < 0:
                negative_words.add(assessment[0][0])

    # Single place where the row is appended, shared by both outcomes.
    dictionary["URL"].append(url)
    dictionary["Sentiment Label"].append(sent_label)
    dictionary["Sentiment Score"].append(sentiment)
    dictionary["Subjectivity Score"].append(subjectivity)
    dictionary["Text"].append(text)
    dictionary["Positive Words"].append(', '.join(sorted(positive_words)))
    dictionary["Negative Words"].append(', '.join(sorted(negative_words)))

    return dictionary

# Topic Level Sentiment
This code block is used to perform sentiment analysis based on the topic word(s) of an article. It performs sentiment analysis on the sentences that contain the topic word(s).

In [None]:
#returns a dictionary of all topics, with all their associated topic words in the form {Topic_num: [words]}
def create_topic_words_dict(ldamodel):
    """
    Build a {topic number: [words]} dictionary from the LDA model.

    For every topic index from 0 up to ldamodel.num_topics, the model is asked
    for its top 10 (word, score) pairs via show_topic(); only the words are
    kept, in the order the model returns them.
    """
    topic_words = {}
    for topic_id in range(ldamodel.num_topics):
        top_terms = ldamodel.show_topic(topic_id, topn=10)
        # Drop the score, keep the word.
        topic_words[topic_id] = [term for term, _weight in top_terms]

    return topic_words


def get_sentences(doc):
    """
    Taking in a Spacy doc object, returns all sentences of the doc as a list.

    `doc.sents` is materialised into a real list so the result matches this
    contract: it can be iterated more than once, indexed, and measured with
    len(), none of which a one-shot iterator would allow.
    """
    return list(doc.sents)


def sentence_sentiment_from_doc(doc):
    """
    Score every sentence of a Spacy doc object individually.

    Each sentence obtained from get_sentences() is run through the `nlp`
    pipeline on its own text, and the polarity found at `_.blob.polarity` of
    that per-sentence doc is paired with the sentence text. Returns a list of
    (sentence text, sentiment value) tuples in sentence order.
    """
    return [
        (span.text, nlp(span.text)._.blob.polarity)
        for span in get_sentences(doc)
    ]


def sentence_sentiment_on_topics(doc, topic_list):
    """
    Compute the average sentence-level sentiment of every topic for one article.

    Parameters:
        doc: a Spacy doc object for the article.
        topic_list: a topic word dictionary of the form {topic: [words]}
            (as produced by create_topic_words_dict()).

    For every topic, each of its words is looked up in every sentence of the
    doc using a plain case-sensitive substring test. Each (word, sentence) hit
    contributes that sentence's sentiment value, so a sentence that contains
    several of the topic's words is counted once per word. The topic's score is
    the mean of the values collected for that topic alone; a topic with no
    hits scores 0.

    Returns a dictionary {topic: average sentiment} for this single article.
    """
    scored_sentences = sentence_sentiment_from_doc(doc)  # [(text, sentiment)]
    topic_scores = {}

    for topic_id, words in topic_list.items():
        # Start from an empty list for every topic so that the scores of
        # earlier topics do not leak into this topic's average.
        matches = []
        for word in words:
            for sentence_text, polarity in scored_sentences:
                if word in sentence_text:
                    matches.append(polarity)

        topic_scores[topic_id] = sum(matches) / len(matches) if matches else 0

    return topic_scores


def topic_sentence_sentiment_analysis(df, LDA_model, corpus):
    """
    Compute the localized (topic-level) sentiment of every article in `df`.

    Parameters:
        df: the main sentiment analysis dataframe; its "Text" column holds
            the article texts.
        LDA_model: the LDA model the topic words are read from.
        corpus: the LDA model corpus. It is accepted for interface
            compatibility but is not used by this function.

    Each article text is turned into a Spacy doc object with `nlp` and scored
    with sentence_sentiment_on_topics() against the topic words obtained from
    create_topic_words_dict().

    Returns a dictionary of dictionaries: each key is the 0-based position of
    an article (row) in `df`, and each value is that article's
    {topic: sentiment value} dictionary.
    """
    topic_words = create_topic_words_dict(LDA_model)  # {topic: [words]}
    topic_sentiments = {}

    # Iterating the column directly avoids building a full row object with
    # df.iloc for every article just to read a single cell.
    for position, page_text in enumerate(df["Text"]):
        article_doc = nlp(page_text)
        topic_sentiments[position] = sentence_sentiment_on_topics(article_doc, topic_words)

    return topic_sentiments

In [None]:
def word_sentiment_per_doc(word, text):
    """
    DEPRECATED FUNCTION (not in use). Takes in a word (string) and the raw text
    of an article, and returns the localized sentiment value of that word: the
    mean sentiment of all sentences that contain the word as a substring.
    Returns None when no sentence contains the word. Similar to
    sentence_sentiment_on_topics(), but for a single word.
    """
    scored_sentences = sentence_sentiment_from_doc(nlp(text))  # [(text, sentiment)]

    running_total = 0
    hits = 0
    for sentence_text, polarity in scored_sentences:
        if word not in sentence_text:
            continue  # the word does not occur in this sentence
        running_total += polarity
        hits += 1

    if hits == 0:
        return None
    return running_total / hits

def topic_sentiment_per_doc(topics, text):
    """
    DEPRECATED FUNCTION (not in use). Takes in a list of (topic id, topic)
    tuples, where each topic is a sequence of (word, relevance score) pairs,
    together with the text of an article. Returns a list of
    (topic id, weighted sentiment) tuples for that article.
    """
    results = []

    for topic_id, weighted_words in topics:
        topic_total = 0
        for word, relevance in weighted_words:
            local_sentiment = word_sentiment_per_doc(word, text)
            if local_sentiment is None:
                continue  # the word never appears in the article
            # Weight the word's sentiment by its relevance to the topic.
            topic_total += relevance * local_sentiment
        results.append((topic_id, topic_total))

    return results