In [1]:
#Import libraries for the function
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
import numpy as np
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from statistics import mean
import pandas as pd


def filter_subj_sentences(text: str,
                          min_sentences: int = 50,
                          min_subjectivity: float = 0.2,
                          max_subjectivity: float = 0.85) -> list:

    """Split 'text' into sentences and keep the moderately subjective ones.

    The text is tokenized with NLTK's sent_tokenize. When it has more than
    'min_sentences' sentences, only those whose TextBlob subjectivity lies
    strictly between 'min_subjectivity' and 'max_subjectivity' are kept.
    Shorter texts are returned unfiltered.

    Parameters
    ----------
    text : the speech to tokenize and filter.
    min_sentences : texts with this many sentences or fewer are not filtered.
    min_subjectivity : exclusive lower bound on sentence subjectivity.
    max_subjectivity : exclusive upper bound on sentence subjectivity.

    Returns
    -------
    list of sentence strings, in their original order. The list can be empty
    if no sentence of a long text falls inside the subjectivity range.
    """

    sentences = sent_tokenize(text)

    #Short speeches are returned as they are (no filtering)
    if len(sentences) <= min_sentences:
        return sentences

    #Keep only the sentences whose subjectivity is inside the open interval.
    #TextBlob(...).sentiment is a (polarity, subjectivity) pair.
    return [
        sentence
        for sentence in sentences
        if min_subjectivity < TextBlob(sentence).sentiment.subjectivity < max_subjectivity
    ]

#TEST THE FUNCTION
#assert type(filter_subj_sentences(text)) == list

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\diego\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!




In [2]:
def get_sentimental_values(sent_tokenize_list: list, top_n: int = 25) -> list:

    """Average the strongest positive and negative VADER sentence scores.

    Every sentence is scored with SentimentIntensityAnalyzer.polarity_scores.
    The 'top_n' highest 'pos' scores are averaged, and independently the
    'top_n' highest 'neg' scores are averaged. When there are fewer than
    'top_n' sentences, all of them are used.

    Parameters
    ----------
    sent_tokenize_list : list of sentence strings.
    top_n : how many of the highest scores to average (default 25).

    Returns
    -------
    [average positive score, average negative score]. Both values are NaN
    when 'sent_tokenize_list' is empty, because there is nothing to average.

    Raises
    ------
    ValueError if 'top_n' is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    #mean() raises StatisticsError on empty data, so report "no score" instead
    if len(sent_tokenize_list) == 0:
        return [float("nan"), float("nan")]

    sid = SentimentIntensityAnalyzer()
    pos_scores = []
    neg_scores = []

    #Score every sentence once and keep its positive and negative parts
    for sentence in sent_tokenize_list:
        scores = sid.polarity_scores(sentence)
        pos_scores.append(scores['pos'])
        neg_scores.append(scores['neg'])

    #Highest scores first, then keep at most top_n of them
    top_pos = sorted(pos_scores, reverse=True)[:top_n]
    top_neg = sorted(neg_scores, reverse=True)[:top_n]

    #Return a list with [average positive score, average negative score]
    return [mean(top_pos), mean(top_neg)]

In [3]:
def sentimental_analysis(dataframe):

    """Add 'positive', 'negative' and 'compound' score columns per speech.

    For every entry of dataframe['speeches'] the speech is filtered with
    filter_subj_sentences, then:
      * 'positive' / 'negative' come from get_sentimental_values on the
        filtered sentences;
      * 'compound' is the mean VADER compound score of the filtered sentences.

    Empty speeches (None or zero length), and speeches with no sentence left
    after filtering, get NaN in all three columns. This keeps the score lists
    the same length as the dataframe, so the column assignment cannot fail
    with a length mismatch.

    The input dataframe is modified in place and also returned.
    """

    #Create empty lists for the function
    positive_scores = []
    negative_scores = []
    compound_scores = []

    missing = float("nan")
    sid = None  #built on first use, so frames with nothing to score skip it

    #For loop to evaluate every speech
    for row in dataframe['speeches']:

        #Empty speeches are not tokenized at all
        is_empty = row is None or (hasattr(row, '__len__') and len(row) == 0)
        sentences = [] if is_empty else filter_subj_sentences(row)

        #Nothing to score: keep the row aligned with a NaN placeholder
        if len(sentences) == 0:
            positive_scores.append(missing)
            negative_scores.append(missing)
            compound_scores.append(missing)
            continue

        #Filter once and score once per speech
        positive, negative = get_sentimental_values(sentences)
        positive_scores.append(positive)
        negative_scores.append(negative)

        #Calculate the average compound of the speech
        if sid is None:
            sid = SentimentIntensityAnalyzer()
        compound_scores.append(
            mean([sid.polarity_scores(sentence)['compound'] for sentence in sentences])
        )

    #Add columns to the input dataframe
    dataframe['positive'] = positive_scores
    dataframe['negative'] = negative_scores
    dataframe['compound'] = compound_scores

    #return the new dataframe
    return dataframe

In [4]:
#Load the speeches; sentimental_analysis below reads the 'speeches' column.
#NOTE: unpickling can execute arbitrary code, so only load a pickle file
#that comes from a trusted source.
data_for_analysis = pd.read_pickle("./speech_data.pkl")

In [5]:
#Score every speech. sentimental_analysis adds the 'positive', 'negative'
#and 'compound' columns to the frame it receives and returns that same frame.
data_for_analysis = sentimental_analysis(data_for_analysis)

In [6]:
#Preview the first rows to check the new score columns
data_for_analysis.head()

Unnamed: 0_level_0,speeches,president,positive,negative,compound
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-03-19,"[THE PRESIDENT: Thank you to our First Lady, M...",Donald Trump,0.4962,0.31076,0.101022
2018-02-23,[THE PRESIDENT: Thank you very much. Thank you...,Donald Trump,0.83788,0.44108,0.203247
2018-02-15,"[THE PRESIDENT: My fellow Americans, today I s...",Donald Trump,0.18887,0.137696,0.029152
2018-02-01,"[THE PRESIDENT: Thank you, Paul and Mitch, for...",Donald Trump,0.59952,0.23956,0.353166
2018-01-30,"[Mr. Speaker, Mr. Vice President, Members of C...",Donald Trump,0.29468,0.2184,0.174994


In [7]:
#Preview the last rows to check the new score columns
data_for_analysis.tail()

Unnamed: 0_level_0,speeches,president,positive,negative,compound
dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1903-10-20,[By the President of the United States of Amer...,Theodore Roosevelt,0.164,0.0,0.9892
1902-12-02,[To the Senate and House of Representatives:We...,Theodore Roosevelt,0.33336,0.2338,0.319026
1902-06-13,[To the Senate and House of Representatives:I ...,Theodore Roosevelt,0.19936,0.07692,0.340781
1901-12-03,[To the Senate and House of Representatives:Th...,Theodore Roosevelt,0.34924,0.3034,0.260799
1901-09-14,[By the President of the United States of Amer...,Theodore Roosevelt,0.21,0.099667,0.718633


In [8]:
#Save the scored frame to a new file so the input pickle is left untouched
data_for_analysis.to_pickle("./scores_data.pkl")