In [1]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from textblob import TextBlob
from textblob.taggers import PatternTagger
from textblob.sentiments import PatternAnalyzer
import json
import os

In [23]:
#classify the sentiment value into positive, neutral, and negative
analyzer = PatternAnalyzer() #POLARITY AND SUBJECTIVITY ANALYZER
tagger = PatternTagger() #PART OF SPEECH TAGGER


def pattern_analyzer_classify(sentence):
    score = analyzer.analyze(sentence)
    if score[0] > 0.1:
        return "pos"
    elif score[0] < 0.1:
        return "neg"
    else:
        return "neu"
    
#compare results of data using pattern analyzer compared to correct annotations
def pattern_analyzer_compare(data, annotations): #compares how many we correctly classify
    num_correct = 0
    total = 0
    sentence_list = []
    for i in data['body-paragraphs']:
        for j in i:
            sentence_list.append(j)
    for phrase in annotations["phrase-level-annotations"]:
        if(phrase["id"] == "title"):#line below is where we input our model
            if (pattern_analyzer_classify(data["title"]) == phrase["polarity"]):
                num_correct += 1
            total += 1
        else:
            sentence_id = int(phrase["id"][1:])#input model below here too
            if (pattern_analyzer_classify(sentence_list[sentence_id]) == phrase["polarity"]):
                num_correct += 1
            total += 1
    return total, num_correct

###TAGGER FUNCTIONS
#POS tagging list of tags https://www.learntek.org/blog/categorizing-pos-tagging-nltk-python/

def pattern_tag(sentence):
    blob = TextBlob(sentence, pos_tagger=tagger)
    return blob.pos_tags

def find_proper_names(pos_tags):
    tuple_list = []
    for pos_tuple in pos_tags:
        #find the NNP (Proper nouns) tagged words and return those tuples as a new list
        if pos_tuple[1] == 'NNP':
            tuple_list.append(pos_tuple)
    return tuple_list

def find_full_proper_names_list(pos_tags): #only accounts for first+last names
    name_list = []
    prev_tuple = ("___",'___')
    for pos_tuple in pos_tags:
        #find the NNP tagged words and return those tuples as a new list
        if pos_tuple[1] == 'NNP':
            if prev_tuple[1] == 'NNP':
                name_list.remove(prev_tuple[0])
                name_list.append(prev_tuple[0] + " " + pos_tuple[0])
            else:
                name_list.append(pos_tuple[0])
        prev_tuple = pos_tuple
    return name_list

def name_analysis(sentence):
    p_tags = pattern_tag(sentence) #list of tuple of words and their part of speech
    p_names_list = find_proper_names(p_tags) #list of proper nouns
    
    score_assessments = analyzer.analyze(sentence, keep_assessments = True) #polarity, subjectivity, assessments list
    assessments = score_assessments[2] #get only the list of assessed words
    
    return_list = [] # will be a list of names and their assessed polarity scores
    
    assessed_words = [] #list of just the words that are assessed
    for word in assessments:
        #print(word)
        assessed_words.append(word[0][0])
    
#     assessed_words_and_assessments = []
#     for word in assessments:
#         #print(word)
#         assessed_words.append((word[0][0],word[1]))
    
    for word_tuple in p_tags:
        if word_tuple in p_names_list: #if we find a proper name we calculate the sentiment around it
            #print(word_tuple) 
            
            name_total_assessment_score = 0 #the total assessment for a single name
            
            name_index = p_tags.index(word_tuple)
            
            start_index = 0
            end_index = 0
            
            if name_index - 3 > 0: #we create a word cluster 3 before and 3 after the name
                start_index = name_index - 3
                
            if name_index + 4 > len(p_tags):
                end_index = len(p_tags)
            else:
                end_index = name_index + 4
            #print(assessments)
            #print(p_tags[start_index:end_index][0])
            
            counter = 0 # need to replace counter with any tuple in range, right now it only checks one at a time
            for assessment in assessments: #iterate through assessed words tuple
                #print(assessment)
                #print(p_tags[start_index:end_index][:])
                for i in range(start_index,end_index): #for every assessment, iterate through words within range
                    #we check if this assessment's word matches one in the list
                    if assessment[0][0] in p_tags[i][0].lower(): #CHECK THIs should check if assessed word is in range
                        name_total_assessment_score += assessment[1]
                    counter += 1
                
            return_list.append((word_tuple[0],name_total_assessment_score))
    print(return_list)
    return return_list
#             for adj_word in p_tags[start_index:end_index]: #here we iterate over all of the adjacent words and find if they have been assessed
#                 #print(adj_word)
#                 if adj_word[0].lower() in assessed_words:
#                     list = 
            #I need to get assessments values within the range
            #then add them and add to a list in form (proper_noun,assessment_sum)
    
    
#TEXTBLOB MANIPULATION FUNCTIONS

def textblob_to_sentences_list(tb):
    sent_list = []
    for sentence in tb.sentences:
        sent_list.append(str(sentence))
    return sent_list

In [24]:
###EXPLORATION START
pattern_analyzer1 = PatternAnalyzer()

test_sent1 = "I hate the things that Trump has done over the years."
test_sent2 = "In an Epic Battle of Tanks, Russia Was Routed, Repeating Earlier Mistakes"
test_sent3 = "Ukraine war live updates: Russian mercenary boss says ‘fierce resistance’ seen in Bakhmut; Kyiv says its fighters are under ‘insane pressure’"
test_sent4 = "What’s Worse for Donald Trump Than Getting Indicted?"
test_sent5 = "A fierce debate between Obama and Donald Trump ensued on tuesday."
#pattern analyzer returns a tuple of polarity, subjectivity
#and if you set it to true also a list of assessments made on 
#individual words

#basic sentence analyzer tests
# print("Pattern analyzer test1:")
# print(pattern_analyzer1.analyze(test_sent1)) #polarity and subjectivity

# test_analyzing2 = pattern_analyzer1.analyze(test_sent2,keep_assessments = True)
# print(test_analyzing2) #analyze with assessments of major words

# print("Pattern test3: " + pattern_analyzer_classify(test_sent3))
# print(pattern_analyzer1.analyze(test_sent3))

# textblob1 = TextBlob("Russia returns the broken fighter jet remnants to the US in exchange for political prisoners.")
# tb1_sent_list = textblob_to_sentences_list(textblob1)
# for sentence in tb1_sent_list:
#     name_analysis(sentence)

#print(pattern_analyzer1.analyze(test_sent4, keep_assessments = True))    
name_analysis(test_sent4)    
# print(pattern_analyzer1.analyze(test_sent1)[0])



# #TAGGING EXPLORATION
# tags1 = pattern_tag(test_sent1)
# print(tags1)
# print("\n")
# print(find_proper_names(tags1))
# print(find_full_proper_names_list(tags1))
# print("\n")

# tags4 = pattern_tag(test_sent4)
# print(tags4)
# print("\n")
# print(find_proper_names(tags4))
# print(find_full_proper_names_list(tags4))
# print("\n")

# tags5 = pattern_tag(test_sent5)
# print(tags5)
# print(find_full_proper_names_list(tags5))

[('Donald', -0.4), ('Trump', -0.4)]


[('Donald', -0.4), ('Trump', -0.4)]

In [63]:
file_list = []

#open BASIL dataset and gather the articles w/o scores
for i in range(10):
    file_i = os.listdir("BASILdata/articles/201" + str(i))
    for file_name in file_i:
        file = open("BASILdata/articles/201" + str(i) + "/" + file_name, encoding="utf8")
        json_file = json.load(file)
        file_list.append(json_file)
        
annotation_file_list = []

#open BASIL dataset and gather annotations for articles
for i in range(10):
    file_i = os.listdir("BASILdata/annotations/201" + str(i))
    for file_name in file_i:
        file = open("BASILdata/annotations/201" + str(i) + "/" + file_name, encoding="utf8")
        json_file = json.load(file)
        annotation_file_list.append(json_file)

total_correct = 0
total = 0

#compare correct annotations to the results from our model
for i in range(len(file_list)):
    data = file_list[i]
    annotations = annotation_file_list[i]
    a, b = pattern_analyzer_compare(data, annotations)
    total += a
    total_correct += b
    
accuracy = total_correct/total
print("Pattern sentiment on BASIL correct: %.3f , BASIL total: %.3f" %(total_correct, total))
print("Pattern sentiment on BASIL ACCURACY: %.4f" % accuracy)

Pattern sentiment on BASIL correct: 1130.000 , BASIL total: 1726.000
Pattern sentiment on BASIL ACCURACY: 0.6547
