In [1]:
pip install textblob





[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from textblob.sentiments import PatternAnalyzer
import json
import os

In [9]:
# Toy training corpus of (sentence, label) pairs: five positive examples
# followed by five negative ones.
train = [
    (sentence, 'pos')
    for sentence in (
        'I love this sandwich.',
        'this is an amazing place!',
        'I feel very good about these beers.',
        'this is my best work.',
        'what an awesome view',
    )
] + [
    (sentence, 'neg')
    for sentence in (
        'I do not like this restaurant',
        'I am tired of this stuff.',
        "I can't deal with this",
        'he is my sworn enemy!',
        'my boss is horrible.',
    )
]

# Held-out (sentence, label) pairs, built by pairing each sentence with its label.
test = list(zip(
    (
        'the beer was good.',
        'I do not enjoy my job',
        "I ain't feeling dandy today.",
        'I feel amazing!',
        'Gary is a friend of mine.',
        "I can't believe I'm doing this.",
    ),
    ('pos', 'neg', 'neg', 'pos', 'pos', 'neg'),
))

In [10]:
def sentiment_analysis(sentence):
    """Return TextBlob's polarity and subjectivity scores for a sentence.

    Args:
        sentence: The text to score.

    Returns:
        A two-element list ``[polarity, subjectivity]``. Polarity is the
        sentiment score (negative to positive); subjectivity is the
        fact-vs-opinion score (0 is fact, 1 is opinion).
    """
    # Build the blob and evaluate .sentiment a single time, then read both
    # fields from that one result instead of analysing the text twice.
    sentiment = TextBlob(sentence).sentiment
    return [sentiment.polarity, sentiment.subjectivity]

def sentiment_classify(sentence):
    """Map a sentence's polarity score onto a coarse sentiment label.

    Returns "pos" for a polarity above zero, "neg" for a polarity below
    zero, and "neu" otherwise.
    """
    polarity = sentiment_analysis(sentence)[0]
    if polarity > 0:
        return "pos"
    if polarity < 0:
        return "neg"
    return "neu"

def compare(data, annotations, classifier=None):
    """Count how many phrase-level annotations a classifier labels correctly.

    Args:
        data: Article dict. ``data["title"]`` is the title text and
            ``data["body-paragraphs"]`` is an iterable of paragraphs, each an
            iterable of sentences.
        annotations: Annotation dict. Each entry of
            ``annotations["phrase-level-annotations"]`` has an ``"id"`` and a
            ``"polarity"``. An id of ``"title"`` refers to the article title;
            for any other id the first character is dropped and the remainder
            is parsed as an integer index into the flattened sentence list.
        classifier: Optional callable taking a text and returning a label to
            compare against ``"polarity"``. Defaults to ``sentiment_classify``,
            so existing two-argument calls behave as before.

    Returns:
        A tuple ``(total, num_correct)``: the number of annotations checked
        and the number whose predicted label equals the annotated polarity.
    """
    if classifier is None:
        # Resolved at call time so the default tracks the notebook's current model.
        classifier = sentiment_classify

    # Flatten the paragraphs into one list so annotation ids can index sentences.
    sentence_list = [
        sentence
        for paragraph in data['body-paragraphs']
        for sentence in paragraph
    ]

    num_correct = 0
    total = 0
    for phrase in annotations["phrase-level-annotations"]:
        if phrase["id"] == "title":
            text = data["title"]
        else:
            text = sentence_list[int(phrase["id"][1:])]
        if classifier(text) == phrase["polarity"]:
            num_correct += 1
        total += 1
    return total, num_correct


In [11]:
def tokenize_words(text):
    """Return the ``words`` of a TextBlob built from ``text``."""
    blob = TextBlob(text)
    return blob.words

def tokenize_sentences(text):
    """Return the ``sentences`` of a TextBlob built from ``text``.

    Per the original author's note, the returned sentences are equivalent to
    TextBlobs in their properties/methods.
    """
    blob = TextBlob(text)
    return blob.sentences

def parse_text(text):
    """Return the result of calling ``parse()`` on a TextBlob built from ``text``."""
    blob = TextBlob(text)
    return blob.parse()

def end_word_extractor(document):
    """Build a feature dict from the first and last whitespace-separated tokens.

    Args:
        document: The text to extract features from.

    Returns:
        A dict with a ``"first(<token>)"`` key set to True and a
        ``"last(<token>)"`` key set to False. An empty or whitespace-only
        document has no tokens and yields an empty dict instead of raising
        IndexError.
    """
    tokens = document.split()
    if not tokens:
        # Nothing to index: return no features rather than crashing.
        return {}
    return {
        "first({0})".format(tokens[0]): True,
        # NOTE(review): the last-word feature is stored as False, unchanged
        # from the original code; confirm this value is intended.
        "last({0})".format(tokens[-1]): False,
    }

In [12]:
###START OF TESTING CODE
# Train two Naive Bayes classifiers on the toy corpus: one without a custom
# feature extractor and one that uses end_word_extractor.
classifier1 = NaiveBayesClassifier(train)
classifier2 = NaiveBayesClassifier(train, feature_extractor=end_word_extractor)

# Headline-style sentences used to probe the classifiers.
test_sent1 = "I hate the things that Trump has done over the years."
test_sent2 = "In an Epic Battle of Tanks, Russia Was Routed, Repeating Earlier Mistakes"
test_sent3 = "Ukraine war live updates: Russian mercenary boss says ‘fierce resistance’ seen in Bakhmut; Kyiv says its fighters are under ‘insane pressure’"

# For each sentence, print the label predicted by each classifier in turn.
for sent_no, sent in enumerate((test_sent1, test_sent2, test_sent3), start=1):
    for clf_name, clf in (("classifier1", classifier1), ("classifier2", classifier2)):
        print(clf_name + " sent" + str(sent_no) + ": " + clf.classify(sent))


classifier1 sent1: neg
classifier2 sent1: neg
classifier1 sent2: pos
classifier2 sent2: pos
classifier1 sent3: neg
classifier2 sent3: pos


In [13]:
# Show the (word, tag) pairs TextBlob produces for a sample sentence.
wiki = TextBlob("Python is a high-level, general-purpose programming language.")
print(wiki.tags)

# Print the [polarity, subjectivity] pair for each of the three test sentences.
for sentence in (test_sent1, test_sent2, test_sent3):
    print(sentiment_analysis(sentence))


[('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('high-level', 'JJ'), ('general-purpose', 'JJ'), ('programming', 'NN'), ('language', 'NN')]
[-0.8, 0.9]
[0.05, 0.45]
[-0.2878787878787879, 0.5]


In [14]:
def _load_basil_json(kind):
    """Load every JSON file under BASILdata/<kind>/2010 .. BASILdata/<kind>/2019.

    Args:
        kind: Sub-directory of ``BASILdata`` to read ("articles" or "annotations").

    Returns:
        A list of the parsed JSON documents, ordered by year directory and
        then by file name.
    """
    loaded = []
    for i in range(10):
        year_dir = "BASILdata/" + kind + "/201" + str(i)
        # os.listdir returns names in arbitrary order; sorting keeps the load
        # order deterministic so articles and annotations can be paired by index.
        # NOTE(review): assumes article and annotation file names sort into the
        # same relative order - confirm against the dataset's naming scheme.
        for file_name in sorted(os.listdir(year_dir)):
            # The context manager closes each file handle after parsing.
            with open(year_dir + "/" + file_name, encoding="utf8") as file:
                loaded.append(json.load(file))
    return loaded


#open BASIL dataset and gather the articles w/o scores
file_list = _load_basil_json("articles")

#open BASIL dataset and gather annotations for articles
annotation_file_list = _load_basil_json("annotations")

# Articles and annotations are matched by position, so the counts must agree.
if len(file_list) != len(annotation_file_list):
    raise ValueError(
        "BASIL article/annotation count mismatch: %d articles vs %d annotations"
        % (len(file_list), len(annotation_file_list))
    )

total_correct = 0
total = 0
#compare correct annotations to the results from our model
for data, annotations in zip(file_list, annotation_file_list):
    a, b = compare(data, annotations)
    total += a
    total_correct += b

# Guard against an empty dataset so the cell does not fail with ZeroDivisionError.
accuracy = total_correct / total if total else 0.0
print("TextBlob sentiment on BASIL correct: %.3f , BASIL total: %.3f" %(total_correct, total))
print("TextBlob sentiment on BASIL ACCURACY: %.4f" % accuracy)

TextBlob sentiment on BASIL correct: 592.000 , BASIL total: 1726.000
TextBlob sentiment on BASIL ACCURACY: 0.3430
