-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment.py
213 lines (191 loc) · 9.6 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
####################################################################
# Authors: Shubham * 2, Sohel, Vivek #
# Guide: Dr. Anuradha J. #
# Vellore Institute Of Technology #
# SCOPE #
# Polarity Check #
####################################################################
import os
import sys
import nltk
import pickle
from bootstrapping import Bootstrapping
from pos import SequentialTagger
from hp_classifiers import HpObj, HpSubj
from polarity import PolarityClassifier
from replacer import RepeatReplacer
from terminal_colors import Tcolors
DEBUG = False
class Sentiment:
"""
Sentiment: Analyses the global sentiment of given text regions
that are decomposed to sentences, using bootstrapping methods for
subjectivity and polarity classification. All sub modules except
from POS tagging are learning by experience.
"""
def __init__(self):
self.pos_tagger = SequentialTagger()
self.hp_obj = HpObj(debug=DEBUG)
self.hp_subj = HpSubj(debug=DEBUG)
self.lexicon = self.hp_obj.lexicon
self.bootstrapping = Bootstrapping(self.hp_obj, self.hp_subj, self.pos_tagger, debug=DEBUG)
self.sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
self.total_sentences = ["good","bad"]
self.total_sentiments = ["positive","negative"]
def analyze(self, clean_text_areas):
"""
Analysis of text regions using the following order: Each sentence per
region is passed from the subjectivity classification using bootstrapping
method and then if it turns out to be subjective it is passed
from the polarity classification using bootstrapping method also.
Finally, it results to a decision for the sentiment of the sentence
and the overall sentiment of the regions.
"""
if len(clean_text_areas) > 0:
for clean_text in clean_text_areas:
# Sentence detection
clean_text = self.normalize(clean_text)
try:
sentences = self.sentence_tokenizer.tokenize(clean_text)
except:
return {}
sentiments = []
scores = []
nscores = []
results = {'positive':{'count' : 0, 'score' : 0, 'nscore' : 0},
'neutral':{'count' : 0, 'score' : 0, 'nscore' : 0},
'negative':{'count' : 0, 'score' : 0, 'nscore' : 0}}
print
print Tcolors.ACT + " Checking block of text:"
for i, sentence in enumerate(sentences):
print "[" + str(i+1) + "] " + sentence
for i, sentence in enumerate(sentences):
# Proceed to subjectivity classification (bootstrapping procedure).
# (This step could be skipped in case you deal with subjective sentences only.)
sentiment = ""
previous = ""
next = ""
score = 0
nscore = 0
if i == 0 and i + 1 < len(sentences):
next = sentences[i+1]
elif i != 0 and i < len(sentences):
if i + 1 != len(sentences):
next = sentences[i+1]
previous = sentences[i-1]
if DEBUG: print Tcolors.ACT + " Analyzing subjectivity..."
result = self.bootstrapping.classify(sentence, previous, next)
if result is None:
res = 'Not found!'
else:
res = result
if DEBUG:
print Tcolors.RES + Tcolors.OKGREEN + " " + res + Tcolors.ENDC
print
# If sentence is subjective
if result == 'subjective' or result is None:
# Proceed to polarity classification
if DEBUG: print Tcolors.ACT + " Analyzing sentiment..."
polarity_classifier = PolarityClassifier(self.pos_tagger, self.lexicon, debug=DEBUG)
sentiment, score, nscore = polarity_classifier.classify(sentence)
if DEBUG: print Tcolors.RES + Tcolors.OKGREEN + " " + sentiment + Tcolors.ENDC
# If sentence is objective
elif result == 'objective':
sentiment = 'neutral'
# Collect high-confidence training instances for SVM classifier.
# After the training, SVM can be used to classify new sentences.
#if sentiment != "neutral" and sentiment != "":
#if sentiment != "neutral" and abs(nscore) >= 0.4:
# self.total_sentences.append(sentence)
# self.total_sentiments.append(sentiment)
# Store results to memory
sentiments.append(sentiment)
scores.append(score)
nscores.append(nscore)
# Update score
if results.has_key(sentiment):
results[sentiment]['nscore'] += nscore
results[sentiment]['score'] += score
results[sentiment]['count'] += 1
print
print Tcolors.ACT + " Overall sentiment analysis:"
print Tcolors.BGH
print " Parts: ", len(sentences)
print " Sentiments: ", sentiments
print " Scores: ", scores
print " Results: ", "},\n\t ".join((str)(results).split("}, "))
print Tcolors.C
pcount = results['positive']['count']
ncount = results['negative']['count']
total = len(sentences)
print Tcolors.BG
print " subjective".ljust(16,"-") + "> %.2f" % ((float)(pcount + ncount)*100 / total) + "%"
print " objective".ljust(16,"-") + "> %.2f" % (100 - ((float)(pcount + ncount)*100 / total)) + "%"
print Tcolors.C
print Tcolors.BGGRAY
for sense in results.keys():
count = results[sense]['count']
percentage = (float)(count) * 100 / (len(sentences))
print " " +sense.ljust(15,"-")+"> %.2f" % (percentage) + "%"
print Tcolors.C
ssum = sum(scores)
confidence = " (%.2f, %.2f)" % (ssum,sum(nscores))
final_sent = ""
pos = True
if results["negative"]["count"] > len(sentences)*1.0/3:
pos = False
# Print total sentiment score and normalized sentiment score
if ssum > 0 and pos:
print Tcolors.RES + Tcolors.OKGREEN + " positive" + confidence + Tcolors.C
final_sent = "positive"
elif ssum == 0:
print Tcolors.RES + Tcolors.OKGREEN + " neutral" + confidence + Tcolors.C
final_sent = "neutral"
else:
print Tcolors.RES + Tcolors.OKGREEN + " negative" + confidence + Tcolors.C
final_sent = "negative"
print Tcolors.C
# Store results
total_result_hash = {'sentences' : sentences,
'sentiments': sentiments,
'scores' : scores,
'nscores' : nscores,
'results' : results,
'final' : {final_sent:{'score':ssum,'nscore':sum(nscores)}}}
# Train SVM classifier
# self.train_svm()
return total_result_hash
def normalize(self, text):
"""
Make some word improvements before feeding to the sentence tokenizer.
"""
rr = RepeatReplacer(self.lexicon)
normalized_text = []
final = None
try:
for word in text.split():
normal = rr.replace(word.lower())
if word[0].isupper():
normal = normal[0].upper() + normal[1:]
normalized_text.append(normal)
final = " ".join(normalized_text)
except:
final = text
return final
def train_svm(self):
"""
Train SVM and store data with pickle.
"""
self.svm.train(self.total_sentences, self.total_sentiments)
t_output = open(self.svm_train_filename,'wb')
l_output = open(self.svm_label_filename,'wb')
pickle.dump(self.total_sentences,t_output)
pickle.dump(self.total_sentiments,l_output)
t_output.close()
l_output.close()
if __name__ == '__main__':
    # Analyse the text passed on the command line, or a demo string when
    # no argument was given.
    engine = Sentiment()
    text = sys.argv[1] if len(sys.argv) > 1 else u"Behave Dude!!!"
    engine.analyze([text])