In [69]:
import nltk
# download senseval package
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

In [70]:
from nltk.corpus import sentiwordnet as swn

In [71]:
print(list(swn.senti_synsets('joy')))

[SentiSynset('joy.n.01'), SentiSynset('joy.n.02'), SentiSynset('rejoice.v.01'), SentiSynset('gladden.v.01')]


In [72]:
print(list(swn.senti_synsets('trouble')))

[SentiSynset('trouble.n.01'), SentiSynset('fuss.n.02'), SentiSynset('trouble.n.03'), SentiSynset('trouble.n.04'), SentiSynset('worry.n.02'), SentiSynset('trouble.n.06'), SentiSynset('disturb.v.01'), SentiSynset('trouble.v.02'), SentiSynset('perturb.v.01'), SentiSynset('trouble_oneself.v.01'), SentiSynset('trouble.v.05')]


Obtaining synset and poloarity

In [73]:
joy1 = swn.senti_synset('joy.n.01')
#print(str(joy1.pos_score),str(joy1.neg_score),str(joy1.obj_score))
joy2 = swn.senti_synset('joy.n.02')
trouble1 = swn.senti_synset('trouble.n.03')
trouble2 = swn.senti_synset('trouble.n.04')
categories = ["Joy1", "Joy2", "Trouble1", "Trouble2"]
rows = []
rows.append(["List","Positive score","Negative score"])
accs = {}
accs["Joy1"] = [joy1.pos_score(), joy1.neg_score()]
accs["Joy2"] = [joy2.pos_score(), joy2.neg_score()]
accs["Trouble1"] = [trouble1.pos_score(), trouble1.neg_score()]
accs["Trouble2"] = [trouble2.pos_score(), trouble2.neg_score()]
accs

{'Joy1': [0.5, 0.25],
 'Joy2': [0.375, 0.0],
 'Trouble1': [0.0, 0.625],
 'Trouble2': [0.0, 0.5]}

pretty printing synset polarity

In [74]:
for cat in categories:
  rows.append([cat,f"{accs.get(cat)[0]:0.3f}",f"{accs.get(cat)[1]:0.3f}"])
print(rows)

[['List', 'Positive score', 'Negative score'], ['Joy1', '0.500', '0.250'], ['Joy2', '0.375', '0.000'], ['Trouble1', '0.000', '0.625'], ['Trouble2', '0.000', '0.500']]


In [82]:
def pretty_print(rows):
  columns = zip(*rows)
  column_widths = [max(len(item) for item in col) for col in columns]
  for row in rows:
    print(' '.join(' {:{width}} '.format(row[i], width=column_widths[i]) for i in range(0, len(row))))

In [84]:
pretty_print(rows)

 List       Positive score   Negative score 
 Joy1       0.500            0.250          
 Joy2       0.375            0.000          
 Trouble1   0.000            0.625          
 Trouble2   0.000            0.500          


Code to access synsets for a specific part of speech

In [None]:
synsets = swn.senti_synsets('terrific','a')
synsets

<filter at 0x7f254f7b3710>

In [None]:
for synset in synsets:
  print("synset: "+str(synset) + " pos: +" + str(synset.pos_score()) + "  neg: -" + str(synset.neg_score()))

synset: <terrific.s.01: PosScore=0.25 NegScore=0.25> pos: +0.25  neg: -0.25
synset: <fantastic.s.02: PosScore=0.75 NegScore=0.0> pos: +0.75  neg: -0.0
synset: <terrific.s.03: PosScore=0.0 NegScore=0.625> pos: +0.0  neg: -0.625


Aggregate sentiment score based on sentiwordnet

In [None]:
# Listing 8.4 Code to aggregate sentiment scores based on SentiWordNet
from nltk.corpus import wordnet as wn
 
def convert_tags(pos_tag):
    if pos_tag.startswith("JJ"):
         return wn.ADJ
    elif pos_tag.startswith("NN"):
         return wn.NOUN
    elif pos_tag.startswith("RB"):
         return wn.ADV
    elif pos_tag.startswith("VB") or pos_tag.startswith("MD"):
         return wn.VERB
    return None
 
def swn_decisions(a_dict, label):  
    decisions = []
    for rev_id in a_dict.keys():
        score = 0
        neg_count = 0
        pos_count = 0
        for token in a_dict.get(rev_id):
            wn_tag = convert_tags(token.tag_)
            if wn_tag in (wn.ADJ, wn.ADV, wn.NOUN, wn.VERB):
                synsets = list(swn.senti_synsets(token.lemma_, pos=wn_tag))
                if len(synsets)>0:
                    temp_score = 0.0
                    for synset in synsets:
                        temp_score += synset.pos_score() - synset.neg_score()
                    score += temp_score/len(synsets)
        if score < 0:
            decisions.append((-1, label)) 
        else:
            decisions.append((1, label))
    return decisions

In [None]:
# Listing 8.5 Code to evaluate the results for this approach
def get_swn_accuracy(pos_docs, neg_docs):
    decisions_pos = swn_decisions(pos_docs, 1)   
    decisions_neg = swn_decisions(neg_docs, -1)
    decisions_all = decisions_pos + decisions_neg
    lists = [decisions_pos, decisions_neg, decisions_all]
    accuracies = []
    for i in range(0, len(lists)):
        match = 0
        for item in lists[i]:
            if item[0]==item[1]:
                match += 1
        accuracies.append(float(match)/float(len(lists[i])))
    return accuracies
 


get positive documents and negative documents BEFORE running this block

In [119]:
accuracies = get_swn_accuracy(pos_docs, neg_docs) 

rows = []
rows.append(["List", "Acc(positive)", "Acc(negative)", "Acc(all)"])
rows.append(["SentiWordNet", f"{accuracies[0]:.6f}",
                f"{accuracies[1]:.6f}",
                f"{accuracies[2]:.6f}"])
 
pretty_print(rows)

 List           Acc(positive)   Acc(negative)   Acc(all) 
 SentiWordNet   0.741294        0.681592        0.711443 


In [121]:
# Listing 8.6 Code to filter the content of the reviews and prepare it for feature extraction
import random
import string
from spacy.lang.en.stop_words import STOP_WORDS as stopwords_list
punctuation_list = [punct for punct in string.punctuation]
 
def text_filter(a_dict, label, exclude_lists):
    data = []
    for rev_id in a_dict.keys():
        tokens = []
        for token in a_dict.get(rev_id):
            if not token.text in exclude_lists:
                tokens.append(token.text)
        data.append((' '.join(tokens), label))
    return data
 
def prepare_data(pos_docs, neg_docs, exclude_lists):
    data = text_filter(pos_docs, 1, exclude_lists)
    data += text_filter(neg_docs, -1, exclude_lists)
    random.seed(42)
    random.shuffle(data)
    texts = []
    labels = []
    for item in data:
        texts.append(item[0])
        labels.append(item[1])
    return texts, labels
 

In [122]:
texts, labels = prepare_data(pos_docs, neg_docs, punctuation_list)
 
print(len(texts), len(labels))
print(texts[0])

402 402
near the end of 1996 analysts remarked that the year would be the last in which disney would stand alone as the animation king the following year 1997 saw the first sleeper entry into the genre from a studio other than the magic kingdom twentieth century fox gave baronial russia to the masses in the form of anastasia at best that 's all it was -- a sleeper hit designed to surprise but not shake the industry in 1998 however things have been decidedly different there have been a number of pushes on disney 's death grip on the market and fifty one weeks into 98 a new champion has emerged dreamworks the prince of egypt succeeds where all of the other conventionally animated products have failed the other two contenders warner brothers quest for camelot and disney 's own annual entry mulan lacked inspiration and drive respectively although camelot was borderline abysmal and ca n't really be considered a shot at the title mulan had the makings of winner unfortunately it was saturated

In [124]:
# Listing 8.7 Code to split the data into the training and test sets
def split(texts, labels, proportion):
    train_data = []
    train_targets = []
    test_data = []
    test_targets = []
    for i in range(0, len(texts)):
        if i < proportion*len(texts):
            train_data.append(texts[i])
            train_targets.append(labels[i])
        else:
            test_data.append(texts[i])
            test_targets.append(labels[i])
    return train_data, train_targets, test_data, test_targets
 

In [126]:
train_data, train_targets, test_data, test_targets = split(texts, labels, 0.8)
       
print(len(train_data))
print(len(train_targets))
print(len(test_data))
print(len(test_targets))
print(train_targets[10:20])
print(test_targets[10:20])

322
322
80
80
[1, -1, 1, 1, 1, 1, 1, -1, -1, -1]
[-1, 1, 1, 1, 1, 1, -1, 1, -1, -1]


In [130]:
# Listing 8.8 Code to apply CountVectorizer to learn the features on the training set
from sklearn.feature_extraction.text import CountVectorizer
 
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_data)
print(train_counts.shape)
print(train_counts[0])

(322, 17572)
  (0, 10407)	1
  (0, 15681)	60
  (0, 5163)	1
  (0, 10780)	24
  (0, 93)	1
  (0, 705)	1
  (0, 12783)	1
  (0, 15680)	8
  (0, 17466)	2
  (0, 17398)	2
  (0, 1416)	5
  (0, 8865)	1
  (0, 7854)	13
  (0, 17159)	1
  (0, 4487)	6
  (0, 14769)	1
  (0, 608)	1
  (0, 986)	3
  (0, 749)	3
  (0, 8687)	1
  (0, 6110)	1
  (0, 94)	1
  (0, 13498)	1
  (0, 5963)	2
  (0, 14275)	2
  :	:
  (0, 10010)	1
  (0, 13418)	1
  (0, 14653)	1
  (0, 8637)	1
  (0, 3499)	1
  (0, 5870)	1
  (0, 4757)	1
  (0, 11012)	1
  (0, 5478)	1
  (0, 696)	1
  (0, 10813)	1
  (0, 3707)	1
  (0, 16278)	1
  (0, 17194)	2
  (0, 17469)	1
  (0, 12756)	1
  (0, 2811)	1
  (0, 17239)	1
  (0, 7525)	1
  (0, 15760)	1
  (0, 9436)	1
  (0, 1607)	1
  (0, 2272)	1
  (0, 6821)	1
  (0, 17034)	1


In [131]:
count_vect.get_feature_names()[10407]

'near'

In [132]:
count_vect.get_feature_names()[15681]

'the'

In [133]:
print(count_vect.inverse_transform(train_data))

[array(['00', '000', '0009f', '10', '100', '1000', '101', '102', '105',
       '107', '109', '10b', '11', '111', '112', '117', '12', '126', '13',
       '137', '13th', '14', '15', '1521', '16', '160', '161', '1692',
       '17', '175', '1799', '17th', '18', '1812', '1888', '1896', '18th',
       '19', '1900', '1900s', '1925', '1928', '1930', '1930s', '1933',
       '1938', '1939', '1940', '1942', '1944', '1947', '1948', '1949',
       '1950s', '1959', '1960', '1960s', '1961', '1962', '1964', '1966',
       '1967', '1968', '1969', '1970', '1970s', '1971', '1972', '1973',
       '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1980s',
       '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988',
       '1989', '1990', '1990s', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1998s', '1999', '19th', '20', '200',
       '2000', '2001', '2002', '2020', '2029', '2056', '20th', '21',
       '2176', '22', '24', '25', '254', '27', '29', '2d', '30', '300',

In [136]:
# Listing 8.9 Code to apply CountVectorizer to test set and run classification
from sklearn.naive_bayes import MultinomialNB
 
clf = MultinomialNB().fit(train_counts, train_targets)
 
test_counts = count_vect.transform(test_data)
predicted = clf.predict(test_counts)
 

In [140]:
for text, label in list(zip(test_data, predicted))[10:20]:
    if label==1:
        print('%r => %s' % (text[:100], "pos"))
    else:
        print('%r => %s' % (text[:100], "neg"))

'this is your definitive hollywood movie extremely predictable following the basic formula a horrible' => neg
'warren beatty returns to the screens in the funniest craziest and hard hitting movie in his career b' => pos
'capsule a short punchy action sequel to the two dinosaur films made by steven spielberg joe johnston' => neg
'when bulworth ended i allowed myself a sigh of relief it is possible for me to enjoy political satir' => pos
'unzipped is a cinematic portrait of isaac mizrahi an artist whose palette is fabric ostensibly the f' => pos
'meet joe black reviewed on nov 27/98 starring brad pitt anthony hopkins claire forlani in meet joe b' => neg
'a documentary from the twin hughes brothers allen and albert dead presidents menace ii society about' => pos
'plot jet li is a chinese cop asked to help some french policemen nab a bad guy but one thing leads t' => neg
'forget get carter instead get me a cup of coffee what the hell has happened to all good american act' => neg
'starting 

In [141]:
# Listing 8.10 Code to define Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer
 
text_clf = Pipeline([('vect', CountVectorizer(min_df=10, max_df=0.5)),
                     ('binarizer', Binarizer()),
                     ('clf', MultinomialNB()),
                    ])
 
text_clf.fit(train_data, train_targets) 
print(text_clf)
predicted = text_clf.predict(test_data)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.5,
                                 max_features=None, min_df=10,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('binarizer', Binarizer(copy=True, threshold=0.0)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)


In [142]:
# Listing 8.11 Code to evaluate performance of your Pipeline
from sklearn import metrics
 
print("\nConfusion matrix:")
print(metrics.confusion_matrix(test_targets, predicted))
print(metrics.classification_report(test_targets, predicted))


Confusion matrix:
[[28  8]
 [13 31]]
              precision    recall  f1-score   support

          -1       0.68      0.78      0.73        36
           1       0.79      0.70      0.75        44

    accuracy                           0.74        80
   macro avg       0.74      0.74      0.74        80
weighted avg       0.74      0.74      0.74        80



In [143]:
# Listing 8.12 Code to run k-fold cross-validation
from sklearn.model_selection import cross_val_score, cross_val_predict
 
scores = cross_val_score(text_clf, texts, labels, cv=10)
print(scores)
print("Accuracy: " + str(sum(scores)/10))
predicted = cross_val_predict(text_clf, texts, labels, cv=10)
print("\nConfusion matrix:")
print(metrics.confusion_matrix(labels, predicted))
print(metrics.classification_report(labels, predicted))

[0.82926829 0.82926829 0.675      0.775      0.925      0.65
 0.75       0.775      0.625      0.9       ]
Accuracy: 0.7733536585365854

Confusion matrix:
[[158  43]
 [ 48 153]]
              precision    recall  f1-score   support

          -1       0.77      0.79      0.78       201
           1       0.78      0.76      0.77       201

    accuracy                           0.77       402
   macro avg       0.77      0.77      0.77       402
weighted avg       0.77      0.77      0.77       402



In [144]:
# Listing 8.13 Code to update the Pipeline with ngram features
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                     ('binarizer', Binarizer()),
                     ('clf', MultinomialNB())
                    ])
 
scores = cross_val_score(text_clf, texts, labels, cv=10)
print(scores)
print("Accuracy: " + str(sum(scores)/10))
predicted = cross_val_predict(text_clf, texts, labels, cv=10)
print("\nConfusion matrix:")
print(metrics.confusion_matrix(labels, predicted))
print(metrics.classification_report(labels, predicted))

[0.85365854 0.87804878 0.775      0.775      0.725      0.725
 0.775      0.8        0.7        0.825     ]
Accuracy: 0.7831707317073171

Confusion matrix:
[[160  41]
 [ 46 155]]
              precision    recall  f1-score   support

          -1       0.78      0.80      0.79       201
           1       0.79      0.77      0.78       201

    accuracy                           0.78       402
   macro avg       0.78      0.78      0.78       402
weighted avg       0.78      0.78      0.78       402



In [167]:
neg_docs.items()



In [165]:
# detecting and put NEG on words with negation front
for sent in neg_docs.get('cv000_29416').sents:
  neg_tokens = mark_negation(sent.text.split())
  for token in neg_tokens:
    if not token in punctuation_list:
       print(token)

plot
two
teen
couples
go
to
a
church
party
drink
and
then
drive
they
get
into
an
accident
one
of
the
guys
dies
but
his
girlfriend
continues
to
see
him
in
her
life
and
has
nightmares
what's
the
deal
watch
the
movie
and
sorta
find
out
critique
a
mind-fuck
movie
for
the
teen
generation
that
touches
on
a
very
cool
idea
but
presents
it
in
a
very
bad
package
which
is
what
makes
this
review
an
even
harder
one
to
write
since
i
generally
applaud
films
which
attempt
to
break
the
mold
mess
with
your
head
and
such
lost
highway
memento
but
there
are
good
and
bad
ways
of
making
all
types
of
films
and
these
folks
just
didn't
snag_NEG
this_NEG
one_NEG
correctly_NEG
they
seem
to
have
taken
this
pretty
neat
concept
but
executed
it
terribly
so
what
are
the
problems
with
the
movie
well
its
main
problem
is
that
it's
simply
too
jumbled
it
starts
off
normal
but
then
downshifts
into
this
fantasy
world
in
which
you
as
an
audience
member
have
no
idea_NEG
what's_NEG
going_NEG
on_NEG
there
are
dreams
there
are
ch

In [152]:
# Listing 8.14 Code to add negation handling to your text prepricessing
from nltk.sentiment.util import mark_negation
 
def text_filter_neg(a_dict, label, exclude_lists):
    data = []
    for rev_id in a_dict.keys():
        tokens = []
        for sent in a_dict.get(rev_id).sents:
            neg_tokens = mark_negation(sent.text.split())
            for token in neg_tokens:
                if not token in exclude_lists:
                    tokens.append(token)
        data.append((' '.join(tokens), label))
    return data
 
def prepare_data_neg(pos_docs, neg_docs, exclude_lists):
    data = text_filter_neg(pos_docs, 1, exclude_lists)
    data += text_filter_neg(neg_docs, -1, exclude_lists)
    random.seed(33)
    random.shuffle(data)
    texts = []
    labels = []
    for item in data:
        texts.append(item[0])
        labels.append(item[1])
    return texts, labels
 

In [153]:
texts_neg, labels_neg = prepare_data_neg(pos_docs, neg_docs, punctuation_list)
print(len(texts_neg), len(labels_neg))

402 402


In [155]:
print(texts_neg[0])

at first glance it appears that the home alone movies are brainless slapstick intended for those with minds of 8-year olds that's true of home alone 2 and i'd bet money that it's true of home alone 3 opening soon but home alone actually has a lot going for it and the cartoon slapstick doesn't get_NEG in_NEG the_NEG way_NEG the mcalisters all 15 of them are going to france for the holidays four adults and eleven kids are spending the night together before heading to the airport en masse the littlest kid kevin is the victim of the older kids' cruelty kevin's patience runs out when he learns that his plain-cheese pizza has already been eaten and he's going to have to starve he attacks his big bully brother buzz who had been teasing him the fight disrupts the already-chaotic dinner spilling milk and soft drinks all over the table a few cousins and an uncle the knee-jerk reaction is for everyone to blame kevin nobody came to his rescue when he was being teased but they all point their finge

In [168]:
# Listing 8.15 Code to update the Pipeline and run the classifier
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                     ('binarizer', Binarizer()),
                     ('clf', MultinomialNB())
                    ])
 
scores = cross_val_score(text_clf, texts, labels, cv=10)
print(scores)
print("Accuracy: " + str(sum(scores)/10))
predicted = cross_val_predict(text_clf, texts, labels, cv=10)
print("\nConfusion matrix:")
print(metrics.confusion_matrix(labels, predicted))
print(metrics.classification_report(labels, predicted))

[0.85365854 0.87804878 0.775      0.775      0.725      0.725
 0.775      0.8        0.7        0.825     ]
Accuracy: 0.7831707317073171

Confusion matrix:
[[160  41]
 [ 46 155]]
              precision    recall  f1-score   support

          -1       0.78      0.80      0.79       201
           1       0.79      0.77      0.78       201

    accuracy                           0.78       402
   macro avg       0.78      0.78      0.78       402
weighted avg       0.78      0.78      0.78       402



In [None]:
!gdown --id 1foAL8yvkyZ8nc3rjdIIYcpy8eyvs8Pus


In [54]:
# Listing 7.1 Code to read in the positive and negative movie reviews
import os, codecs
 
def read_in(folder):
    files = os.listdir(folder)
    a_dict = {}
    for a_file in sorted(files):
        if not a_file.startswith("."):
            with codecs.open(folder + a_file, encoding='ISO-8859-1', errors ='ignore') as f:
                file_id = a_file.split(".")[0].strip()
                a_dict[file_id] = f.read()
            f.close()
    return a_dict

In [None]:
# Listing 7.2 Code to initialize two Python dictionaries for the reviews of different polarity
# folder = "review_polarity/txt_sentoken/"
folder = "txt_sentoken/"
pos_dict = read_in(folder + "pos/")
print(len(pos_dict))
print(pos_dict.get(next(iter(pos_dict))))
neg_dict = read_in(folder + "neg/")
print(len(neg_dict))
print(neg_dict.get(next(iter(neg_dict))))

In [64]:
# Listing 7.3 Code to calculate statistics on the review dataset
def tokenize(text):
    text.replace("\n", " ")
    return text.split()
 
def statistics(a_dict):
    length = 0
    sent_length = 0
    num_sents = 0
    vocab = []
    for review in a_dict.values():
        length += len(tokenize(review))
        sents = review.split("\n")
        num_sents += len(sents)
        for sent in sents:
            sent_length += len(tokenize(sent))
        vocab += tokenize(review)
    avg_length = float(length)/len(a_dict)
    avg_sent_length = float(sent_length)/num_sents
    vocab_size = len(set(vocab))
    diversity = float(length)/float(vocab_size)
    return avg_length, avg_sent_length, vocab_size, diversity


In [85]:
categories = ["Positive", "Negative"]
rows = []
rows.append(["Category", "Avg_Len(Review)", "Avg_Len(Sent)", "Vocabulary Size", "Diversity"])
stats = {}
stats["Positive"] = statistics(pos_dict)
stats["Negative"] = statistics(neg_dict)
for cat in categories:
    rows.append([cat, f"{stats.get(cat)[0]:.6f}",
                f"{stats.get(cat)[1]:.6f}",
                f"{stats.get(cat)[2]:.6f}",
                f"{stats.get(cat)[3]:.6f}"])
 
pretty_print(rows)

 Category   Avg_Len(Review)   Avg_Len(Sent)   Vocabulary Size   Diversity 
 Positive   737.407960        23.076288       15454.000000      9.590980  
 Negative   697.885572        21.324871       14842.000000      9.451220  


In [86]:
# Listing 7.4 Code to measure the difference between two lists of words
def vocab_difference(list1, list2):
    vocab1 = []
    vocab2 = []
    for rev in list1:
        vocab1 += tokenize(rev)
    for rev in list2:
        vocab2 += tokenize(rev)
    return sorted(list(set(vocab1) - set(vocab2)))
 

In [87]:
pos_wordlist = pos_dict.values()
neg_wordlist = neg_dict.values()
 
print(vocab_difference(pos_wordlist, neg_wordlist)[1500:1600])
print(vocab_difference(neg_wordlist, pos_wordlist)[1500:1600])
print()
print(str(len(vocab_difference(pos_wordlist, neg_wordlist))) + " unique words in positive reviews only")
print(str(len(vocab_difference(neg_wordlist, pos_wordlist))) + " unique words in negative reviews only")

['compositions', 'comprehension', 'comprehensive', 'compressed', 'compunction', 'computer-animated', 'computer-simulated', 'comradeship', 'conan', 'concealed', 'conceit', 'conceivable', 'conceivably', 'conceive', 'concentrated', 'concentrates', 'concentration', 'conciseness', 'conclusion--the', 'concocting', 'concocts', 'concordia', 'condense', 'condescending', 'condo', 'condom', 'condominium', 'condor', 'condors', 'conducted', 'conelly', 'confers', 'confident', 'confidentiality', 'confidently', 'confides', 'confirmed', 'confiscate', 'conformity', 'confrontational', 'confrontatory', 'cong', 'congo', "congo's", 'congolese', 'congressman', 'congresswoman', 'conjugated', 'conjure', 'conjuring', 'connecticut', 'connelly', "connery's", 'connick', 'connives', 'conniving', 'connotations', 'conrad', 'conread', 'cons', 'conscience-deprived', 'consensus', 'consent', 'consequence', 'consequently', 'conservative--he', 'considerable', 'consisted', 'conspiracies', 'constellations', 'construct', "con

In [92]:
# import spacy
!python -m spacy download en_core_web_md
import spacy
#from spacy import displacy
import en_core_web_md

#nlp = spacy.load('en_core_web_md')
#nlp = spacy.load("en_core_web_md")
nlp = spacy.load("en")


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [96]:
# Listing 7.5 Code run spaCy’s linguistic pipeline and store the results
def spacy_preprocess_reviews(source):
    source_docs = {}
    index = 0
    for review_id in source.keys():
        source_docs[review_id] = nlp(source.get(review_id).replace("\n", ""))
        if index>0 and (index%25)==0:
            print(str(index) + " reviews processed")
        index += 1
    print("Dataset processed")
    return source_docs

In [97]:
pos_docs = spacy_preprocess_reviews(pos_dict)


25 reviews processed
50 reviews processed
75 reviews processed
100 reviews processed
125 reviews processed
150 reviews processed
175 reviews processed
200 reviews processed
Dataset processed


In [98]:
neg_docs = spacy_preprocess_reviews(neg_dict)

25 reviews processed
50 reviews processed
75 reviews processed
100 reviews processed
125 reviews processed
150 reviews processed
175 reviews processed
200 reviews processed
Dataset processed


In [100]:
# Listing 7.6 Code to calculate statistics on word lemmas
def statistics_lem(source_docs):
    length = 0
    vocab = []
    for review_id in source_docs.keys():
        review_doc = source_docs.get(review_id)
        lemmas = []
        for token in review_doc:
            lemmas.append(token.lemma_)
        length += len(lemmas)
        vocab += lemmas
    avg_length = float(length)/len(source_docs)
    vocab_size = len(set(vocab))
    diversity = float(length)/float(vocab_size)
    return avg_length, vocab_size, diversity
       
 
#columns = zip(*rows)
#column_widths = [max(len(item) for item in col) for col in columns]
#for row in rows:
#   print(''.join(' {:{width}} '.format(row[i], width=column_widths[i])
#                  for i in range(0, len(row))))

In [101]:
categories = ["Positive", "Negative"]
rows = []
rows.append(["Category", "Avg_Len(Review)", "Vocabulary Size", "Diversity"])
stats = {}
stats["Positive"] = statistics_lem(pos_docs)
stats["Negative"] = statistics_lem(neg_docs)
for cat in categories:
    rows.append([cat, f"{stats.get(cat)[0]:.6f}",
                f"{stats.get(cat)[1]:.6f}",
                f"{stats.get(cat)[2]:.6f}"])

pretty_print(rows)

 Category   Avg_Len(Review)   Vocabulary Size   Diversity 
 Positive   768.039801        11507.000000      13.415834 
 Negative   729.850746        11025.000000      13.306122 


In [102]:
# Listing 7.7 Code to detect the non-overlapping lemmas between two types of reviews
def vocab_lem_difference(source_docs1, source_docs2):
    vocab1 = []
    vocab2 = []
    for rev_id in source_docs1.keys():
        rev = source_docs1.get(rev_id)
        for token in rev:
            vocab1.append(token.lemma_)
    for rev_id in source_docs2.keys():
        rev = source_docs2.get(rev_id)
        for token in rev:
            vocab2.append(token.lemma_)
    return sorted(list(set(vocab1) - set(vocab2)))
 

In [103]:
print(str(len(vocab_lem_difference(pos_docs, neg_docs))) + " unique lemmas in positive reviews only")
print(str(len(vocab_lem_difference(neg_docs, pos_docs))) + " unique lemmas in negative reviews only")

5040 unique lemmas in positive reviews only
4558 unique lemmas in negative reviews only


In [107]:
# Listing 7.8 Code to populate sentiment word dictionaries with sentiment values
def collect_wordlist(input_file):
    word_dict = {}
    with codecs.open(input_file, encoding='ISO-8859-1', errors ='ignore') as f:
        for a_line in f.readlines():
            cols = a_line.split("\t")
            if len(cols)>2:
                word = cols[0].strip()
                score = float(cols[1].strip())
                word_dict[word] = score
    f.close()
    return word_dict

In [114]:
adj_90 = collect_wordlist("sentiment_words/adjectives/1990.tsv")
print("adj_1990_cool " + str(adj_90.get("cool")))
print("adj_1990 len = " + str(len(adj_90)))
adj_00 = collect_wordlist("sentiment_words/adjectives/2000.tsv")
print("adj_2000_cool " + str(adj_00.get("cool")))
print("adj_2000 len = " + str(len(adj_00)))
all_90 = collect_wordlist("sentiment_words/frequent_words/1990.tsv")
print("all 1990 len = "+ str(len(all_90)))
all_00 = collect_wordlist("sentiment_words/frequent_words/2000.tsv")
print("all 2000 len = " + str(len(all_00)))
movie_words = collect_wordlist("sentiment_words/subreddits/movies.tsv")
print("movies len = " + str(len(movie_words)))

adj_1990_cool 1.28
adj_1990 len = 1968
adj_2000_cool 1.19
adj_2000 len = 2041
all 1990 len = 4924
all 2000 len = 4924
movies len = 4981


In [115]:
# Listing 7.9 Code to apply and evaluate the sentiment lexicon-based approach
def bin_decisions(a_dict, label, sent_dict):
    decisions = []
    for rev_id in a_dict:
        score = 0
        for token in a_dict.get(rev_id):
            if token.text in sent_dict.keys():
                if sent_dict.get(token.text)<0:
                    score -= 1
                else:
                    score += 1
        if score < 0:
            decisions.append((-1, label))
        else:
            decisions.append((1, label))
    return decisions
 
def weighted_decisions(a_dict, label, sent_dict):
    decisions = []
    for rev_id in a_dict:
        score = 0
        for token in a_dict.get(rev_id):
            if token.text in sent_dict.keys():
                score += sent_dict.get(token.text)
        if score < 0:
            decisions.append((-1, label))
        else:
            decisions.append((1, label))
    return decisions
 
def get_accuracy(pos_docs, neg_docs, sent_dict):
    decisions_pos = weighted_decisions(pos_docs, 1, sent_dict)
    decisions_neg = weighted_decisions(neg_docs, -1, sent_dict)
    decisions_all = decisions_pos + decisions_neg
    lists = [decisions_pos, decisions_neg, decisions_all]
    accuracies = []
    for i in range(0, len(lists)):
        match = 0
        for item in lists[i]:
            if item[0]==item[1]:
                match += 1
        accuracies.append(float(match)/float(len(lists[i])))
    return accuracies
 

In [118]:
categories = ["Adj_90", "Adj_00", "All_90", "All_00", "Movies"]
rows = []
rows.append(["List", "Acc(positive)", "Acc(negative)", "Acc(all)"])
accs = {}
accs["Adj_90"] = get_accuracy(pos_docs, neg_docs, adj_90)
accs["Adj_00"] = get_accuracy(pos_docs, neg_docs, adj_00)
accs["All_90"] = get_accuracy(pos_docs, neg_docs, all_90)
accs["All_00"] = get_accuracy(pos_docs, neg_docs, all_00)
accs["Movies"] = get_accuracy(pos_docs, neg_docs, movie_words)
for cat in categories:
    rows.append([cat, f"{accs.get(cat)[0]:.6f}",
                f"{accs.get(cat)[1]:.6f}",
                f"{accs.get(cat)[2]:.6f}"])
 
pretty_print(rows)

 List     Acc(positive)   Acc(negative)   Acc(all) 
 Adj_90   0.800995        0.512438        0.656716 
 Adj_00   0.810945        0.432836        0.621891 
 All_90   0.995025        0.029851        0.512438 
 All_00   0.800995        0.398010        0.599502 
 Movies   0.009950        1.000000        0.504975 
