# Lexicon Polarity
## Norma Grubb and Justin Gould
## April 2021

# Required Packages

In [157]:
import nltk
from nltk.corpus import sentence_polarity
import random
import Subjectivity

from spacy.lang.en import English
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer

import dill as pickle

### Download Corpus

In [3]:
nltk.download('sentence_polarity')

[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /Users/gould29/nltk_data...
[nltk_data]   Unzipping corpora/sentence_polarity.zip.


True

# Get the sentence corpus and look at sample sentences

In [4]:
# get the sentence corpus and look at some sentences
sentences = sentence_polarity.sents()
print(len(sentences))
print(type(sentences))
print(sentence_polarity.categories())
# sentences are already tokenized, print the first four sentences
for sent in sentences[:4]:
    print(sent)

10662
<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
['neg', 'pos']
['simplistic', ',', 'silly', 'and', 'tedious', '.']
["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.']
['exploitative', 'and', 'largely', 'devoid', 'of', 'the', 'depth', 'or', 'sophistication', 'that', 'would', 'make', 'watching', 'such', 'a', 'graphic', 'treatment', 'of', 'the', 'crimes', 'bearable', '.']
['[garbus]', 'discards', 'the', 'potential', 'for', 'pathological', 'study', ',', 'exhuming', 'instead', ',', 'the', 'skewed', 'melodrama', 'of', 'the', 'circumstantial', 'situation', '.']


In [5]:
# look at the sentences by category to see how many positive and negative
pos_sents = sentence_polarity.sents(categories='pos')
print(len(pos_sents))
neg_sents = sentence_polarity.sents(categories='neg')
print(len(neg_sents))

5331
5331


# Set up Classification

In [6]:
## setup the movie reviews sentences for classification
# create a list of documents, each document is one sentence as a list of words paired with category
documents = [(sent, cat) for cat in sentence_polarity.categories() 
	for sent in sentence_polarity.sents(categories=cat)]

In [7]:
# look at the first and last documents - consists of all the words in the review
# followed by the category
print(documents[0])
print(documents[-1])
# randomly reorder documents
random.shuffle(documents)

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'neg')
(['provides', 'a', 'porthole', 'into', 'that', 'noble', ',', 'trembling', 'incoherence', 'that', 'defines', 'us', 'all', '.'], 'pos')


# get all words from all movie_reviews and put into a frequency distribution

In [8]:
#   note lowercase, but no stemming or stopwords
all_words_list = [word for (sent,cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
# get the 2000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(2000)
word_features = [word for (word,count) in word_items]
print(word_features[:50])

['.', 'the', ',', 'a', 'and', 'of', 'to', 'is', 'in', 'that', 'it', 'as', 'but', 'with', 'film', 'this', 'for', 'its', 'an', 'movie', "it's", 'be', 'on', 'you', 'not', 'by', 'about', 'more', 'one', 'like', 'has', 'are', 'at', 'from', 'than', '"', 'all', '--', 'his', 'have', 'so', 'if', 'or', 'story', 'i', 'too', 'just', 'who', 'into', 'what']


# define features (keywords) of a document for a BOW/unigram baseline each feature is 'contains(keyword)' and is true or false depending on whether that keyword is in the document

In [9]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [10]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]

# the feature sets are 2000 words long so you may not want to look at one
# featuresets[0]

# Train Naïve Bayes Classifier

In [11]:
# training using naive Baysian classifier, training set is approximately 90% of data
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [12]:
# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)

0.74

In [13]:
# show which features of classifier are most informative
classifier.show_most_informative_features(30)

Most Informative Features
                V_boring = True              neg : pos    =     30.3 : 1.0
             V_wonderful = True              pos : neg    =     21.7 : 1.0
            V_engrossing = True              pos : neg    =     20.4 : 1.0
              V_powerful = True              pos : neg    =     17.8 : 1.0
               V_generic = True              neg : pos    =     15.0 : 1.0
                  V_dull = True              neg : pos    =     14.5 : 1.0
               V_routine = True              neg : pos    =     14.3 : 1.0
              V_supposed = True              neg : pos    =     14.3 : 1.0
            V_refreshing = True              pos : neg    =     13.7 : 1.0
                  V_flat = True              neg : pos    =     13.4 : 1.0
                    V_90 = True              neg : pos    =     11.7 : 1.0
                 V_stale = True              neg : pos    =     11.7 : 1.0
             V_inventive = True              pos : neg    =     11.0 : 1.0

# Provided tff Lexicon

In [16]:
# create your own path to the subjclues file
SLpath = "./subjclueslen1-HLTEMNLP05.tff"

SL = Subjectivity.readSubjectivity(SLpath)

In [17]:
# how many words are in the dictionary
len(SL.keys())

6885

In [18]:
# look at words in the dictionary
print(SL['absolute'])
print(SL['shabby'])
# note what happens if the word is not there
print(SL['dog'])

['strongsubj', 'adj', False, 'neutral']
['strongsubj', 'adj', False, 'negative']


KeyError: 'dog'

In [19]:
# use multiple assignment to get the 4 items
strength, posTag, isStemmed, polarity = SL['absolute']
print(polarity)

neutral


In [145]:
# define features that include word counts of subjectivity words
# negative feature will have number of weakly negative words +
#    2 * number of strongly negative words
# positive feature has similar definition
#    not counting neutral words
def SL_features(document, word_features, SL):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # count variables for the 4 classes of subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positivecount'] = weakPos + (2 * strongPos)
            features['negativecount'] = weakNeg + (2 * strongNeg)      
    return features

In [21]:
SL_featuresets = [(SL_features(d, word_features, SL), c) for (d, c) in documents]

In [22]:
# show document 0 and just the two sentiment lexicon features 
print(documents[0])
print(SL_featuresets[0][0]['positivecount'])
print(SL_featuresets[0][0]['negativecount'])

(['unofficially', ',', 'national', "lampoon's", 'van', 'wilder', 'is', 'son', 'of', 'animal', 'house', '.', 'officially', ',', 'it', 'is', 'twice', 'as', 'bestial', 'but', 'half', 'as', 'funny', '.'], 'neg')
2
2


In [23]:
# this gives the label of document 0
print(SL_featuresets[0][1])
# number of features for document 0
print(len(SL_featuresets[0][0].keys()))

neg
2002


In [24]:
# retrain the classifier using these features
train_set, test_set = SL_featuresets[1000:], SL_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.748

In [25]:
for sent in list(sentences)[:50]:
    for word in sent:
        if (word.endswith("n't")):
            print(sent)

['there', 'is', 'a', 'difference', 'between', 'movies', 'with', 'the', 'courage', 'to', 'go', 'over', 'the', 'top', 'and', 'movies', 'that', "don't", 'care', 'about', 'being', 'stupid']
['a', 'farce', 'of', 'a', 'parody', 'of', 'a', 'comedy', 'of', 'a', 'premise', ',', 'it', "isn't", 'a', 'comparison', 'to', 'reality', 'so', 'much', 'as', 'it', 'is', 'a', 'commentary', 'about', 'our', 'knowledge', 'of', 'films', '.']
['i', "didn't", 'laugh', '.', 'i', "didn't", 'smile', '.', 'i', 'survived', '.']
['i', "didn't", 'laugh', '.', 'i', "didn't", 'smile', '.', 'i', 'survived', '.']
['most', 'of', 'the', 'problems', 'with', 'the', 'film', "don't", 'derive', 'from', 'the', 'screenplay', ',', 'but', 'rather', 'the', 'mediocre', 'performances', 'by', 'most', 'of', 'the', 'actors', 'involved']
['the', 'lack', 'of', 'naturalness', 'makes', 'everything', 'seem', 'self-consciously', 'poetic', 'and', 'forced', '.', '.', '.', "it's", 'a', 'pity', 'that', "[nelson's]", 'achievement', "doesn't", 'match'

In [26]:
# this list of negation words includes some "approximate negators" like hardly and rarely
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [27]:
# One strategy with negation words is to negate the word following the negation word
#   other strategies negate all words up to the next punctuation
# Strategy is to go through the document words in order adding the word features,
#   but if the word follows a negation words, change the feature to negated word
# Start the feature set with all 2000 word features and 2000 Not word features set to false
def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features

In [28]:
# define the feature sets
NOT_featuresets = [(NOT_features(d, word_features, negationwords), c) for (d, c) in documents]
# show the values of a couple of example features
print(NOT_featuresets[0][0]['V_NOTcare'])
print(NOT_featuresets[0][0]['V_always'])

False
False


In [29]:
train_set, test_set = NOT_featuresets[1000:], NOT_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.782

# Modularize and Generalize Code - Gould

### Define Sample Text

In [37]:
sample = """
SRT010G900 overlap with 0305900SRT0807E00 overlap with 0305900SRT0706Z00 \
overlap with 0305900SRT0807E00 overlap with 010G900SRT0706Z00 overlap with \
010G900steam cleaned engine added dye and ran truck at high idle found gear \
cover leaking removed hood and bumper drained coolant recovered Freon removed \
coolant reservoir, ps reservoir, both radiator support, upper and lower rad hoses, \
radiator, ac compressor and bracket, alternator, fan, fan shroud, fan hub, removed \
and resealed gear cover reinstalled all removed parts refilled coolant and Freon ran \
truck at high idle no leaks repair completeOIL LEAK EXTERNALUPPER GEAR COVER GASKETLEAKS \
EPR Part Number:430716600 OIL1045962 THURSDAY 31OCT2019 05:00:47 AM
"""

### Tokenize

In [96]:
doc = tokenizer(sample.lower()) #NOTE LOWERCASE!!
tokens = [word.text for word in doc]
print(len(tokens))

109


### Create Frequency Distribution of all Words in Document

In [50]:
all_words_JG = nltk.FreqDist(tokens)
word_items_JG = all_words_JG.most_common(2000)
word_features_JG = [word for (word,count) in word_items_JG]

### Create SL Feature Set

In [146]:
SL_featuresets = [SL_features(tokens, word_features_JG, SL)]

### Provide Supplemental Negation Words and Negate Any Words in Doc After a Negation

In [57]:
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [83]:
NOT_featuresets = [NOT_features(tokens, word_features_JG, negationwords)]

In [109]:
def prep_inference(featuresets):
    for fs in featuresets:
        internal = []
        for key in fs:
            internal.append({key : fs[key]})
    
    return internal

In [147]:
internal = prep_inference(SL_featuresets)[:-2]
print("Num tokens", len(internal), "\n")
print(internal)

Num tokens 66 

[{'V_,': True}, {'V_and': True}, {'V_overlap': True}, {'V_with': True}, {'V_removed': True}, {'V_gear': True}, {'V_cover': True}, {'V_coolant': True}, {'V_fan': True}, {'V_\n': True}, {'V_0305900srt0807e00': True}, {'V_ran': True}, {'V_truck': True}, {'V_at': True}, {'V_high': True}, {'V_idle': True}, {'V_freon': True}, {'V_reservoir': True}, {'V_radiator': True}, {'V_srt010g900': True}, {'V_0305900srt0706z00': True}, {'V_010g900srt0706z00': True}, {'V_010g900steam': True}, {'V_cleaned': True}, {'V_engine': True}, {'V_added': True}, {'V_dye': True}, {'V_found': True}, {'V_leaking': True}, {'V_hood': True}, {'V_bumper': True}, {'V_drained': True}, {'V_recovered': True}, {'V_ps': True}, {'V_both': True}, {'V_support': True}, {'V_upper': True}, {'V_lower': True}, {'V_rad': True}, {'V_hoses': True}, {'V_ac': True}, {'V_compressor': True}, {'V_bracket': True}, {'V_alternator': True}, {'V_shroud': True}, {'V_hub': True}, {'V_resealed': True}, {'V_reinstalled': True}, {'V_all'

### Use NLTK Classifier

In [162]:
preds = classifier.classify_many(internal)
print(preds)

['pos', 'pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg']


### Data Output
Attach polarity to tokens...mapping

In [171]:
def output_formatting(preds, internal):
    output = {}
    negatives = []
    final = {}
    for pred, tok in zip(preds, internal):
        word = list(tok.keys())[0].replace("V_", "")
        output[word] = pred
        if pred == "neg":
            negatives.append(word)
    
    #Assemble final output
    final = {
        "negative_words" : negatives,
        "polarities"     : output
    }

    return final

In [172]:
output = output_formatting(preds, internal)
print(output)

{'negative_words': ['overlap', 'removed', 'gear', 'cover', 'coolant', '\n', '0305900srt0807e00', 'ran', 'truck', 'high', 'idle', 'freon', 'reservoir', 'radiator', 'srt010g900', '0305900srt0706z00', '010g900srt0706z00', '010g900steam', 'cleaned', 'engine', 'added', 'dye', 'leaking', 'hood', 'bumper', 'drained', 'recovered', 'ps', 'support', 'upper', 'lower', 'rad', 'hoses', 'ac', 'compressor', 'bracket', 'alternator', 'shroud', 'hub', 'resealed', 'reinstalled', 'all', 'refilled', 'leaks', 'repair', 'completeoil', 'leak', 'externalupper', 'gasketleaks', 'epr', 'number:430716600', 'oil1045962', 'thursday', '31oct2019', '05:00:47', 'am'], 'polarities': {',': 'pos', 'and': 'pos', 'overlap': 'neg', 'with': 'pos', 'removed': 'neg', 'gear': 'neg', 'cover': 'neg', 'coolant': 'neg', 'fan': 'pos', '\n': 'neg', '0305900srt0807e00': 'neg', 'ran': 'neg', 'truck': 'neg', 'at': 'pos', 'high': 'neg', 'idle': 'neg', 'freon': 'neg', 'reservoir': 'neg', 'radiator': 'neg', 'srt010g900': 'neg', '0305900srt0

### Write Model to Disk

In [158]:
with open('polarity.pk', 'wb') as fout:
    pickle.dump(classifier, fout)

### Use Model from Disk

In [159]:
with open('polarity.pk', 'rb') as fin:
    polarity_nltk = pickle.load(fin)

In [163]:
preds_loaded = polarity_nltk.classify_many(internal)
preds == preds_loaded

True

### Create a Function
1. Tokenize
2. Frequency distribution
3. SL feature set
4. Prep for inference
5. Run model
6. Map preds and tokens

In [None]:
#Model Dependencies
SLpath = "./subjclueslen1-HLTEMNLP05.tff"
SL = Subjectivity.readSubjectivity(SLpath)

model_path = "./polarity.pk"
with open(model_path, 'rb') as fin:
    polarity_nltk = pickle.load(fin)

def norma_polarity(params):
    #Unpack Parameters
    text = params["text"].lower()
    
    #Tokenize Document
    doc = tokenizer(text)
    tokens = [word.text for word in doc]
    
    #Create Frequency Distribution
    all_words = nltk.FreqDist(tokens)
    word_items = all_words.most_common(5000)
    word_features = [word for (word, count) in word_items]
    
    #SL Fecture Set
    SL_featuresets = [SL_features(tokens, word_features, SL)]
    
    #Prep Feature for Inference
    internal = prep_inference(SL_featuresets)[:-2]
    
    #Run Model
    preds = polarity_nltk.classify_many(internal)
    
    #Map Predictions and Document Tokens
    output = output_formatting(preds, internal)
    
    return output