### Lexicon Knowledge Extraction with Sentiment Polarity Computation 

In [261]:
import nltk
from nltk.classify.scikitlearn import SklearnClassifier
import pandas as pd
from nltk.classify import ClassifierI
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re
import collections
from nltk.stem.porter import *

In [262]:
all_words = {}
documents = []

In [263]:
train = pd.read_csv('amazon_cells_labelled.csv')


In [264]:
train.Label.value_counts()

1    500
0    500
Name: Label, dtype: int64

### <i><b>Preprocessing the words:</b></i><br>

For each data in the selected text dataset:<br>
    Preprocess the data to remove noise - <b>Removed special characters and numbers</b><br>
    Reduce the word to its original form - <b>Lemmatization</b> <br>
    Segment the word if necessary - <b>Tokenization</b> <br>
    Tag the Part Of Speech - <b>POS tagging</b><br>
    Removed stop words and high frequency words<br>
<br>

In [265]:
def calc_polarityscore(sw,ocrp,ocrn):
    t1 = ocrn+1
    t2 = ocrp+1

    polarityscore = ((float(sw) ** float(t1)) - (float(sw) ** float(t2)) ) / (1 - float(sw))
    return polarityscore

In [266]:
stop_words = list(set(stopwords.words('english')))
st = WordNetLemmatizer()
freq = pd.Series(' '.join(train['Comment']).split()).value_counts()[:10]

sentiment = {}
tokenizedComment = []

def preprocessText(document):
    cleaned = re.sub(r'[^(a-zA-Z)\s]',' ', document)
    
    #cleaned = cleaned.lower()
    
    #lemmatize the comment
    lemmatized = st.lemmatize(cleaned)
    #stemmer = PorterStemmer()
    #lemmatized = stemmer.stem(cleaned)
    #convert the lemmatized comment into tokens
    tokenized = word_tokenize(lemmatized)
    
    #tagging the tokens
    #pos = nltk.pos_tag(tokenized)

    #Removing stop words
    stopped = [w for w in tokenized if not w in stop_words]
    
    #remove the high frequency words since they dont contribute to the classification 
    highfreq =  [w for w in stopped if not w in freq]
    
    return highfreq

In [267]:
train['tokenizedComment'] = train['Comment'].apply(lambda x: preprocessText(x))

In [268]:
train.head()

Unnamed: 0,Comment,Label,tokenizedComment
0,So there is no way for me to plug it in here i...,0,"[So, way, plug, US, unless, go, converter]"
1,Good case Excellent value.,1,"[Good, case, Excellent, value]"
2,Great for the jawbone.,1,"[Great, jawbone]"
3,Tied to charger for conversations lasting more...,0,"[Tied, charger, conversations, lasting, minute..."
4,The mic is great.,1,"[The, mic, great]"


In [269]:
train_positive = train.loc[train['Label']==1]
#dataset of all positive reviews
train_positive.head()
train_negative = train.loc[train['Label']==0]
#dataset of all negative reviews
train_negative.head()

Unnamed: 0,Comment,Label,tokenizedComment
0,So there is no way for me to plug it in here i...,0,"[So, way, plug, US, unless, go, converter]"
3,Tied to charger for conversations lasting more...,0,"[Tied, charger, conversations, lasting, minute..."
5,I have to jiggle the plug to get it to line up...,0,"[jiggle, plug, get, line, right, get, decent, ..."
6,If you have several dozen or several hundred c...,0,"[If, several, dozen, several, hundred, contact..."
8,Needless to say I wasted my money.,0,"[Needless, say, wasted, money]"


In [270]:
#Calculate the number of positive and negative documents for each token in the corpus

In [271]:
DFP = {}
#Document frequency for negative text
DFN = {}

#To find the word and no of positive documents that contain the word
for index, row in train_positive.iterrows():
    tokens = row['tokenizedComment']
    for w in tokens:
        try:
            DFP[w].add(index)
        except:
            DFP[w] = {index}

for index, row in train_negative.iterrows():
    tokens = row['tokenizedComment']
    for w in tokens:
        try:
            DFN[w].add(index)
        except:
            DFN[w] = {index}

#The above list gives document numbers, we need count of all documents that contain the word, so take len
for i in DFP:
    DFP[i] = len(DFP[i])

for i in DFN:
    DFN[i] = len(DFN[i])

In [272]:
def getOcr(word,posneg):

    if(posneg=='P'):
        if word in DFP:
            return DFP[word]
        else:
            return 0
        
    if(posneg=='N'):
        if word in DFN:
            return DFN[word]
        else:
            return 0

In [273]:
for index, row in train_positive.iterrows():
    
    comment = row["tokenizedComment"]
    
    #tagging the tokens
    pos = nltk.pos_tag(comment)
    
    #for each word that occurs in text, calculate the sentiment weight
    for w in pos:
         if w[1][0] in ('J'):
            sentiment[w[0]] = 0.5
         elif w[1][0] in ('VB'):
            sentiment[w[0]] = 0.333
         elif w[1][0] in ('NN'):
            sentiment[w[0]] = 0.2
         else:
            sentiment[w[0]] = 0
         
        
         ocrp = getOcr(w[0],'P')
         ocrn = getOcr(w[0],'N')
        
         ps = calc_polarityscore(sentiment[w[0]],ocrp,ocrn)
    
         all_words[w[0]] = ps


## List of words with their polarity

In [274]:
print(all_words)

{'Good': 0.999969482421875, 'case': 2.5599934464000013e-08, 'Excellent': 0.4998779296875, 'value': 0.24960000000000002, 'Great': 0.25, 'jawbone': 0.24, 'The': 0.0, 'mic': 0.04000000000000001, 'great': 0.03125, 'If': 0.0, 'Razr': 0.0, 'owner': 0.19999999999999998, 'must': 0.0, 'And': 0.0, 'sound': 1.0291341697585879e-07, 'quality': 4.095999999973159e-11, 'He': 0.0, 'impressed': 0.03125, 'going': -0.04922240732100001, 'original': 0.4375, 'battery': -2.0478689280000014e-10, 'extended': 0.443889, 'Very': 0.0, 'good': 0.00012207031249644729, 'though': 0.0, 'Highly': 0.19999999999999998, 'recommend': 0.0006807438258283551, 'one': 0.0, 'blue': 0.25, 'tooth': 0.0, 'phone': 2.950534578496966e-50, 'So': 0.0, 'Far': 0.19999999999999998, 'Works': 0.24999999995904, 'bought': -0.00390625, 'use': 0.0, 'Kindle': 0.19999999999999998, 'Fire': 0.24, 'absolutely': 0.0, 'loved': 0.33299999999999996, 'yet': 0.0, 'run': 0.443889, 'new': 0.005859375, 'two': 0.0, 'bars': -0.04800000000000001, 'three': 0.0, 'da

## Top 3 Positive words

In [275]:
sortedPosWords = sorted(all_words.items(), key=lambda x: x[1], reverse=True)

In [276]:
for i in range(3):
    print(sortedPosWords[i][0])

Good
nice
excellent


## Top 3 Negative words

In [277]:
sortedNegWords = sorted(all_words.items(), key=lambda x: x[1])

In [280]:
for i in range(3):
    print(sortedNegWords[i][0])

big
signal
turn


### The NLTK POS tagger's accuracy in tagging a word as objective is not very accurate. <br>The results of this lexicon based method can be greatly improved by better POS tagger