# Creating a Dictionary-based Sentiment Analyzer

In [1]:
import pandas as pd
import nltk
from IPython.display import display
pd.set_option('display.max_columns', None)

### Step 1: Loading in the small_corpus .csv file created in the "creating_dataset" milestone.

In [2]:
reviews = pd.read_csv("../data/sample_corpus.csv")

In [3]:
reviews.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,False,"01 5, 2001",A1NUEOW1WLQARL,B00004WI4D,Third Shift,"You heard right, this game has no SaveGame. It...","Some good ideas, but No Save Game & other prob...",978652800,2.0,,
1,1.0,False,"08 23, 2016",A3U4WRQMUFFQDS,B00ZQB28XK,J. S. Harvey,Wonder why no one is returning NMS?? sound pr...,sound pretty bad everyone here and youtube say...,1471910400,,,
2,1.0,True,"09 21, 2015",A1AGVUZU41WHDH,B00129I75I,Mom J.W.,The cords were for the X-Box 360 instead of th...,Wrong cords,1442793600,,,
3,1.0,False,"05 19, 2015",A2TCG2HV1VJP6V,B000SFK0SE,Ryan Sil. (Gamer &amp; PC/Android indie dev),Why am I the first person on Amazon to give th...,Where's the imagination?,1431993600,,,
4,1.0,True,"01 1, 2013",A38FTSF7HHCII,B007Z3UUF0,Jack Pacini,I really wanted to like this game. I'm a huge ...,Wow! How could the developers find this game fun?,1356998400,5.0,{'Format:': ' Video Game'},


### Step 2: Tokenizing the sentences and words of the reviews
Here, We're going to test different versions of word tokenizer on reviews. We'll then decide which tokenizer might be better to use.

### Treebank Word Tokenizer

In [4]:
from nltk.tokenize import TreebankWordTokenizer
from string import punctuation
import string

In [5]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
tb_tokenizer = TreebankWordTokenizer()

In [7]:
reviews["rev_text_lower"] = reviews['reviewText'].apply(lambda rev: str(rev)\
                                                        .translate(str.maketrans('', '', punctuation))\
                                                        .replace("<br />", " ")\
                                                        .lower())

In [8]:
reviews[['reviewText','rev_text_lower']].sample(2)

Unnamed: 0,reviewText,rev_text_lower
1746,Math has never been my strongest suit so I tho...,math has never been my strongest suit so i tho...
2552,works good,works good


In [9]:
reviews["tb_tokens"] = reviews['rev_text_lower'].apply(lambda rev: tb_tokenizer.tokenize(str(rev)))

In [10]:
reviews[['reviewText', 'rev_text_lower', 'tb_tokens']].sample(3)

Unnamed: 0,reviewText,rev_text_lower,tb_tokens
405,Weird tech glitches right out of the box. I ca...,weird tech glitches right out of the box i can...,"[weird, tech, glitches, right, out, of, the, b..."
34,"Had weird sexual thing in the middle, graphic ...",had weird sexual thing in the middle graphic c...,"[had, weird, sexual, thing, in, the, middle, g..."
3449,Love it!! Best buy ever!! Great seller!!,love it best buy ever great seller,"[love, it, best, buy, ever, great, seller]"


### Casual Tokenizer

In [11]:
from nltk.tokenize.casual import casual_tokenize

In [12]:
reviews['casual_tokens'] = reviews['rev_text_lower'].apply(lambda rev: casual_tokenize(str(rev)))

In [13]:
reviews[['reviewText','casual_tokens','tb_tokens']].sample(3)

Unnamed: 0,reviewText,casual_tokens,tb_tokens
2348,Decent price and quality.\nFeels good in the h...,"[decent, price, and, quality, feels, good, in,...","[decent, price, and, quality, feels, good, in,..."
4182,They have fixed everything wrong with the alre...,"[they, have, fixed, everything, wrong, with, t...","[they, have, fixed, everything, wrong, with, t..."
3463,Assassin's Creed is one of those series that i...,"[assassins, creed, is, one, of, those, series,...","[assassins, creed, is, one, of, those, series,..."


### Removing StopWords
This part has been remvoed as removing stop words is not good for sentiment analysis at all!!

In [14]:
#nltk.download('stopwords')

In [15]:
#stop_words = nltk.corpus.stopwords.words('english')

In [16]:
#stop_words.remove("no")

In [17]:
#stop_words.remove("not")

In [18]:
#print(stop_words)

In [19]:
#"not" in stop_words

In [20]:
#len(stop_words)

In [21]:
#from string import punctuation
#print(punctuation)

In [22]:
#reviews['tokens_nosw'] = reviews['tb_tokens'].\
#    apply(lambda words: [w for w in words if w not in stop_words and w not in punctuation and w != ""])

In [23]:
#reviews[['tb_tokens','tokens_nosw']].sample(3)

### Stemming

In [24]:
from nltk.stem.porter import PorterStemmer

In [25]:
stemmer = PorterStemmer()

In [26]:
reviews['tokens_stemmed'] = reviews['tb_tokens'].apply(lambda words: [stemmer.stem(w) for w in words])

In [27]:
reviews[['tb_tokens','tokens_stemmed']].sample(3)

Unnamed: 0,tb_tokens,tokens_stemmed
1350,"[save, your, money, and, get, the, eforcity, o...","[save, your, money, and, get, the, eforc, one,..."
3696,"[my, son, got, this, a, few, weeks, ago, and, ...","[my, son, got, thi, a, few, week, ago, and, pa..."
3289,"[one, of, the, best, dragonball, z, games, yey...","[one, of, the, best, dragonbal, z, game, yey, ..."


### Lemmatisation

In [28]:
nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('averaged_perceptron_tagger')

nltk.download('universal_tagset')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rickylam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/rickylam/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rickylam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/rickylam/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [29]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

In [30]:
swn

<SentiWordNetCorpusReader in '/Users/rickylam/nltk_data/corpora/sentiwordnet'>

In [31]:
def penn_to_wn(tag):
    """
        Convert between the PennTreebank tags to simple Wordnet tags
        PennTreebank tags:
        https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        simple Wordnet tags:
        https://www.nltk.org/_modules/nltk/tag/mapping.html
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

In [32]:
lemmatizer = WordNetLemmatizer()
def get_lemas(tokens):
    lemmas = []
    for token in tokens:
        pos = penn_to_wn(pos_tag([token])[0][1])
        if pos:
            lemma = lemmatizer.lemmatize(token, pos)
            if lemma:
                lemmas.append(lemma)
    return lemmas

In [33]:
def get_lemas_breakdown(tokens):
    lemmas = []
    for token in tokens:
        print('=====')
        print('>>>')
        print(token)
        print(pos_tag([token]))
#         universal, wsj, brown
        print(pos_tag([token])[0])
        print(pos_tag([token])[0][1])
        pos = penn_to_wn(pos_tag([token])[0][1])
        if pos:
            print('>>')
            lemma = lemmatizer.lemmatize(token, pos)
            print(lemma)
            if lemma:
                print('>')
                lemmas.append(lemma)
    return lemmas

In [34]:
reviews['tb_tokens'][4222]

['a',
 'great',
 'game',
 'to',
 'pass',
 'time',
 'and',
 'have',
 'tons',
 'of',
 'fun',
 'doing',
 'it',
 'the',
 'mini',
 'games',
 'are',
 'alot',
 'of',
 'fun',
 'on',
 'the',
 'ds']

In [35]:
get_lemas_breakdown(reviews['tb_tokens'][4222])

=====
>>>
a
[('a', 'DT')]
('a', 'DT')
DT
=====
>>>
great
[('great', 'JJ')]
('great', 'JJ')
JJ
>>
great
>
=====
>>>
game
[('game', 'NN')]
('game', 'NN')
NN
>>
game
>
=====
>>>
to
[('to', 'TO')]
('to', 'TO')
TO
=====
>>>
pass
[('pass', 'NN')]
('pass', 'NN')
NN
>>
pas
>
=====
>>>
time
[('time', 'NN')]
('time', 'NN')
NN
>>
time
>
=====
>>>
and
[('and', 'CC')]
('and', 'CC')
CC
=====
>>>
have
[('have', 'VB')]
('have', 'VB')
VB
>>
have
>
=====
>>>
tons
[('tons', 'NNS')]
('tons', 'NNS')
NNS
>>
ton
>
=====
>>>
of
[('of', 'IN')]
('of', 'IN')
IN
=====
>>>
fun
[('fun', 'NN')]
('fun', 'NN')
NN
>>
fun
>
=====
>>>
doing
[('doing', 'VBG')]
('doing', 'VBG')
VBG
>>
do
>
=====
>>>
it
[('it', 'PRP')]
('it', 'PRP')
PRP
=====
>>>
the
[('the', 'DT')]
('the', 'DT')
DT
=====
>>>
mini
[('mini', 'NN')]
('mini', 'NN')
NN
>>
mini
>
=====
>>>
games
[('games', 'NNS')]
('games', 'NNS')
NNS
>>
game
>
=====
>>>
are
[('are', 'VBP')]
('are', 'VBP')
VBP
>>
be
>
=====
>>>
alot
[('alot', 'NN')]
('alot', 'NN')
NN
>>
alot
>
=

['great',
 'game',
 'pas',
 'time',
 'have',
 'ton',
 'fun',
 'do',
 'mini',
 'game',
 'be',
 'alot',
 'fun',
 'd']

In [36]:
reviews['lemmas'] = reviews['tb_tokens'].apply(lambda tokens: get_lemas(tokens))

In [37]:
reviews[['reviewText','tokens_stemmed','lemmas']].sample(2)

Unnamed: 0,reviewText,tokens_stemmed,lemmas
2102,Though I use the controller at no more than 1 ...,"[though, i, use, the, control, at, no, more, t...","[i, use, controller, more, meter, away, foot, ..."
1929,Meh.....,[meh],[meh]


### Sentiment Predictor Baseline Model

In [38]:
def get_sentiment_score(tokens):
    score = 0
    tags = pos_tag(tokens)
    for word, tag in tags:
        wn_tag = penn_to_wn(tag)
        if not wn_tag:
            continue
        synsets = wn.synsets(word, pos=wn_tag)
        if not synsets:
            continue
        
        #most common set:
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())
        
        score += (swn_synset.pos_score() - swn_synset.neg_score())
        
    return score
                    

In [39]:
## test
swn.senti_synset(wn.synsets("Mary", wn.NOUN)[0].name()).neg_score()

0.0

In [40]:
wn.synsets("Mary", wn.NOUN)[0].definition()

'the mother of Jesus; Christians refer to her as the Virgin Mary; she is especially honored by Roman Catholics'

In [41]:
reviews['sentiment_score'] = reviews['lemmas'].apply(lambda tokens: get_sentiment_score(tokens))

In [42]:
reviews[['reviewText','lemmas','sentiment_score']].sample(5)

Unnamed: 0,reviewText,lemmas,sentiment_score
3325,Excellent game!!!!,"[excellent, game]",1.0
1303,I just do not understand why Amazon does not r...,"[i, just, do, not, understand, amazon, do, not...",0.125
3527,I could go on and on about how much fun this g...,"[i, go, much, fun, game, be, i, save, time, ju...",2.5
2545,Never was a tomb raider fan but after playing ...,"[never, be, tomb, raider, fan, play, i, say, g...",2.125
3371,The review score of this game is one of the mo...,"[review, score, game, be, most, eloquent, exam...",5.069


In [43]:
reviews[reviews['reviewText'].str.len()<=100][['reviewText','lemmas','sentiment_score']].sample(5)

Unnamed: 0,reviewText,lemmas,sentiment_score
3788,awesome i love destroying stuff its fun to pla...,"[awesome, i, love, destroy, stuff, fun, play, ...",2.0
2371,Kinda difficult to install correctly (eg no bu...,"[kinda, difficult, install, correctly, eg, bub...",0.125
2109,good product but i feel this should've been in...,"[good, product, i, feel, shouldve, be, include...",0.625
1937,OK,[ok],0.0
4276,6 year old loves it,"[year, old, love]",1.0
