### Creating a Dictionary-based Sentiment Analyzer

In [36]:
import pandas as pd
import nltk
pd.set_option('display.max_columns', None)
import nltk


Step 1: Loading in the small_corpus .csv file created in the "creating_dataset" milestone.

In [37]:
reviews = pd.read_csv("/Users/lorenzo/Desktop/Sentiment-Analysis-NLP-for-Marketing/data/small_corpus.csv")

Step 2: Tokenizing the sentences and words of the reviews
Here, We're going to test different versions of word tokenizer on reviews. We'll then decide which tokenizer might be better to use.



## Treebank Word Tokenizer

In [38]:
from nltk.tokenize import TreebankWordTokenizer
from string import punctuation
import string

In [39]:
tb_tokenizer = TreebankWordTokenizer()

In [40]:
reviews["rev_text_lower"] = reviews['reviewText'].apply(lambda rev: str(rev)\
                                                        .translate(str.maketrans('', '', punctuation))\
                                                        .replace("<br />", " ")\
                                                        .lower())

In [41]:
reviews[['reviewText','rev_text_lower']].sample(2)

Unnamed: 0,reviewText,rev_text_lower
3698,These cables to the trick. My dog had eaten th...,these cables to the trick my dog had eaten the...
3728,I've been playing this game for 2 years single...,ive been playing this game for 2 years single ...


In [42]:
reviews["tb_tokens"] = reviews['rev_text_lower'].apply(lambda rev: tb_tokenizer.tokenize(str(rev)))

### Casual Tokenizer

In [44]:
from nltk.tokenize.casual import casual_tokenize

In [45]:
reviews['casual_tokens'] = reviews['rev_text_lower'].apply(lambda rev: casual_tokenize(str(rev)))

In [46]:
reviews[['reviewText','casual_tokens','tb_tokens']].sample(3)

Unnamed: 0,reviewText,casual_tokens,tb_tokens
741,I loved &quot;Max Payne&quot;. From the day I ...,"[i, loved, quotmax, paynequot, from, the, day,...","[i, loved, quotmax, paynequot, from, the, day,..."
2555,I purchased the GOW3 season pass and it was wo...,"[i, purchased, the, gow, 3, season, pass, and,...","[i, purchased, the, gow3, season, pass, and, i..."
208,Runescape's only deserving claims to fame are ...,"[runescapes, only, deserving, claims, to, fame...","[runescapes, only, deserving, claims, to, fame..."


### Stemming

In [47]:
from nltk.stem.porter import PorterStemmer

In [48]:
stemmer = PorterStemmer()

In [49]:
reviews['tokens_stemmed'] = reviews['tb_tokens'].apply(lambda words: [stemmer.stem(w) for w in words])

In [78]:
reviews[['tb_tokens','tokens_stemmed']].sample(3)

Unnamed: 0,tb_tokens,tokens_stemmed
3590,"[im, always, a, little, wary, when, i, buy, a,...","[im, alway, a, littl, wari, when, i, buy, a, n..."
3927,"[the, last, story, has, finally, hit, the, uni...","[the, last, stori, ha, final, hit, the, unit, ..."
907,"[hello, today, you, may, be, deciding, wether,...","[hello, today, you, may, be, decid, wether, to..."


### Lemmatisation

In [51]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

In [52]:
def penn_to_wn(tag):
    """
        Convert between the PennTreebank tags to simple Wordnet tags
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

In [58]:
lemmatizer = WordNetLemmatizer()
def get_lemmas(tokens):
    lemmas = []
    for token in tokens:
        pos = penn_to_wn(pos_tag([token])[0][1])
        if pos:
            lemma = lemmatizer.lemmatize(token, pos)
            if lemma:
                lemmas.append(lemma)
    return lemmas

0       [this, game, looks, good, and, the, story, is,...
1       [i, want, to, like, this, game, i, really, do,...
2       [this, game, is, horrible, the, best, part, of...
3       [i, loved, the, original, alice, although, i, ...
4       [i, was, very, excited, to, get, this, for, my...
                              ...                        
4495    [this, has, got, to, be, the, best, one, of, t...
4496    [this, game, is, very, good, there, are, lots,...
4497    [one, of, the, best, 5, games, aside, from, th...
4498    [some, of, you, might, be, wondering, is, the,...
4499    [my, son, has, managed, to, improve, his, guit...
Name: tb_tokens, Length: 4500, dtype: object

In [84]:
reviews['lemmas'] = reviews['tb_tokens'].apply(lambda tokens: get_lemmas(tokens))

In [83]:
reviews[['reviewText','tokens_stemmed','lemmas']].sample(2)

Unnamed: 0,reviewText,tokens_stemmed,lemmas
1710,"Being a huge Star Wars fan as a kid, and moder...","[be, a, huge, star, war, fan, as, a, kid, and,...","[be, huge, star, war, fan, kid, moderately, ad..."
872,The other reviews have discussed each major sh...,"[the, other, review, have, discuss, each, majo...","[other, review, have, discuss, major, shortcom..."


In [85]:
def get_sentiment_score(tokens):
    score = 0
    tags = pos_tag(tokens)
    for word, tag in tags:
        wn_tag = penn_to_wn(tag)
        if not wn_tag:
            continue
        synsets = wn.synsets(word, pos=wn_tag)
        if not synsets:
            continue
        
        #most common set:
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())
        
        score += (swn_synset.pos_score() - swn_synset.neg_score())
        
    return score
                    

Sentiment Predictor Baseline Model

In [86]:

## test
swn.senti_synset(wn.synsets("perfect", wn.ADJ)[0].name()).pos_score()

0.625

In [87]:
reviews['sentiment_score'] = reviews['lemmas'].apply(lambda tokens: get_sentiment_score(tokens))

In [88]:
reviews[['reviewText','lemmas','sentiment_score']].sample(5)

Unnamed: 0,reviewText,lemmas,sentiment_score
3592,"This is a funny game.In Play Station 3, you ne...","[be, funny, gamein, play, station, need, just,...",-0.5
3213,df fgf g hvg f dd d fg hyu jng ded r45 y ed e...,"[df, fgf, g, hvg, f, dd, d, fg, hyu, jng, ded,...",-1.0
3067,"I have played different shooters in the past, ...","[i, have, played, different, shooter, past, ga...",3.25
235,"Half the time it doesn't even turn on, and no,...","[half, time, doesnt, even, turn, not, finally,...",0.375
1777,EA apparently released this game before it was...,"[ea, apparently, release, game, be, finish, gr...",-0.125
