In [13]:
import nltk
nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/shalomdurga/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.


True

In [30]:
import pandas as pd
from nltk.corpus import opinion_lexicon
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import numpy as np
import altair as alt

#### Create Dictionary-based sentiment analyzer.



In [31]:
#Load the Data

df = pd.read_csv("DATA/raw/review_corpus.tsv", sep="\t")

In [32]:
df.head()

Unnamed: 0,rating,review
0,1.0,Yet another garbage CoD game. Zombies is unpla...
1,1.0,$80? .... No way. This is NOT worth $80. $80?....
2,1.0,One of the worst games ever. I bought and down...
3,1.0,I did a lot of homework before I decided to by...
4,1.0,"I am really into RPG games, I loved Skyrim, Bo..."


In [33]:
ratings = list(df['rating'])
reviews = list(df['review'])

In [34]:
positive_wds = set(opinion_lexicon.positive())
negative_wds = set(opinion_lexicon.negative())

In [35]:
print("Length of positive words : {}".format(len(positive_wds)))

Length of positive words : 2006


In [36]:
print("Length of negative words : {}".format(len(negative_wds)))

Length of negative words : 4783


In [37]:
negative_wds

{'invidiousness',
 'vanity',
 'injure',
 'subversion',
 'sore',
 'uncivil',
 'bleakly',
 'incomplete',
 'miscalculate',
 'harasses',
 'struggle',
 'unavailable',
 'sorrow',
 'overstatement',
 'niggle',
 'illicit',
 'anti-proliferation',
 'brutalize',
 'detracts',
 'despot',
 'offensiveness',
 'uncivilized',
 'pessimistically',
 'wily',
 'bias',
 'humming',
 'fainthearted',
 'insolvent',
 'boil',
 'suicide',
 'volatility',
 'fussy',
 'plight',
 'oversize',
 'breakups',
 'astray',
 'frightening',
 'uglier',
 'uncompromising',
 'cliche',
 'precariously',
 'misleadingly',
 'unjustified',
 'misdirection',
 'wrath',
 'eccentricity',
 'uneasiness',
 'inhumanity',
 'fanatic',
 'sullen',
 'evasion',
 'lier',
 'sobering',
 'stooges',
 'absurdity',
 'ostracize',
 'prideful',
 'vulnerable',
 'unprepared',
 'fiasco',
 'rocky',
 'joke',
 'taunt',
 'lorn',
 'noisy',
 'weaker',
 'sorely',
 'adverse',
 'solicitude',
 'prohibitive',
 'deviation',
 'deform',
 'incitement',
 'tempest',
 'shark',
 'ashamed

In [51]:
def score_sent(sent):
    """Returns a score btw -1 and 1"""
    sent = [e.lower() for e in sent if e.isalnum()]
    total = len(sent)
    pos = len([e for e in sent if e in positive_wds])
    neg = len([e for e in sent if e in negative_wds])
    if total > 0:
        return (pos - neg) / total
    else:
        return 0

In [68]:
sent = ['Yet', 'another', 'garbage', 'CoD', 'game', '.']

In [69]:
sent = [e.lower() for e in sent if e.isalnum()]

In [71]:
total = len(sent)
print(total)

5


In [72]:
pos = len([e for e in sent if e in positive_wds])
neg = len([e for e in sent if e in negative_wds])



In [74]:
[e for e in sent if e in negative_wds]

['garbage']

In [66]:
def score_review(review):
    sentiment_scores = []
    sents = sent_tokenize(review)
    for sent in sents:
        wds = word_tokenize(sent)
        sent_scores = score_sent(wds)
        sentiment_scores.append(sent_scores)
    return sum(sentiment_scores) / len(sentiment_scores)
    

In [58]:
reviews[0]

"Yet another garbage CoD game. Zombies is unplayable because they're too cheap to give us dedicated servers. Someone leaves? GG because everyone is getting DC'd. Jetpacks are back and just as lame and boring as ever. Advanced Warfare 2."

In [67]:
sentiment_scores = []
for sent in sent_tokenize(reviews[0]):
    wds = word_tokenize(sent)
    print(wds)
    sentiment_scores.append(score_sent(wds))

sum(sentiment_scores) / len(sentiment_scores)





['Yet', 'another', 'garbage', 'CoD', 'game', '.']
['Zombies', 'is', 'unplayable', 'because', 'they', "'re", 'too', 'cheap', 'to', 'give', 'us', 'dedicated', 'servers', '.']
['Someone', 'leaves', '?']
['GG', 'because', 'everyone', 'is', 'getting', 'DC', "'d", '.']
['Jetpacks', 'are', 'back', 'and', 'just', 'as', 'lame', 'and', 'boring', 'as', 'ever', '.']
['Advanced', 'Warfare', '2', '.']


-0.008080808080808086

In [53]:
review_sentiments = [score_review(e) for e in reviews ]

In [54]:
df = pd.DataFrame({'rating' : ratings,
             'review' : reviews,
             "review dictionary based sentiment": review_sentiments
             })

In [75]:
with open('DATA/processed/dictionary_based_sentiment.tsv','w') as outfile:
    outfile.write(df.to_csv(index= False,sep = "\t"))

#### Exploratory Data Analysis

In [76]:
from collections import Counter


In [78]:
rating_counts = Counter(ratings)

In [80]:
rating_counts

Counter({1.0: 1000, 2.0: 1000, 3.0: 1000, 4.0: 1000, 5.0: 1000})

In [92]:
data1 = pd.DataFrame({
    "ratings" : [str(e) for e in rating_counts.keys() ],
    "counts" : rating_counts.values()})

In [94]:
chart1 = alt.Chart(data1).mark_bar().encode(x= "ratings",y = "counts")

In [96]:
chart1.save('plots/01/ratings_counts.html')

In [97]:
hist, bin_edges = np.histogram(review_sentiments,density=True) 

In [100]:
labels = list(zip(bin_edges, bin_edges[1:]))
labels = [(str(e[0]), str(e[1])) for e in labels]
labels = [" ".join(e) for e in labels]

In [102]:
list(zip(bin_edges, bin_edges[1:]))

[(-1.0, -0.8),
 (-0.8, -0.6),
 (-0.6, -0.3999999999999999),
 (-0.3999999999999999, -0.19999999999999996),
 (-0.19999999999999996, 0.0),
 (0.0, 0.20000000000000018),
 (0.20000000000000018, 0.40000000000000013),
 (0.40000000000000013, 0.6000000000000001),
 (0.6000000000000001, 0.8),
 (0.8, 1.0)]

In [104]:
data2 = pd.DataFrame({"sentiment scores": labels, "counts": hist})
data2

Unnamed: 0,sentiment scores,counts
0,-1.0 -0.8,0.001
1,-0.8 -0.6,0.003
2,-0.6 -0.3999999999999999,0.015
3,-0.3999999999999999 -0.19999999999999996,0.087
4,-0.19999999999999996 0.0,1.451
5,0.0 0.20000000000000018,2.949
6,0.20000000000000018 0.40000000000000013,0.416
7,0.40000000000000013 0.6000000000000001,0.063
8,0.6000000000000001 0.8,0.011
9,0.8 1.0,0.004


In [105]:
chart2 = (
    alt.Chart(data2)
    .mark_bar()
    .encode(x=alt.X("sentiment scores", sort=labels), y="counts")
)

In [106]:
chart2

In [107]:
chart2.save("plots/01/review_sentiments.html")

In [108]:
# is there any relationship btw review scores and sentiments?


In [112]:
source = pd.DataFrame(
    {"ratings": [str(e) for e in ratings], "sentiments": review_sentiments}
)
source

Unnamed: 0,ratings,sentiments
0,1.0,-0.008081
1,1.0,0.033333
2,1.0,-0.154938
3,1.0,0.039000
4,1.0,-0.161413
...,...,...
4995,5.0,0.333333
4996,5.0,0.142857
4997,5.0,0.062630
4998,5.0,0.055556


In [110]:
chart4 = (
    alt.Chart(source)
    .mark_circle(size=60)
    .encode(
        x="ratings", y="sentiments", color="ratings", tooltip=["ratings", "sentiments"]
    )
    .interactive()
)


In [111]:
chart4

In [113]:
chart4.save("plots/01/reviews_ratings_vs_sentiment.html")


 # test correlation

In [114]:
from scipy.stats import pearsonr , spearmanr

corr1,_ = pearsonr(ratings,review_sentiments)

In [116]:
corr1

0.4968631991608496

In [117]:
scor1, _ = spearmanr(ratings, review_sentiments)

print(scor1)

0.557482836188016


In [118]:
from nltk.sentiment.util import mark_negation


t = "I received these on time and no problems. No damages battlfield never fails"
print(mark_negation(t.split()))

['I', 'received', 'these', 'on', 'time', 'and', 'no', 'problems._NEG', 'No_NEG', 'damages_NEG', 'battlfield_NEG', 'never_NEG', 'fails_NEG']
