In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import utils
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import numpy as np

%matplotlib inline

In [2]:
# Fetch the VADER lexicon needed by SentimentIntensityAnalyzer.
# quiet=True suppresses the progress log, which otherwise prints the absolute
# local nltk_data path (including the OS user name) into the saved notebook.
nltk.download('vader_lexicon', quiet=True)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\j.martins\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Load the preprocessed reviews from disk into a DataFrame.
# NOTE(review): the loaded frame shows 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns written out by earlier to_csv calls -- consider
# saving with index=False upstream (TODO confirm).
path = './review-analysis-teamc/data/processed_reviews.csv'
proc_reviews = pd.read_csv(filepath_or_buffer=path)

In [4]:
# Preview the first five rows of the loaded reviews
proc_reviews.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,product_id,user_id,helpfulness_numerator,helpfulness_denominator,score,time,summary,text,date,year,sentiment,helpfulness_ratio,word_count,duplicated,preprocessed_text,lemmatized_text
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,2011-04-27,2011,positive,1.0,48,False,bought several vitality canned dog food produc...,bought several vitality canned dog food produc...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,2012-09-07,2012,negative,,31,False,product arrived labeled jumbo salted peanuts ....,product arrived labeled jumbo salted peanut .....
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,2008-08-18,2008,positive,1.0,94,False,confection around centuries light pillowy citr...,confection around century light pillowy citrus...
3,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,2012-10-21,2012,positive,,27,False,great taffy great price wide assortment yummy ...,great taffy great price wide assortment yummy ...
4,5,6,B006K2ZZ7K,ADT0SRK1MGOEU,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...,2012-07-12,2012,positive,,72,False,got wild hair taffy ordered five pound bag taf...,got wild hair taffy ordered five pound bag taf...


### Using Vader sentiment analyser

In [5]:
# Create the VADER analyzer once; the same instance scores every review below
sia = SentimentIntensityAnalyzer()

In [6]:
# Work on an independent copy holding only the columns used for VADER scoring
reviews_score = proc_reviews.loc[
    :, ['score', 'preprocessed_text', 'lemmatized_text', 'helpfulness_ratio']
].copy()

In [7]:
# Score each lemmatized review with VADER; every entry is the dict returned by
# sia.polarity_scores. Applying over the single text column avoids building a
# row Series per review, which DataFrame.apply(axis=1) does.
reviews_score['polarity_scores'] = reviews_score['lemmatized_text'].apply(sia.polarity_scores)

In [8]:
# Unpack each component of the VADER result dicts into its own column.
# One column-wise pass per key replaces four copy-pasted row-wise applies;
# .get() is kept so a missing key still yields None instead of raising.
for component in ('neg', 'pos', 'neu', 'compound'):
    reviews_score[component] = reviews_score['polarity_scores'].apply(
        lambda scores: scores.get(component)
    )

In [9]:
# Keep only the review score and the numeric polarity components for aggregation
polarities_scores = reviews_score.loc[:, ['score', 'neg', 'pos', 'neu', 'compound']]

In [10]:
# Mean negative-polarity component for each review score
polarities_scores.groupby('score')['neg'].mean()

score
1    0.127439
2    0.097876
3    0.074211
4    0.052139
5    0.042526
Name: neg, dtype: float64

In [11]:
# Mean positive-polarity component for each review score
polarities_scores.groupby('score')['pos'].mean()

score
1    0.174660
2    0.208367
3    0.240918
4    0.297121
5    0.339744
Name: pos, dtype: float64

In [12]:
# Mean compound polarity for each review score
polarities_scores.groupby('score')['compound'].mean()

score
1    0.194057
2    0.426615
3    0.589708
4    0.740983
5    0.781632
Name: compound, dtype: float64

In [13]:
# Spread (standard deviation) of the compound polarity within each review score
polarities_scores.groupby('score')['compound'].std()

score
1    0.600114
2    0.534285
3    0.452375
4    0.337266
5    0.291891
Name: compound, dtype: float64

In [14]:
# Full summary statistics of the compound polarity for each review score
polarities_scores.groupby('score')['compound'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,36299.0,0.194057,0.600114,-0.9947,-0.34,0.3182,0.7511,0.9997
2,20803.0,0.426615,0.534285,-0.9953,0.04105,0.6249,0.875,0.9992
3,29771.0,0.589708,0.452375,-0.9881,0.4211,0.7806,0.9228,0.9996
4,56089.0,0.740983,0.337266,-0.9951,0.6908,0.872,0.9473,0.9998
5,250902.0,0.781632,0.291891,-0.9994,0.7506,0.8885,0.9485,0.9999


The compound polarity ranges from -1 to 1, so we would expect negative (low-score) reviews to have clearly negative values. However, this is not the case: even reviews with a score of 1 have a positive mean compound polarity (about 0.19). This means that polarity alone is not enough to predict the score.

Looking at the standard deviation of the polarity in each group, we can see that there is a big spread in compound polarity over the reviews for each of the scores. This suggests that the sentiment in the reviews does not translate well to the score the reviewer gives. For example, a reviewer may write a review whose text appears to have a negative sentiment and still give a high score.

### Using TextBlob sentiment analyser

In [19]:
def text_polarity(x):
    """Return the TextBlob sentiment polarity of the text ``x``.

    Parameters
    ----------
    x : str
        Review text to score.

    Returns
    -------
    float
        The ``polarity`` component of ``TextBlob(x).sentiment``.
    """
    return TextBlob(x).sentiment.polarity

In [20]:
# Independent copy of the columns used for the TextBlob analysis
reviews_score_blob = proc_reviews.loc[
    :, ['score', 'preprocessed_text', 'lemmatized_text', 'helpfulness_ratio']
].copy()

In [21]:
# TextBlob polarity of each lemmatized review. Applying over the single text
# column avoids building a row Series per review, which
# DataFrame.apply(axis=1) does.
reviews_score_blob['polarity'] = reviews_score_blob['lemmatized_text'].apply(text_polarity)

In [22]:
# Summary statistics of the TextBlob polarity for each review score
reviews_score_blob.groupby('score')['polarity'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,36299.0,0.01626,0.247618,-1.0,-0.103016,0.024262,0.16,1.0
2,20803.0,0.103303,0.208165,-1.0,-0.001236,0.102778,0.219175,1.0
3,29771.0,0.1687,0.191347,-1.0,0.056071,0.162222,0.273297,1.0
4,56089.0,0.245293,0.191331,-1.0,0.125,0.230952,0.35,1.0
5,250902.0,0.310905,0.206345,-1.0,0.176264,0.299596,0.433333,1.0


In [40]:
# Same summary, restricted to reviews whose helpfulness_ratio is above 0.8
# (rows with a missing ratio compare as False and are excluded)
(
    reviews_score_blob
    .loc[reviews_score_blob['helpfulness_ratio'].gt(0.8)]
    .groupby('score')['polarity']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,9188.0,0.022093,0.226293,-1.0,-0.092171,0.031318,0.153899,1.0
2,5120.0,0.100469,0.195823,-1.0,0.0,0.101375,0.206705,1.0
3,7817.0,0.161625,0.184406,-0.8,0.054545,0.153274,0.261111,1.0
4,19050.0,0.23702,0.187182,-1.0,0.12,0.222462,0.339646,1.0
5,100249.0,0.295735,0.200479,-1.0,0.166667,0.28,0.41047,1.0
