#Predicting sentiment from product reviews

#Fire up Pandas

In [1]:
import pandas as pd

#Read some product review data

Loading reviews for a set of baby products. 

In [2]:
products = pd.read_csv('amazon_baby.csv')

#Let's explore this data together

Data includes the product name, the review text and the rating of the review. 

In [3]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


#Build the word count vector for each review

In [9]:
from collections import Counter
freq_count = []
for item in products['review']:
    count = Counter(str(item).split())
    freq_count.append(count)

In [11]:
freq_count
products['word_count'] = freq_count

In [12]:
products.head()

Unnamed: 0,name,review,rating,word_count
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,"{'These': 1, 'flannel': 1, 'wipes': 2, 'are': ..."
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"{'it': 2, 'came': 1, 'early': 1, 'and': 3, 'wa..."
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"{'Very': 1, 'soft': 1, 'and': 2, 'comfortable'..."
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"{'This': 1, 'is': 4, 'a': 2, 'product': 2, 'we..."
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"{'All': 1, 'of': 1, 'my': 1, 'kids': 2, 'have'..."


In [15]:
products['name'].value_counts()

Vulli Sophie the Giraffe Teether                                                                                       785
Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L                                                                    562
Infant Optics DXR-5 2.4 GHz Digital Video Baby Monitor with Night Vision                                               561
Baby Einstein Take Along Tunes                                                                                         547
Cloud b Twilight Constellation Night Light, Turtle                                                                     520
Fisher-Price Booster Seat, Blue/Green/Gray                                                                             489
Fisher-Price Rainforest Jumperoo                                                                                       450
Graco Nautilus 3-in-1 Car Seat, Matrix                                                                                 419
Leachco Snoogle 

#Examining the reviews for most-sold product:  'Vulli Sophie the Giraffe Teether'

In [16]:
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']

In [17]:
len(giraffe_reviews)

785

In [19]:
giraffe_reviews['rating'].value_counts()

5    535
4     95
3     62
1     56
2     37
Name: rating, dtype: int64

#Build a sentiment classifier

In [20]:
products['rating'].value_counts()

5    107054
4     33205
3     16779
1     15183
2     11310
Name: rating, dtype: int64

##Define what's a positive and a negative sentiment

We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment.  Reviews with a rating of 4 or higher will be considered positive, while the ones with rating of 2 or lower will have a negative sentiment.   

In [21]:
#ignore all 3* reviews
products = products[products['rating'] != 3]

In [22]:
#positive sentiment = 4* or 5* reviews
products['sentiment'] = products['rating'] >=4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
products.head()

Unnamed: 0,name,review,rating,word_count,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"{'it': 2, 'came': 1, 'early': 1, 'and': 3, 'wa...",True
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"{'Very': 1, 'soft': 1, 'and': 2, 'comfortable'...",True
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"{'This': 1, 'is': 4, 'a': 2, 'product': 2, 'we...",True
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"{'All': 1, 'of': 1, 'my': 1, 'kids': 2, 'have'...",True
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,"{'When': 1, 'the': 5, 'Binky': 3, 'Fairy': 3, ...",True


##Let's train the sentiment classifier

In [26]:
#train_data,test_data = products.random_split(.8, seed=0)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
dictVectorizer = DictVectorizer()
train_data_dict = dictVectorizer.fit_transform(products['word_count'])
x_train, x_test, y_train, y_test = train_test_split(train_data_dict, products.sentiment, test_size=0.2, random_state=0)

In [27]:
#sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                    # target='sentiment',
                                                    # features=['word_count'],
                                                    # validation_set=test_data)
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults

logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

#Evaluate the sentiment model

In [28]:
y_pred = logisticRegr.predict(x_test)

In [32]:
from sklearn import metrics
metrics.accuracy_score(y_pred,y_test) 

0.9296272975323079

In [33]:
from nltk import ConfusionMatrix 
print (ConfusionMatrix(list(y_pred), list(y_test)))

      |     F       |
      |     a     T |
      |     l     r |
      |     s     u |
      |     e     e |
------+-------------+
False | <3906>  891 |
 True |  1456<27098>|
------+-------------+
(row = reference; col = test)



In [34]:
# sensitivity:
print (metrics.recall_score(y_pred, y_test) )

0.94900889542621
