In [134]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.feature_extraction import DictVectorizer
#from sklearn.metrics import mean_squared_error, r2_score
#from math import sqrt

# Read some product review data

In [38]:
products = pd.read_csv("amazon_baby.csv")

# Let's explore this data together

In [39]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


# Build the word count vector for each review

In [45]:
def word_count(sentence):
    word_count_dict = dict(Counter(word.lower() for word in nltk.word_tokenize(str(sentence))))
    return word_count_dict

In [43]:
products["word_count"] = products["review"].apply(word_count)

In [44]:
products.head(5)

Unnamed: 0,name,review,rating,word_count
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,"{'these': 1, 'flannel': 1, 'wipes': 3, 'are': ..."
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"{'it': 3, 'came': 1, 'early': 1, 'and': 3, 'wa..."
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"{'very': 1, 'soft': 1, 'and': 2, 'comfortable'..."
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"{'this': 4, 'is': 4, 'a': 2, 'product': 2, 'we..."
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"{'all': 2, 'of': 1, 'my': 1, 'kids': 2, 'have'..."


In [54]:
products.groupby("name").count().sort_values(by=['rating'], ascending=False).head(5)

Unnamed: 0_level_0,review,rating,word_count
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Vulli Sophie the Giraffe Teether,779,785,785
"Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L",560,562,562
Infant Optics DXR-5 2.4 GHz Digital Video Baby Monitor with Night Vision,558,561,561
Baby Einstein Take Along Tunes,545,547,547
"Cloud b Twilight Constellation Night Light, Turtle",517,520,520


## Explore Vulli Sophie

In [74]:
giraffe_reviews = products[products["name"] == "Vulli Sophie the Giraffe Teether"]
len(giraffe_reviews)

785

In [88]:
#giraffe_reviews.groupby("rating").count().sort_values(by=["name"], ascending=False)

giraffe_reviews_freq = giraffe_reviews.groupby(['rating', 'name']).size().reset_index(name='counts')
giraffe_reviews_freq

Unnamed: 0,rating,name,counts
0,1,Vulli Sophie the Giraffe Teether,56
1,2,Vulli Sophie the Giraffe Teether,37
2,3,Vulli Sophie the Giraffe Teether,62
3,4,Vulli Sophie the Giraffe Teether,95
4,5,Vulli Sophie the Giraffe Teether,535


# Build a sentiment classifier

In [106]:
products.groupby("rating").count()#.sort_values(by=['rating'], ascending=False).head(5)

Unnamed: 0_level_0,name,review,word_count
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,15167,15116,15183
2,11294,11277,11310
3,16757,16727,16779
4,33159,33099,33205
5,106836,106483,107054


## Define what's a positive and negative sentiment

In [107]:
# ignore all 3* review
products = products[products["rating"] != 3]

In [111]:
# posivite sentiment = 4* or 5* reviews
def sentiment(x):
    sentiment = 0
    if x >= 4:
        sentiment = 1
    return sentiment
products["sentiment"] = products["rating"].apply(sentiment)

In [112]:
products.head(5)

Unnamed: 0,name,review,rating,word_count,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"{'it': 3, 'came': 1, 'early': 1, 'and': 3, 'wa...",1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"{'very': 1, 'soft': 1, 'and': 2, 'comfortable'...",1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"{'this': 4, 'is': 4, 'a': 2, 'product': 2, 'we...",1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"{'all': 2, 'of': 1, 'my': 1, 'kids': 2, 'have'...",1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,"{'when': 2, 'the': 6, 'binky': 3, 'fairy': 3, ...",1


# Let's train the sentiment classifier

In [149]:
train_data, test_data = train_test_split(products, test_size=0.2, random_state=0)
dictVectorizer = DictVectorizer()
train_dict = dictVectorizer.fit_transform(train_data['word_count'])
test_dict = dictVectorizer.transform(test_data['word_count'])


In [143]:
sentiment_model = linear_model.LogisticRegression()
sentiment_model.fit(train_dict, train_data["sentiment"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Evaluate model metric

In [151]:
sentiment_model.predict(test_dict)


array([1, 1, 1, ..., 1, 0, 1])

# Apply the learned model to understand the sentiment for Giraffe

In [182]:
print(len(giraffe_reviews))
giraffe_reviews_dict = dictVectorizer.transform(giraffe_reviews['word_count'])
giraffe_predicted = sentiment_model.predict_proba(giraffe_reviews_dict)[:,1]
#giraffe_reviews["predicted_sentiment"] = sentiment_model.predict_log_proba(giraffe_reviews_dict)
print(len(giraffe_predicted))
#print(giraffe_predicted)
giraffe_reviews["predicted_sentiment"] = giraffe_predicted.tolist()

785
785


## Sort the review based on predicted sentiment and explore

In [170]:
giraffe_reviews = giraffe_reviews.sort_values(by=['predicted_sentiment'], ascending=False)
giraffe_reviews.head(5)

Unnamed: 0,name,review,rating,word_count,predicted_sentiment
34892,Vulli Sophie the Giraffe Teether,"Sophie, oh Sophie, your time has come. My gran...",5,"{'sophie': 17, ',': 8, 'oh': 1, 'your': 1, 'ti...",1.0
34434,Vulli Sophie the Giraffe Teether,My Mom-in-Law bought Sophie for my son when he...,5,"{'my': 7, 'mom-in-law': 1, 'bought': 1, 'sophi...",1.0
34442,Vulli Sophie the Giraffe Teether,"Yes, it\'s imported. Yes, it\'s expensive. And...",5,"{'yes': 3, ',': 14, 'it\': 4, ''s': 6, 'import...",1.0
34746,Vulli Sophie the Giraffe Teether,Sophie the Giraffe is the perfect teething toy...,5,"{'sophie': 5, 'the': 8, 'giraffe': 1, 'is': 7,...",1.0
34938,Vulli Sophie the Giraffe Teether,My nephews and my four kids all had Sophie in ...,5,"{'my': 2, 'nephews': 1, 'and': 4, 'four': 1, '...",1.0


In [180]:
print(giraffe_reviews["review"].iloc[0])
print(giraffe_reviews["review"].iloc[1])

Sophie, oh Sophie, your time has come. My granddaughter, Violet is 5 months old and starting to teeth. What joy little Sophie brings to Violet. Sophie is made of a very pliable rubber that is sturdy but not tough. It is quite easy for Violet to twist Sophie into unheard of positions to get Sophie into her mouth. The little nose and hooves fit perfectly into small mouths, and the drooling has purpose. The paint on Sophie is food quality.Sophie was born in 1961 in France. The maker had wondered why there was nothing available for babies and made Sophie from the finest rubber, phthalate-free on St Sophie\'s Day, thus the name was born. Since that time millions of Sophie\'s populate the world. She is soft and for babies little hands easy to grasp. Violet especially loves the bumpy head and horns of Sophie. Sophie has a long neck that easy to grasp and twist. She has lovely, sizable spots that attract Violet\'s attention. Sophie has happy little squeaks that bring squeals of delight from Vi

## Show most negative review

In [181]:
print(giraffe_reviews["review"].iloc[-1])
print(giraffe_reviews["review"].iloc[-2])

I wanted to love this product and was excited to buy it when I became pregnant but am now hesitant to let my baby use it after reading about the recall in Europe. Apparently, as I understand it, their toxin standards of measurement are lower than ours so they have not been recalled here (apparently we are OK with low levels of nitrates in the toys our children put in their mouths, but Europeans are not...hmmm)...Be that as it may, toxins registering even CLOSE to a dangerous level made me nervous about using. After digging around online I did discover that the company claims to have changed the product after a certain date and lists manufacturing codes so you can check yours (those listed were made after a certain date and are said to be safer). Sadly mine was not made after the &#34;improved&#34; date but I could not return it because there was no formal recall in our country. I considered returning it and hunting for one with an approved manufacturing date but man that was just too m

# Assignment :

In [185]:
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']


In [197]:
def awesome_word_count(_dict):
    word = "awesome"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def great_word_count(_dict):
    word = "great"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count
    
def fantastic_word_count(_dict):
    word = "fantastic"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def amazing_word_count(_dict):
    word = "amazing"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def love_word_count(_dict):
    word = "love"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def horrible_word_count(_dict):
    word = "horrible"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def awful_word_count(_dict):
    word = "awful"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def terrible_word_count(_dict):
    word = "terrible"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def wow_word_count(_dict):
    word = "wow"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def hate_word_count(_dict):
    word = "hate"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

def bad_word_count(_dict):
    word = "bad"
    count = 0
    if word in _dict:
        count = _dict[word]
    return count

In [200]:
products['awesome'] = products['word_count'].apply(awesome_word_count)
products['great'] = products['word_count'].apply(great_word_count)
products['fantastic'] = products['word_count'].apply(fantastic_word_count)
products['amazing'] = products['word_count'].apply(amazing_word_count)
products['love'] = products['word_count'].apply(love_word_count)
products['horrible'] = products['word_count'].apply(horrible_word_count)
products['bad'] = products['word_count'].apply(bad_word_count)
products['terrible'] = products['word_count'].apply(terrible_word_count)
products['awful'] = products['word_count'].apply(awful_word_count)
products['wow'] = products['word_count'].apply(wow_word_count)
products['hate'] = products['word_count'].apply(hate_word_count)
products.head(5)

Unnamed: 0,name,review,rating,word_count,sentiment,awesome,great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,"{'it': 3, 'came': 1, 'early': 1, 'and': 3, 'wa...",1,0,0,0,0,1,0,0,0,0,0,0
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,"{'very': 1, 'soft': 1, 'and': 2, 'comfortable'...",1,0,0,0,0,0,0,0,0,0,0,0
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,"{'this': 4, 'is': 4, 'a': 2, 'product': 2, 'we...",1,0,0,0,0,2,0,0,0,0,0,0
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,"{'all': 2, 'of': 1, 'my': 1, 'kids': 2, 'have'...",1,0,1,0,0,0,0,0,0,0,0,0
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,"{'when': 2, 'the': 6, 'binky': 3, 'fairy': 3, ...",1,0,1,0,0,0,0,0,0,0,0,0


In [204]:
products[selected_words].sum(axis=0)

awesome       3810
great        54807
fantastic     1631
amazing       2575
love         41679
horrible      1091
bad           4086
terrible      1114
awful          665
wow            407
hate          1094
dtype: int64

In [207]:
selected_featutes_product = products[selected_words]

In [208]:
selected_featutes_train_data, selected_featutes_test_data = train_test_split(products, test_size=0.2, random_state=0)

In [263]:
selected_featutes_sentiment_model = linear_model.LogisticRegression()
selected_featutes_sentiment_model.fit(selected_featutes_train_data[selected_words], selected_featutes_train_data["sentiment"])
print(selected_featutes_sentiment_model.coef_)

[[ 1.16773792  0.86376671  0.93764443  1.07139851  1.3894016  -2.27795248
  -0.98855992 -2.20591631 -2.07791281 -0.0734577  -1.42913533]]


In [265]:
print(selected_featutes_sentiment_model.intercept_)
#selected_featutes_sentiment_model.coef_[0].sort()
print(selected_featutes_sentiment_model.coef_)

[1.34220025]
[[ 1.16773792  0.86376671  0.93764443  1.07139851  1.3894016  -2.27795248
  -0.98855992 -2.20591631 -2.07791281 -0.0734577  -1.42913533]]


In [242]:
print(selected_words)
horrible = -2.27795248
love = 1.3894016

['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']


# diaper_champ_reviews

In [243]:
diaper_champ_reviews = products[products["name"] == "Baby Trend Diaper Champ"]

In [247]:
selected_featutes_sentiment_model.predict_proba(diaper_champ_reviews[selected_words][0:1])[:,1]

array([0.79285154])

# Most positive review for ‘Baby Trend Diaper Champ’ according to the sentiment_model 

In [248]:
diaper_champ_reviews_dict = dictVectorizer.transform(diaper_champ_reviews['word_count'])
diaper_champ_reviews_predicted = sentiment_model.predict_proba(diaper_champ_reviews_dict)[:,1]
diaper_champ_reviews["predicted_sentiment"] = diaper_champ_reviews_predicted.tolist()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [249]:
diaper_champ_reviews = diaper_champ_reviews.sort_values(by=['predicted_sentiment'], ascending=False)
diaper_champ_reviews.head(5)

Unnamed: 0,name,review,rating,word_count,sentiment,awesome,great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate,predicted_sentiment
376,Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper pa...",5,"{'this': 3, 'is': 10, 'absolutely': 1, ',': 10...",1,0,0,0,0,2,0,0,0,0,0,0,1.0
320,Baby Trend Diaper Champ,I originally put this item on my baby registry...,5,"{'i': 10, 'originally': 1, 'put': 1, 'this': 4...",1,0,0,0,0,0,0,0,0,0,0,0,1.0
414,Baby Trend Diaper Champ,We have been using our Diaper Champ for almost...,5,"{'we': 8, 'have': 3, 'been': 2, 'using': 1, 'o...",1,0,0,0,0,0,0,0,0,0,0,0,1.0
571,Baby Trend Diaper Champ,We did alot of research on diaper pails before...,2,"{'we': 11, 'did': 1, 'alot': 1, 'of': 5, 'rese...",0,0,0,0,0,0,0,0,0,0,0,0,1.0
451,Baby Trend Diaper Champ,"As a first time mother, I wanted to get the be...",5,"{'as': 1, 'a': 5, 'first': 1, 'time': 1, 'moth...",1,0,0,0,0,1,0,0,0,0,0,0,1.0


In [250]:
print(diaper_champ_reviews["review"].iloc[0])

This is absolutely, by far, the best diaper pail money can buy.  Never do we detect a diaper odor (and my husband has a very sensitive sense of smell and is usually very quick to complain about such things).  For those who say they have a problem with the Diaper Champ getting stuck...the ONLY time this ever happens to us is when the bag is full and needs to be changed.  We love that it uses regular kitchen trash bags, makes it much more economical.  We have not found that we need to worry about frequent emptying or cleaning.  We just leave the Champ to do its job until the mechanism begins to feel like it\'s getting stuck...then we change the bag.  For us this means about once a week.  Not only is the Champ EASY to use, it\'s kind of fun.  Before our daughter was born we really worried about whether the diaper pail we chose would be effective enough for us because my husband is so sensitive to smells.  But she\'s two months old now and we still just can\'t say enough good things about 

In [251]:
print(diaper_champ_reviews["review"].iloc[-1])

This is the worst diaper pail ever!  It was great for the first couple weeks, but then it began to smell.  I cleaned the inside and outside with 409 again and again, but the smell would return with the first diaper I put in.  Then to make matters worse my 16 month old figured out it was fun to put toys in the hole where you put the diapers and try to flip it the way I did with his dirty diapers.  I had a Diaper Genie that broke and was hopeful that this less complicated diaper disposal system would work as well, but it does not!


# Accuracy

In [252]:
sentiment_model_score = sentiment_model.score(test_dict, test_data["sentiment"])
print(sentiment_model_score)

0.9313363917123925


In [253]:
selected_featutes_sentiment_model_score = selected_featutes_sentiment_model.score(selected_featutes_test_data[selected_words], selected_featutes_test_data["sentiment"])
print(selected_featutes_sentiment_model_score)


0.564660729813199


In [254]:


sentiment_model.predict_proba(diaper_champ_reviews[selected_words][0:1])[:,1]

ValueError: X has 11 features per sample; expecting 116995

In [257]:
print(test_data.count())
test_data.groupby("name").count().sort_values(by=['rating'], ascending=False).head(5)

name          33301
review        33191
rating        33351
word_count    33351
sentiment     33351
dtype: int64


Unnamed: 0_level_0,review,rating,word_count,sentiment
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Vulli Sophie the Giraffe Teether,146,148,148,148
Baby Einstein Take Along Tunes,107,107,107,107
"Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L",105,106,106,106
"Fisher-Price Booster Seat, Blue/Green/Gray",98,98,98,98
Fisher-Price Rainforest Jumperoo,94,94,94,94


In [258]:
test_data[test_data["rating"] >= 4].count()

name          27941
review        27848
rating        27989
word_count    27989
sentiment     27989
dtype: int64

In [267]:
27941/33301

0.8390438725563797

In [264]:
coef_dict = {}
for coef, feat in zip(selected_featutes_sentiment_model.coef_[0],selected_words):
    coef_dict[feat] = coef
    
print(coef_dict)

{'awesome': 1.1677379241165828, 'great': 0.8637667064569866, 'fantastic': 0.937644430430786, 'amazing': 1.0713985121349374, 'love': 1.389401598242565, 'horrible': -2.2779524796018924, 'bad': -0.9885599155292636, 'terrible': -2.2059163055521256, 'awful': -2.0779128090486285, 'wow': -0.07345769625171285, 'hate': -1.429135332203809}
