# Reading files and aggregating info with pandas

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('amazon_baby.csv')
df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [13]:
#groupby name and aggregate count and sort descending
df.groupby(by='name')['review'].count().sort_values(ascending=False)

name
Vulli Sophie the Giraffe Teether                                                    779
Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L                                 560
Infant Optics DXR-5 2.4 GHz Digital Video Baby Monitor with Night Vision            558
Baby Einstein Take Along Tunes                                                      545
Cloud b Twilight Constellation Night Light, Turtle                                  517
                                                                                   ... 
Baloon Decorating Strip                                                               0
Lamaze Play &amp; Grow, Torin the T-Rex                                               0
Tommee Tippee Explora Trainer Cup- Random Colors Pink, Purple                         0
Pacifiers Cool Stage 1 Twin Pack by Born Free                                         0
Complete Swimava Pool &amp; Ring Set (Free Shipping &amp; Free Reusable Diaper!)      0
Name: review, Length: 32417

# Word count processing with pandas

In [14]:
# define some key words for features
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

In [56]:
def create_word_dictionary(review, selected_words):
    #row level counters
    row_word_dict = {}
    
    #init dictionary with 0 count
    for wd in selected_words:
        row_word_dict[wd] = 0
    
    #split sentence into words
    word_lst = review.split()
    
    #clean up punctuations
    for w in word_lst:
        w = w.strip()
        
    #create row count dictionary
    for word in word_lst:
        if word in row_word_dict:
            row_word_dict[word] += 1
            
    return row_word_dict

# parsing each review line to get word counts

In [59]:
#global cnt
global_word_dict = {}
tuple_lst = []

#init dictionary with 0 count
for wd in selected_words:
    global_word_dict[wd] = 0

#row level cnt
awesome_cnt_lst = []
amazing_cnt_lst = []
wow_cnt_lst = []
fantastic_cnt_lst = []
love_cnt_lst = []
great_cnt_lst = []
horrible_cnt_lst = []
bad_cnt_lst = []
terrible_cnt_lst = []
awful_cnt_lst = []
hate_cnt_lst = []

for review in df['review']:
    #clean words
    row_word_dict = create_word_dictionary(str(review), selected_words)
    
    #each row to add to both counts
    awesome_cnt_lst.append(row_word_dict['awesome'])
    amazing_cnt_lst.append(row_word_dict['amazing'])
    wow_cnt_lst.append(row_word_dict['wow'])
    fantastic_cnt_lst.append(row_word_dict['fantastic'])
    love_cnt_lst.append(row_word_dict['love'])
    great_cnt_lst.append(row_word_dict['great'])
    horrible_cnt_lst.append(row_word_dict['horrible'])
    bad_cnt_lst.append(row_word_dict['bad'])
    terrible_cnt_lst.append(row_word_dict['terrible'])
    awful_cnt_lst.append(row_word_dict['awful'])
    hate_cnt_lst.append(row_word_dict['hate'])
    
    #global_word_dict
    for key,val in row_word_dict.items():
        global_word_dict[key] += val
    
#only use selected_words to aggregate
for key,val in global_word_dict.items():
    tuple_lst.append((val,key))

print(tuple_lst)

[(1683, 'awesome'), (37056, 'great'), (807, 'fantastic'), (1164, 'amazing'), (33667, 'love'), (637, 'horrible'), (3599, 'bad'), (659, 'terrible'), (337, 'awful'), (54, 'wow'), (1089, 'hate')]


In [60]:
sorted(tuple_lst)

[(54, 'wow'),
 (337, 'awful'),
 (637, 'horrible'),
 (659, 'terrible'),
 (807, 'fantastic'),
 (1089, 'hate'),
 (1164, 'amazing'),
 (1683, 'awesome'),
 (3599, 'bad'),
 (33667, 'love'),
 (37056, 'great')]

# Adding extracted features data into dataframe

In [63]:
#put back these words as features columns
df['awesome'] = awesome_cnt_lst
df['amazing'] = amazing_cnt_lst
df['wow'] = wow_cnt_lst
df['fantastic'] = fantastic_cnt_lst
df['love'] = love_cnt_lst
df['great'] = great_cnt_lst
df['horrible'] = horrible_cnt_lst
df['bad'] = bad_cnt_lst
df['terrible'] = terrible_cnt_lst
df['awful'] = awful_cnt_lst
df['hate'] = hate_cnt_lst

df.tail()

Unnamed: 0,name,review,rating,awesome,amazing,wow,fantastic,love,great,horrible,bad,terrible,awful,hate
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5,0,0,0,0,0,2,0,0,0,0,0
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5,0,0,0,0,0,1,0,0,0,0,0
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5,0,0,0,0,0,2,0,0,0,0,0
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5,0,0,0,0,0,0,0,0,0,0,0
183530,Best 2 Pack Baby Car Shade for Kids - Window S...,I love this product very mush . I have bought ...,5,0,0,0,0,1,0,0,0,0,0,0


# train test split and training preps

In [66]:
#get rid of rating of 3 as they are neutral
df = df[df['rating'] != 3]
len(df)

166752

In [67]:
#create new sentiment column and assign binary value
df['sentiment'] = df['rating'] >= 4
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,name,review,rating,awesome,amazing,wow,fantastic,love,great,horrible,bad,terrible,awful,hate,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,0,0,0,0,1,0,0,0,0,0,0,True
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,0,0,0,0,0,0,0,0,0,0,0,True
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,0,0,0,0,2,0,0,0,0,0,0,True
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,0,0,0,0,0,1,0,0,0,0,0,True
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,0,0,0,0,0,1,0,0,0,0,0,True


In [72]:
#bad sentiments examples
df[df['rating']==2].head()

Unnamed: 0,name,review,rating,awesome,amazing,wow,fantastic,love,great,horrible,bad,terrible,awful,hate,sentiment
21,Nature\'s Lullabies Second Year Sticker Calendar,I only purchased a second-year calendar for my...,2,0,0,0,0,0,0,0,0,0,0,0,False
41,"SoftPlay Giggle Jiggle Funbook, Happy Bear",This bear is absolutely adorable and I would g...,2,0,0,0,0,0,0,0,0,0,0,0,False
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,0,0,0,0,0,0,0,0,0,0,0,False
78,Cloth Diaper Pins Stainless Steel Traditional ...,These were good quality--worked fine--heavy d...,2,0,0,0,0,0,0,0,0,0,0,0,False
80,Cloth Diaper Pins Stainless Steel Traditional ...,"While the diaper pins are attractive, the meta...",2,0,0,0,0,0,0,0,0,0,0,0,False


In [73]:
#train test split 80%, 20%
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2)
print("Training data set has {} rows".format(len(df_train)))
df_train.head()

Training data set has 133401 rows


Unnamed: 0,name,review,rating,awesome,amazing,wow,fantastic,love,great,horrible,bad,terrible,awful,hate,sentiment
87571,Fisher-Price Precious Planet Blue Sky Jumperoo,"Well not really, but you get the idea. Without...",5,0,0,0,0,0,0,0,0,0,0,0,True
70029,Boppy Changing Pad Cover with Waterproof Liner...,I love this changeing pad cover. It is so sof...,5,0,0,0,0,2,1,0,0,0,0,0,True
65566,Snappi Cloth Diaper Fasteners - Pack of 3 (Min...,This is the ideal accessory for cloth diaperin...,4,0,0,0,0,0,0,0,0,0,0,0,True
106087,"Luvable Friends 6 Pack Baby Burp Cloths, Blue","these worked great when my son was a newborn, ...",4,0,0,0,0,0,1,0,0,0,0,0,True
87900,Little Tikes Cell Phone and Key Chain,We bought this for our daughter when she was a...,4,0,0,0,0,0,0,0,0,0,0,0,True


# Using logistic regression model for classification problem

In [74]:
from sklearn import linear_model
logistic_regression_model = linear_model.LogisticRegression()

In [75]:
#fit the training data using features against sentiment
logistic_regression_model.fit(df_train[selected_words], df_train['sentiment'])

LogisticRegression()

In [76]:
#mean accuracy
logistic_regression_model.score(df_train[selected_words], df_train['sentiment'])

0.8431645939685609

In [78]:
#the word weighting scores
print(selected_words)
logistic_regression_model.coef_

['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']


array([[ 0.9230992 ,  0.74944608,  0.93875894,  0.92433055,  1.34518146,
        -1.82967876, -0.9241851 , -1.79747549, -1.57020338, -0.37673419,
        -1.48006213]])