# Reading files and aggregating info with pandas

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('amazon_baby.csv')
df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [13]:
#groupby name and aggregate count and sort descending
df.groupby(by='name')['review'].count().sort_values(ascending=False)

name
Vulli Sophie the Giraffe Teether                                                    779
Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L                                 560
Infant Optics DXR-5 2.4 GHz Digital Video Baby Monitor with Night Vision            558
Baby Einstein Take Along Tunes                                                      545
Cloud b Twilight Constellation Night Light, Turtle                                  517
                                                                                   ... 
Baloon Decorating Strip                                                               0
Lamaze Play &amp; Grow, Torin the T-Rex                                               0
Tommee Tippee Explora Trainer Cup- Random Colors Pink, Purple                         0
Pacifiers Cool Stage 1 Twin Pack by Born Free                                         0
Complete Swimava Pool &amp; Ring Set (Free Shipping &amp; Free Reusable Diaper!)      0
Name: review, Length: 32417

# Word count processing with pandas

In [14]:
# define some key words for features
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']

In [56]:
def create_word_dictionary(review, selected_words):
    #row level counters
    row_word_dict = {}
    
    #init dictionary with 0 count
    for wd in selected_words:
        row_word_dict[wd] = 0
    
    #split sentence into words
    word_lst = review.split()
    
    #clean up punctuations
    for w in word_lst:
        w = w.strip()
        
    #create row count dictionary
    for word in word_lst:
        if word in row_word_dict:
            row_word_dict[word] += 1
            
    return row_word_dict

# parsing each review line to get word counts

In [59]:
#global cnt
global_word_dict = {}
tuple_lst = []

#init dictionary with 0 count
for wd in selected_words:
    global_word_dict[wd] = 0

#row level cnt
awesome_cnt_lst = []
amazing_cnt_lst = []
wow_cnt_lst = []
fantastic_cnt_lst = []
love_cnt_lst = []
great_cnt_lst = []
horrible_cnt_lst = []
bad_cnt_lst = []
terrible_cnt_lst = []
awful_cnt_lst = []
hate_cnt_lst = []

for review in df['review']:
    #clean words
    row_word_dict = create_word_dictionary(str(review), selected_words)
    
    #each row to add to both counts
    awesome_cnt_lst.append(row_word_dict['awesome'])
    amazing_cnt_lst.append(row_word_dict['amazing'])
    wow_cnt_lst.append(row_word_dict['wow'])
    fantastic_cnt_lst.append(row_word_dict['fantastic'])
    love_cnt_lst.append(row_word_dict['love'])
    great_cnt_lst.append(row_word_dict['great'])
    horrible_cnt_lst.append(row_word_dict['horrible'])
    bad_cnt_lst.append(row_word_dict['bad'])
    terrible_cnt_lst.append(row_word_dict['terrible'])
    awful_cnt_lst.append(row_word_dict['awful'])
    hate_cnt_lst.append(row_word_dict['hate'])
    
    #global_word_dict
    for key,val in row_word_dict.items():
        global_word_dict[key] += val
    
#only use selected_words to aggregate
for key,val in global_word_dict.items():
    tuple_lst.append((val,key))

print(tuple_lst)

[(1683, 'awesome'), (37056, 'great'), (807, 'fantastic'), (1164, 'amazing'), (33667, 'love'), (637, 'horrible'), (3599, 'bad'), (659, 'terrible'), (337, 'awful'), (54, 'wow'), (1089, 'hate')]


In [60]:
sorted(tuple_lst)

[(54, 'wow'),
 (337, 'awful'),
 (637, 'horrible'),
 (659, 'terrible'),
 (807, 'fantastic'),
 (1089, 'hate'),
 (1164, 'amazing'),
 (1683, 'awesome'),
 (3599, 'bad'),
 (33667, 'love'),
 (37056, 'great')]

# Adding extracted features data into dataframe

In [63]:
#put back these words as features columns
df['awesome'] = awesome_cnt_lst
df['amazing'] = amazing_cnt_lst
df['wow'] = wow_cnt_lst
df['fantastic'] = fantastic_cnt_lst
df['love'] = love_cnt_lst
df['great'] = great_cnt_lst
df['horrible'] = horrible_cnt_lst
df['bad'] = bad_cnt_lst
df['terrible'] = terrible_cnt_lst
df['awful'] = awful_cnt_lst
df['hate'] = hate_cnt_lst

df.tail()

Unnamed: 0,name,review,rating,awesome,amazing,wow,fantastic,love,great,horrible,bad,terrible,awful,hate
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5,0,0,0,0,0,2,0,0,0,0,0
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5,0,0,0,0,0,1,0,0,0,0,0
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5,0,0,0,0,0,2,0,0,0,0,0
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5,0,0,0,0,0,0,0,0,0,0,0
183530,Best 2 Pack Baby Car Shade for Kids - Window S...,I love this product very mush . I have bought ...,5,0,0,0,0,1,0,0,0,0,0,0


# train test split and training preps

In [64]:
#train test split 80%, 20%
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2)
print("Training data set has {} rows".format(len(df_train)))
print(df_train.head())

Training data set has 146824 rows
                                                     name  \
28099   DaVinci Kalani 4-in-1 Convertible Crib with To...   
32711   Medela Pump In Style 9 Volt Advanced Breastpum...   
20322   NUK Toddler Tooth and Gum Cleanser, 1.4 Ounce,...   
155609  The First Years 6 Count Take &amp; Toss Snack ...   
179351           Mud Pie Forest Animal Stroller Toys, Fox   

                                                   review  rating  awesome  \
28099   I have 3 of these cribs. Two of them have been...       5        0   
32711   I have no problems with this battery pack; wor...       5        0   
20322   My son is 15 months and has had teeth since we...       3        0   
155609  i DONT like the fact that they dont come with ...       4        0   
179351  This will work great on our stroller or travel...       5        0   

        amazing  wow  fantastic  love  great  horrible  bad  terrible  awful  \
28099         0    0          0     0      0      