## Sentiment Analysis on Amazon Musical Instrument Reviews

In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Location of the reviews CSV, relative to the notebook's working directory.
REVIEWS_CSV = '../Documents/Musical_instruments_reviews.csv'

# Read the full review table and preview its first rows.
data = pd.read_csv(REVIEWS_CSV)
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [3]:
# Build the working frame: the first 20 reviews, restricted to the columns
# selected below. The explicit copy makes `train` own its data independently
# of `data`.
REVIEW_COLUMNS = ['reviewerID', 'reviewText', 'overall', 'summary']
train = data[REVIEW_COLUMNS].iloc[:20].copy()

In [4]:
# Preview the first five rows of the working frame.
train.head()

Unnamed: 0,reviewerID,reviewText,overall,summary
0,A2IBPI20UZIR0U,"Not much to write about here, but it does exac...",5.0,good
1,A14VAT5EAX3D9S,The product does exactly as it should and is q...,5.0,Jake
2,A195EZSQDW3E21,The primary job of this device is to block the...,5.0,It Does The Job Well
3,A2C00NNG1ZQQG2,Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY
4,A94QU4C90B1AX,This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.


In [5]:
# Count the English stopwords in each review and store the count in a new
# 'stopwords' column.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

# Membership tests against a set are constant-time; scanning the `stop` list
# for every token is not. `stop` itself stays a list.
stop_lookup = set(stop)

# NOTE(review): the match is case-sensitive and this cell runs before the
# lower-casing cell, so capitalised tokens such as "The" are presumably not
# counted - confirm whether that is intended.
train['stopwords'] = train['reviewText'].apply(
    lambda review: sum(token in stop_lookup for token in review.split())
)
train[['reviewText','stopwords']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trmcg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,reviewText,stopwords
0,"Not much to write about here, but it does exac...",24
1,The product does exactly as it should and is q...,47
2,The primary job of this device is to block the...,29
3,Nice windscreen protects my MXL mic and preven...,14
4,This pop filter is great. It looks and perform...,10


In [6]:
# Lower-case every review. Splitting on whitespace and re-joining with single
# spaces also collapses any runs of whitespace.
def _lowercase_tokens(text):
    """Return `text` with each whitespace-separated token lower-cased, joined by single spaces."""
    return " ".join(token.lower() for token in text.split())

train['reviewText'] = train['reviewText'].apply(_lowercase_tokens)
train['reviewText'].head()

0    not much to write about here, but it does exac...
1    the product does exactly as it should and is q...
2    the primary job of this device is to block the...
3    nice windscreen protects my mxl mic and preven...
4    this pop filter is great. it looks and perform...
Name: reviewText, dtype: object

In [7]:
# Strip punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s).
# regex=True is required: since pandas 2.0, Series.str.replace defaults to
# regex=False and would search for the literal text '[^\w\s]', leaving the
# reviews unchanged. The raw string avoids invalid-escape warnings for \w, \s.
train['reviewText'] = train['reviewText'].str.replace(r'[^\w\s]', '', regex=True)
train['reviewText'].head()

0    not much to write about here but it does exact...
1    the product does exactly as it should and is q...
2    the primary job of this device is to block the...
3    nice windscreen protects my mxl mic and preven...
4    this pop filter is great it looks and performs...
Name: reviewText, dtype: object

In [8]:
# Drop English stopwords from each review and re-join the remaining tokens
# with single spaces.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

# Set lookup instead of scanning the `stop` list once per token.
stop_lookup = set(stop)

# NOTE(review): punctuation was stripped in the previous cell, so contractions
# now appear without apostrophes (e.g. "dont") and presumably no longer match
# their apostrophised stopword forms - confirm whether they should be removed.
train['reviewText'] = train['reviewText'].apply(
    lambda review: " ".join(token for token in review.split() if token not in stop_lookup)
)
train['reviewText'].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trmcg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    much write exactly supposed filters pop sounds...
1    product exactly quite affordablei realized dou...
2    primary job device block breath would otherwis...
3    nice windscreen protects mxl mic prevents pops...
4    pop filter great looks performs like studio fi...
Name: reviewText, dtype: object

In [9]:
# Pool every token from all reviews, count occurrences, and keep the ten
# highest counts (value_counts sorts in descending order).
all_tokens = ' '.join(train['reviewText']).split()
freq = pd.Series(all_tokens).value_counts().head(10)
freq

cables     16
cable      16
use        10
guitar      9
monster     8
great       8
pedal       8
bought      6
used        6
waves       6
dtype: int64

In [10]:
# Remove the most frequent words (the index of `freq`) from every review.
# The words go into their own name instead of rebinding `freq`: the original
# turned `freq` from a Series into a list, so running this cell a second time
# failed on `freq.index`. A set also gives constant-time membership tests.
frequent_words = set(freq.index)
train['reviewText'] = train['reviewText'].apply(
    lambda review: " ".join(token for token in review.split() if token not in frequent_words)
)
train['reviewText'].head()

0    much write exactly supposed filters pop sounds...
1    product exactly quite affordablei realized dou...
2    primary job device block breath would otherwis...
3    nice windscreen protects mxl mic prevents pops...
4    pop filter looks performs like studio filter y...
Name: reviewText, dtype: object

In [11]:
# Recount tokens over the current review text and keep the last ten entries
# of the descending count table, i.e. ten of the least frequent words.
remaining_tokens = ' '.join(train['reviewText']).split()
freq = pd.Series(remaining_tokens).value_counts().tail(10)
freq

issues         1
description    1
break          1
reliable       1
friend         1
want           1
hold           1
unshielded     1
sake           1
length         1
dtype: int64

In [12]:
# Remove the rare words (the index of `freq`) from every review.
# The words go into their own name instead of rebinding `freq`: the original
# turned `freq` from a Series into a list, so running this cell a second time
# failed on `freq.index`. A set also gives constant-time membership tests.
rare_words = set(freq.index)
train['reviewText'] = train['reviewText'].apply(
    lambda review: " ".join(token for token in review.split() if token not in rare_words)
)
train['reviewText'].head()

0    much write exactly supposed filters pop sounds...
1    product exactly quite affordablei realized dou...
2    primary job device block breath would otherwis...
3    nice windscreen protects mxl mic prevents pops...
4    pop filter looks performs like studio filter y...
Name: reviewText, dtype: object

In [13]:
# Preview TextBlob's spelling correction on the first 20 reviews.
# The result is only displayed; `train` is not modified.
from textblob import TextBlob

def _spell_correct(text):
    """Return TextBlob's spelling-corrected version of `text` as a plain string."""
    return str(TextBlob(text).correct())

train['reviewText'].head(20).apply(_spell_correct)

0     much write exactly supposed filters pop sounds...
1     product exactly quite affordablei realized dou...
2     primary job device block breath would otherwis...
3     nice windscreen protects mal mid prevents pips...
4     pop filter looks perform like studio filter yo...
5     good another one love heavy cord gold connecti...
6     years good reason lifetime warranty worth pric...
7     run output chain input tender amp hook board t...
8     perfect epiphany sheraton ii well constructed ...
9     makes best lifetime warranty doesn hurt either...
10    makes wide array including high end initially ...
11    got needed found dont really need often rarely...
12    using large sustaining playing piano may appea...
13       love omaha ypt230 works would recommend anyone
14              home studio control mid keyboard wanted
15    keyboard want really aware option keyboard med...
16    tender perfect sometimes find bit long dont mi...
17    wanted looks alone nice looking cord know 

In [14]:
# Score the first 20 reviews with TextBlob; each result is the review's
# `sentiment` value as returned by TextBlob.
def _sentiment_of(text):
    """Return the TextBlob sentiment of `text`."""
    return TextBlob(text).sentiment

train['reviewText'].head(20).apply(_sentiment_of)

0     (0.32499999999999996, 0.44166666666666665)
1     (0.014285714285714282, 0.2928571428571428)
2                   (0.1675, 0.4342857142857143)
3                    (0.3333333333333333, 0.875)
4                                     (0.0, 0.0)
5                    (0.125, 0.5444444444444445)
6       (0.3333333333333333, 0.3523809523809524)
7     (0.006540404040404044, 0.3160606060606061)
8                                     (0.5, 0.5)
9                    (0.21250000000000002, 0.55)
10     (0.05518518518518519, 0.5247530864197532)
11                    (0.44999999999999996, 0.4)
12    (-0.19107142857142856, 0.6428571428571429)
13                                    (0.5, 0.6)
14                                    (0.0, 0.0)
15                            (0.31875, 0.36875)
16                   (0.2888888888888889, 0.575)
17                    (0.43333333333333335, 0.6)
18    (0.07537619047619047, 0.49598095238095236)
19                    (0.45, 0.6444444444444444)
Name: reviewText, dt

#### *Only one of the 20 reviews scored above (index 12, polarity ≈ -0.19) has a negative polarity; the rest range from neutral (0.0) to positive, so this sample is overwhelmingly neutral-to-positive.*