In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('amazon_alexa.tsv', sep = '\t')

In [3]:
data

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


In [4]:
data.columns

Index(['rating', 'date', 'variation', 'verified_reviews', 'feedback'], dtype='object')

In [5]:
data['feedback'].value_counts()

1    2893
0     257
Name: feedback, dtype: int64

In [6]:
data.corr()

Unnamed: 0,rating,feedback
rating,1.0,0.861968
feedback,0.861968,1.0


In [7]:
data = data[['verified_reviews', 'rating']]

In [8]:
data

Unnamed: 0,verified_reviews,rating
0,Love my Echo!,5
1,Loved it!,5
2,"Sometimes while playing a game, you can answer...",4
3,I have had a lot of fun with this thing. My 4 ...,5
4,Music,5
...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",5
3146,"Listening to music, searching locations, check...",5
3147,"I do love these things, i have them running my...",5
3148,Only complaint I have is that the sound qualit...,5


In [9]:
import nltk

In [10]:
from nltk import word_tokenize

In [11]:
from nltk.corpus import stopwords

In [12]:
stopwords_list = set(stopwords.words('English'))

In [13]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [14]:
ps = PorterStemmer()
lmt = WordNetLemmatizer()

In [15]:
import re

In [16]:
clean_reviews_list = []

In [17]:
words = ['alexa', 'bluetooth', 'cellphone', 'ipad', 'bedroom', 'house', 'home', 'light', 'dinosaur', 'amazon', 'spotify',
        'computer', 'kitchen', 'daughter', 'father', 'mother', 'sister', 'brother', 'elder', 'ship', 'product', 'plus' 'minus'
        'household', 'new', 'member', 'color', 'wife', 'st', 'etc', 'tube', 'family', 'prime', 'day']
for each_review in range(len(data)):
    current_review = data['verified_reviews'].values[each_review]
    current_review = re.sub('[^a-zA-Z]', ' ', current_review)
    list_of_words = word_tokenize(current_review)
    clean_review = ''
    for each_word in list_of_words:
        each_word = each_word.lower()
        if not each_word in stopwords_list:
            if not each_word in words:
                each_word = ps.stem(each_word)
                clean_review = clean_review + ' ' + each_word
    clean_reviews_list.append(clean_review)

In [18]:
clean_reviews_list

[' love echo',
 ' love',
 ' sometim play game answer question correctli say got wrong answer like abl turn light away',
 ' lot fun thing yr old learn dinosaur control light play game like categori nice sound play music well',
 ' music',
 ' receiv echo gift need anoth someth play music easili access found smart speaker wait see els',
 ' without use mani featur see use great alarm u r almost deaf hear alarm live room reason enough keep fun ask random question hear respons seem smartbon polit yet',
 ' think th one purchas work get one everi room realli like featur offer specifili play music echo control light throughout',
 ' look great',
 ' love listen song heard sinc childhood get news weather inform great',
 ' sent year old dad talk constantli',
 ' love learn knew thing eveyday still figur everyth work far easi use understand make laugh time',
 ' purchas knee problem give someth tri come get around fast like enjoy littl big thing play song time cook',
 ' love love love',
 ' expect',
 ' 

In [19]:
data['clean_review'] = clean_reviews_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['clean_review'] = clean_reviews_list


In [20]:
data

Unnamed: 0,verified_reviews,rating,clean_review
0,Love my Echo!,5,love echo
1,Loved it!,5,love
2,"Sometimes while playing a game, you can answer...",4,sometim play game answer question correctli s...
3,I have had a lot of fun with this thing. My 4 ...,5,lot fun thing yr old learn dinosaur control l...
4,Music,5,music
...,...,...,...
3145,"Perfect for kids, adults and everyone in betwe...",5,perfect kid adult everyon
3146,"Listening to music, searching locations, check...",5,listen music search locat check time look wea...
3147,"I do love these things, i have them running my...",5,love thing run entir tv light thermostat fron...
3148,Only complaint I have is that the sound qualit...,5,complaint sound qualiti great mostli use comm...


In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
le = LabelEncoder()

In [23]:
data['rating'] = le.fit_transform(data['rating'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating'] = le.fit_transform(data['rating'])


In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
cv = CountVectorizer()

In [26]:
cv.fit(data['clean_review'])

CountVectorizer()

In [27]:
sparse_matrix = cv.transform(data['clean_review'])

In [28]:
sparse_matrix

<3150x2750 sparse matrix of type '<class 'numpy.int64'>'
	with 33029 stored elements in Compressed Sparse Row format>

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
x_train, x_test, y_train, y_test = train_test_split(sparse_matrix, data['rating'], random_state = 0, test_size = 0.20)

In [31]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((2520, 2750), (630, 2750), (2520,), (630,))

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
nb = MultinomialNB()

In [34]:
nb.fit(x_train, y_train)

MultinomialNB()

In [35]:
y_pred = nb.predict(x_test)

In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [37]:
accuracy_score(y_test, y_pred)

0.7428571428571429

In [38]:
confusion_matrix(y_test, y_pred)

array([[ 12,   2,   1,   5,  13],
       [  1,   0,   0,   9,  11],
       [  2,   0,   5,   9,  23],
       [  1,   0,   0,  18,  71],
       [  0,   1,   0,  13, 433]], dtype=int64)