In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/content/AllProductReviews.csv.zip')
X = df['ReviewTitle']
X

0                   Honest review of an edm music lover\n
1                   Unreliable earphones with high cost\n
2                              Really good and durable.\n
3                       stopped working in just 14 days\n
4        Just Awesome Wireless Headphone under 1000...😉\n
                               ...                       
14332                                              Good\n
14333                                   Amazing Product\n
14334                                           Not bad\n
14335                                    a good product\n
14336             Average headphones , n overrated name\n
Name: ReviewTitle, Length: 14337, dtype: object

In [3]:
df['Sentiment'] = df['ReviewStar'].apply(lambda x: 1 if x>2 else 0)
Y = df['Sentiment']
Y

0        1
1        0
2        1
3        0
4        1
        ..
14332    1
14333    1
14334    0
14335    1
14336    0
Name: Sentiment, Length: 14337, dtype: int64

In [4]:
X_train = X[:10000]
Y_train = Y[:10000]

X_train, Y_train

(0                  Honest review of an edm music lover\n
 1                  Unreliable earphones with high cost\n
 2                             Really good and durable.\n
 3                      stopped working in just 14 days\n
 4       Just Awesome Wireless Headphone under 1000...😉\n
                               ...                       
 9995                           Not as good as expected\n
 9996                                        Nice one..\n
 9997                                   Worth the price\n
 9998                                         Defective\n
 9999                            Improve reinforcements\n
 Name: ReviewTitle, Length: 10000, dtype: object, 0       1
 1       0
 2       1
 3       0
 4       1
        ..
 9995    1
 9996    1
 9997    1
 9998    0
 9999    1
 Name: Sentiment, Length: 10000, dtype: int64)

In [5]:
X_test = X[10000:]
Y_test = Y[10000:]

X_test, Y_test

(10000                                         Four Stars\n
 10001                               Built Quality is bad\n
 10002    Has been using for kast one year and Happy wit...
 10003                            Good for listening song\n
 10004                                           Must buy\n
                                ...                        
 14332                                               Good\n
 14333                                    Amazing Product\n
 14334                                            Not bad\n
 14335                                     a good product\n
 14336              Average headphones , n overrated name\n
 Name: ReviewTitle, Length: 4337, dtype: object, 10000    1
 10001    0
 10002    1
 10003    1
 10004    1
         ..
 14332    1
 14333    1
 14334    0
 14335    1
 14336    0
 Name: Sentiment, Length: 4337, dtype: int64)

#Data Cleaning

In [6]:
from nltk.tokenize import RegexpTokenizer

In [7]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
tokenizer = RegexpTokenizer(r'\w+')
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [10]:
def getCleanedText(text):
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    new_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(tokens) for tokens in new_tokens]
    
    clean_text = " ".join(stemmed_tokens)
    
    return clean_text

In [11]:
x_clean = [getCleanedText(i) for i in X_train]
xt_clean = [getCleanedText(i) for i in X_test]

In [12]:
x_clean

['honest review edm music lover',
 'unreli earphon high cost',
 'realli good durabl',
 'stop work 14 day',
 'awesom wireless headphon 1000',
 'charg port work',
 'love color sound',
 'great sound worst mic',
 'batteri life good enough keep go day',
 'bad durab',
 'run becom comfort',
 'disappoint',
 'extrem good bt earphon loud super bass',
 'product good qualiti',
 'expect 0th day purchas',
 'good earphon good sound qualiti build qualiti',
 'good sound bad design',
 'good product decent pro',
 'averag product buy',
 'overal good worth',
 'budget friendli',
 'receiv product one side earphon work',
 'satisfi total',
 'buy',
 'unabl connect two devic simultan',
 'read care',
 'defect product',
 'dont buy',
 'realli boat beat other price rang 5 6k',
 'good',
 'realli cheep qualiti',
 'awesom bass batteri life',
 'review updat product fail month',
 'ok qualiti',
 'good sound qualiti',
 'amaz product go',
 'frequent failur',
 'puchas sub 1000 price offer otherwis expens qualiti',
 'averag p

#Vectorization


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
cv = CountVectorizer(ngram_range=(1,2))

In [15]:
x_vec = cv.fit_transform(x_clean).toarray()

In [16]:
x_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
print(cv.get_feature_names())

['00', '00bt', '00bt sennheis', '0th', '0th day', '10', '10 10', '10 awesom', '10 bass', '10 build', '10 day', '10 good', '10 hr', '10 min', '10 minut', '10 overal', '10 sound', '10 vocal', '100', '100 baat', '100 nois', '100 paisa', '100 percent', '100 satisfact', '100 satisfi', '100 valu', '100 worth', '1000', '1000 1300', '1000 1500', '1000 grab', '1000 less', '1000 price', '1000 rupe', '10000', '1000r', '1049', '1099', '10day', '10mtr', '10mtr definit', '10th', '10th day', '11', '11 day', '11 month', '1100', '1100 bought', '1100 margin', '1100 produ', '1100 rupe', '1100r', '12', '12 day', '1200', '1200 go', '1200 inr', '1299r', '13', '13 work', '1300', '1300 buck', '1399', '1399 price', '14', '14 day', '1400', '1400 buck', '1400 worth', '1499', '1499 amazon', '1499 best', '1499 sound', '1499 worth', '15', '15 20', '15 day', '15 min', '15 minut', '1500', '1500 1600', '1500 2000', '1500 black', '1500 buck', '1500 buy', '1500 go', '1500 good', '1500 headset', '1500 product', '1500 rs'

In [18]:
xt_vect = cv.transform(xt_clean).toarray()



#Multinomial Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB

In [20]:
mn = MultinomialNB()

In [21]:
mn.fit(x_vec,Y_train)

MultinomialNB()

In [22]:
y_pred = mn.predict(xt_vect)

In [23]:
y_pred

array([1, 1, 1, ..., 0, 1, 1])

In [24]:
result = np.array([list(X_test),list(Y_test),list(y_pred)])
result = np.transpose(result)
result = pd.DataFrame(result, columns = ['ReviewTitle', 'Sentiment', 'Prediction'])
result

Unnamed: 0,ReviewTitle,Sentiment,Prediction
0,Four Stars\n,1,1
1,Built Quality is bad\n,0,1
2,Has been using for kast one year and Happy wit...,1,1
3,Good for listening song\n,1,1
4,Must buy\n,1,1
...,...,...,...
4332,Good\n,1,1
4333,Amazing Product\n,1,1
4334,Not bad\n,0,0
4335,a good product\n,1,1


In [25]:
from sklearn.metrics import accuracy_score

accuracy_score(Y_test, y_pred)

0.8522019829375144