# Analysing the Amazon Food Dataset and Classifying the Review as either Positive or Negative 

In [1]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve,auc
from nltk.stem.porter import PorterStemmer

In [2]:
con=sqlite3.connect("database.sqlite")

In [3]:
filtered_data=pd.read_sql_query("""
Select * 
from Reviews
where Score!=3
""",con)

In [4]:
def partition(x):
    if x<3:
        return "negative"
    return "positive"

In [5]:
actualScore=filtered_data["Score"]
positiveNegative=actualScore.map(partition)
filtered_data["Score"]=positiveNegative

In [6]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [7]:
filtered_data=filtered_data.head(5000) #Taken only 5000 records for last processing 

In [8]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [9]:
filtered_data.shape

(5000, 10)

In [10]:
# Randomly checking duplicate data
display=pd.read_sql_query("""
Select * 
from Reviews 
where Score !=3 and UserId="AR5J8UI46CURR"
Order by ProductId""",con)


In [11]:
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [12]:
sorted_data=filtered_data.sort_values("ProductId",axis=0,ascending=True)


In [13]:
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"},keep="first",inplace=False)

In [14]:
final.shape

(4986, 10)

In [15]:
# To know how much percent of data is retained after eliminating duplicates
(final["Id"].size*1.0/filtered_data["Id"].size*1.0)*100

99.72

In [16]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [17]:
(final["Id"].size*1.0/filtered_data["Id"].size*1.0)*100

99.72

In [18]:
final.shape

(4986, 10)

In [19]:
# To know number positive and negative class labels
final["Score"].value_counts()

positive    4178
negative     808
Name: Score, dtype: int64

In [20]:
#Bag Of Words model
count_vect=CountVectorizer()
final_counts=count_vect.fit_transform(final["Text"].values)

In [21]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [22]:
final_counts.get_shape()

(4986, 13510)

In [23]:
import re
i=0
for sentence in final["Text"].values:
    if(len(re.findall("<.*?>",sentence))):
            print(i)
            print(sentence)
            break
    i+=1

0
Why is this $[...] when the same product is available for $[...] here?<br />http://www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.


In [24]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
stop=set(stopwords.words("english"))
sno=nltk.stem.SnowballStemmer("english")
def cleanhtml(sentence):
    cleanr=re.compile("<.*?>")
    cleantext=re.sub(cleanr," ",sentence)
    return cleantext
def cleanpunc(sentence):
    cleaned=re.sub(r'[?|!|\|"|#]',r" ",sentence)
    cleaned=re.sub(r'[.|,|)|(|\|/]',r" ",cleaned)
    return cleaned
print(stop)
print("*****")
print(sno.stem("tasty"))

{'t', 'both', 'being', 'over', 'only', 'my', 'they', 'i', 'themselves', "weren't", 'have', 'isn', "you'd", 'ourselves', 'all', 'not', 'll', 'about', 'any', 'whom', "mustn't", 'through', "she's", 'with', 'against', 'should', 'when', 'did', 'once', "needn't", "mightn't", 'at', 'so', 'was', 'needn', 'than', 'them', 's', 'into', 'for', 'which', "won't", 'couldn', 'that', 'above', 'from', 'few', 'won', 'and', 'a', 'yourself', "doesn't", 'his', "hasn't", 'how', 'who', 'been', 'do', 'ma', "shan't", 'below', 'm', 'weren', 'most', 'haven', 'himself', "aren't", 'had', 'while', 'he', 'our', 'of', "don't", 'him', 'an', 'again', 'each', 'we', "should've", 'you', 'off', 'same', "it's", 'hers', 'their', 'has', 'she', 'doesn', 'wouldn', 'because', "didn't", 'hasn', 'now', 'o', 'am', 'be', 'out', 'up', 'theirs', 'why', 've', 'before', 'very', 'on', 'can', "wasn't", 'nor', 'to', 'hadn', 'your', 'then', 'where', "you're", 'its', "isn't", 'doing', 'didn', 'is', 'me', 'don', 'until', 'yourselves', 'during'

In [25]:
#Text cleaning removing stop words and using Regular Expressions
i=0
strl=" "
final_string=[]
all_positive_words=[]
all_negative_words=[]
s=''
for sentence in final["Text"].values:
    filtered_sentence=[]
    sentence=cleanhtml(sentence)
    for word in sentence.split():
        for cleaned_words in cleanpunc(word).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode("utf8")
                    filtered_sentence.append(s)
                    if(final["Score"].values)[i]=="positive" :
                        all_positive_words.append(s)
                    if(final["Score"].values)[i]=="negative":
                        all_negative_words.append(s)
                else:
                    continue
            else:
                continue
    strl=b" ".join(filtered_sentence)
    final_string.append(strl)
    i+=1

In [26]:
#Adding a new column
final["CleanedText"]=final_string

In [27]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
2546,2774,B00002NCJC,A196AJHU9EASJN,Alex Chaffee,0,0,positive,1282953600,thirty bucks?,Why is this $[...] when the same product is av...,b'product avail www amazon com victor trap unr...
2547,2775,B00002NCJC,A13RRPGE79XFFH,reader48,0,0,positive,1281052800,Flies Begone,We have used the Victor fly bait for 3 seasons...,b'use victor fli bait season beat great product'
1145,1244,B00002Z754,A3B8RCEI0FXFI6,B G Chase,10,10,positive,962236800,WOW Make your own 'slickers' !,I just received my shipment and could hardly w...,b'receiv shipment could hard wait tri product ...
1146,1245,B00002Z754,A29Z5PI9BW2PU3,Robbie,7,7,positive,961718400,Great Product,This was a really good idea and the final prod...,b'realli good idea final product outstand use ...
2942,3204,B000084DVR,A1UGDJP1ZJWVPF,"T. Moore ""thoughtful reader""",1,1,positive,1177977600,Good stuff!,I'm glad my 45lb cocker/standard poodle puppy ...,b'glad cocker standard poodl puppi love stuff ...


In [28]:
#Creating a new database file with created column
conn=sqlite3.connect("new229.sqlite")
c=conn.cursor()
conn.text_factory=str
final.to_sql("Reviews",conn)

In [29]:
freq_dist_positive=nltk.FreqDist(all_positive_words)
freq_dist_negative=nltk.FreqDist(all_negative_words)
print("Most Common Positive Words:\n",freq_dist_positive.most_common(20))

print("\nMost Common Negative Words:\n",freq_dist_negative.most_common(20))


Most Common Positive Words:
 [(b'like', 1812), (b'tast', 1635), (b'good', 1571), (b'flavor', 1547), (b'love', 1469), (b'great', 1442), (b'use', 1269), (b'product', 1202), (b'one', 1187), (b'tri', 1161), (b'coffe', 1019), (b'food', 1016), (b'chip', 995), (b'make', 982), (b'get', 830), (b'tea', 800), (b'bag', 760), (b'buy', 728), (b'best', 708), (b'eat', 708)]

Most Common Negative Words:
 [(b'like', 444), (b'tast', 431), (b'product', 399), (b'tri', 282), (b'one', 281), (b'flavor', 271), (b'would', 247), (b'food', 241), (b'use', 231), (b'good', 207), (b'buy', 187), (b'order', 185), (b'tea', 181), (b'chip', 180), (b'bag', 179), (b'get', 179), (b'even', 169), (b'make', 162), (b'box', 161), (b'mix', 155)]


In [30]:
#Bi-gram Model
count_vect=CountVectorizer(ngram_range=(1,2))
final_bigram_counts=count_vect.fit_transform(final["Text"].values)

In [31]:
final_bigram_counts.shape

(4986, 148211)

In [32]:
# TF-IDF Process with Bi-grams
tf_idf_vect=TfidfVectorizer(ngram_range=(1,2))
final_tf_idf=tf_idf_vect.fit_transform(final["Text"].values)


In [33]:
final_tf_idf.shape

(4986, 148211)

In [34]:
features=tf_idf_vect.get_feature_names()
len(features)

148211

In [35]:
features[500:510]

['150mg',
 '150mg of',
 '15g',
 '15g br',
 '15g carbs',
 '15lb',
 '15lb bichon',
 '15lbs',
 '15lbs and',
 '15mg']

In [36]:
print(final_tf_idf[3,:].toarray()[0]) # Converting a row in sparse matrix to an Array

[0. 0. 0. ... 0. 0. 0.]


In [37]:
def top_tfidf_feats(row,features,top_n=25):
    topn_ids=np.argsort(row)[::-1][:top_n]
    top_feats=[(features[i],row[i]) for i in topn_ids]
    df=pd.DataFrame(top_feats)
    df.columns=["feature","tfidf"]
    return df
# This function gets top tfidf values in row and return them with their corresponding ranks

In [38]:
top_tfidf=top_tfidf_feats(final_tf_idf[1,:].toarray()[0],features,25)
top_tfidf

Unnamed: 0,feature,tfidf
0,fly bait,0.274736
1,seasons can,0.274736
2,for seasons,0.274736
3,victor,0.262108
4,bait for,0.262108
5,the victor,0.262108
6,victor fly,0.262108
7,fly,0.246199
8,seasons,0.246199
9,bait,0.240521


In [39]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

In [40]:
# Using Word2Vec
# Training your own Word2Vec model using your own Text Corpus
import gensim
i=0
list_of_sent=[]
for sent in final["Text"].values:
    filtered_sentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if(cleaned_words.isalpha()):
                filtered_sentence.append(cleaned_words.lower())
            else:
                continue
    list_of_sent.append(filtered_sentence)

In [41]:
print(final["Text"].values[0])
print("********")
print(list_of_sent[0])

Why is this $[...] when the same product is available for $[...] here?<br />http://www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby.
********
['why', 'is', 'this', 'when', 'the', 'same', 'product', 'is', 'available', 'for', 'here', 'www', 'amazon', 'com', 'dp', 'the', 'victor', 'and', 'traps', 'are', 'unreal', 'of', 'course', 'total', 'fly', 'genocide', 'pretty', 'stinky', 'but', 'only', 'right', 'nearby']


In [42]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,vector_size=50)

In [43]:
print(w2v_model)

Word2Vec(vocab=3874, vector_size=50, alpha=0.025)


In [44]:
w2v_model.wv.most_similar("tasty")

[('delicious', 0.9566338658332825),
 ('flavorful', 0.9189849495887756),
 ('light', 0.9026752710342407),
 ('crunchy', 0.8865042924880981),
 ('crisp', 0.8786147832870483),
 ('very', 0.8727086186408997),
 ('pretty', 0.8695505857467651),
 ('fluffy', 0.8675851821899414),
 ('soft', 0.8597180247306824),
 ('fresh', 0.8578742146492004)]

In [45]:
w2v_model.wv.most_similar("good")

[('tasty', 0.840283215045929),
 ('delicious', 0.836420476436615),
 ('pretty', 0.8193181753158569),
 ('great', 0.8173674941062927),
 ('strong', 0.8032594323158264),
 ('quite', 0.7855674624443054),
 ('wonderful', 0.7843709588050842),
 ('very', 0.7778488993644714),
 ('bad', 0.7762178182601929),
 ('flavorful', 0.7674366235733032)]

In [46]:
final


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
2546,2774,B00002NCJC,A196AJHU9EASJN,Alex Chaffee,0,0,positive,1282953600,thirty bucks?,Why is this $[...] when the same product is av...,b'product avail www amazon com victor trap unr...
2547,2775,B00002NCJC,A13RRPGE79XFFH,reader48,0,0,positive,1281052800,Flies Begone,We have used the Victor fly bait for 3 seasons...,b'use victor fli bait season beat great product'
1145,1244,B00002Z754,A3B8RCEI0FXFI6,B G Chase,10,10,positive,962236800,WOW Make your own 'slickers' !,I just received my shipment and could hardly w...,b'receiv shipment could hard wait tri product ...
1146,1245,B00002Z754,A29Z5PI9BW2PU3,Robbie,7,7,positive,961718400,Great Product,This was a really good idea and the final prod...,b'realli good idea final product outstand use ...
2942,3204,B000084DVR,A1UGDJP1ZJWVPF,"T. Moore ""thoughtful reader""",1,1,positive,1177977600,Good stuff!,I'm glad my 45lb cocker/standard poodle puppy ...,b'glad cocker standard poodl puppi love stuff ...
...,...,...,...,...,...,...,...,...,...,...,...
711,765,B009HINRX8,A1OEL4UZT3KKI4,"coffee drinker in PA ""coffee drinker in PA""",0,0,positive,1344988800,great coffee - terrible price,"This is one of the best choices, in my opinion...",b'one best choic opinion also ador amazon need...
710,764,B009HINRX8,ADDBLG0CFY9AI,S.A.D.,1,1,positive,1326758400,Best of the Tassimo's,We've tried many Tassimo flavors. This is by ...,b'tri mani tassimo flavor far favorit normal c...
709,763,B009HINRX8,A3N9477PUE6WMR,patc477,4,4,positive,1323302400,Good Tasting cup o' joe,This is a bold blend that has a great taste. T...,b'bold blend great tast flavor come burst usua...
713,768,B009HINRX8,A2CAZG1CQ8BQI5,Patricia J. Nohalty,0,0,positive,1337212800,Kona for Tassimo,Of all the coffee's available for Tassimo this...,b'avail tassimo kona richest flavor fantast ar...


In [47]:
f1=final
f1.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
2546,2774,B00002NCJC,A196AJHU9EASJN,Alex Chaffee,0,0,positive,1282953600,thirty bucks?,Why is this $[...] when the same product is av...,b'product avail www amazon com victor trap unr...
2547,2775,B00002NCJC,A13RRPGE79XFFH,reader48,0,0,positive,1281052800,Flies Begone,We have used the Victor fly bait for 3 seasons...,b'use victor fli bait season beat great product'
1145,1244,B00002Z754,A3B8RCEI0FXFI6,B G Chase,10,10,positive,962236800,WOW Make your own 'slickers' !,I just received my shipment and could hardly w...,b'receiv shipment could hard wait tri product ...
1146,1245,B00002Z754,A29Z5PI9BW2PU3,Robbie,7,7,positive,961718400,Great Product,This was a really good idea and the final prod...,b'realli good idea final product outstand use ...
2942,3204,B000084DVR,A1UGDJP1ZJWVPF,"T. Moore ""thoughtful reader""",1,1,positive,1177977600,Good stuff!,I'm glad my 45lb cocker/standard poodle puppy ...,b'glad cocker standard poodl puppi love stuff ...


In [48]:
del f1["Text"]

In [49]:
del f1["Summary"]

In [50]:
f1.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,CleanedText
2546,2774,B00002NCJC,A196AJHU9EASJN,Alex Chaffee,0,0,positive,1282953600,b'product avail www amazon com victor trap unr...
2547,2775,B00002NCJC,A13RRPGE79XFFH,reader48,0,0,positive,1281052800,b'use victor fli bait season beat great product'
1145,1244,B00002Z754,A3B8RCEI0FXFI6,B G Chase,10,10,positive,962236800,b'receiv shipment could hard wait tri product ...
1146,1245,B00002Z754,A29Z5PI9BW2PU3,Robbie,7,7,positive,961718400,b'realli good idea final product outstand use ...
2942,3204,B000084DVR,A1UGDJP1ZJWVPF,"T. Moore ""thoughtful reader""",1,1,positive,1177977600,b'glad cocker standard poodl puppi love stuff ...


In [51]:
f1=final[["Id","ProductId","UserId","ProfileName","HelpfulnessNumerator","HelpfulnessDenominator","Time","CleanedText","Score"]]

In [52]:
f1.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Time,CleanedText,Score
2546,2774,B00002NCJC,A196AJHU9EASJN,Alex Chaffee,0,0,1282953600,b'product avail www amazon com victor trap unr...,positive
2547,2775,B00002NCJC,A13RRPGE79XFFH,reader48,0,0,1281052800,b'use victor fli bait season beat great product',positive
1145,1244,B00002Z754,A3B8RCEI0FXFI6,B G Chase,10,10,962236800,b'receiv shipment could hard wait tri product ...,positive
1146,1245,B00002Z754,A29Z5PI9BW2PU3,Robbie,7,7,961718400,b'realli good idea final product outstand use ...,positive
2942,3204,B000084DVR,A1UGDJP1ZJWVPF,"T. Moore ""thoughtful reader""",1,1,1177977600,b'glad cocker standard poodl puppi love stuff ...,positive


In [53]:
x=f1.iloc[:,-2].values

In [54]:

print(x)

[b'product avail www amazon com victor trap unreal cours total fli genocid pretti stinki right nearbi'
 b'use victor fli bait season beat great product'
 b'receiv shipment could hard wait tri product love call instead sticker remov easili daughter design sign print revers use car window print beauti print program go lot fun product window everywher surfac like screen comput monitor'
 ...
 b'bold blend great tast flavor come burst usual brew drink organ sumatra mandel use blend exclus get cup rival complex flavor tassimo brewer fantast come amazon add subscript servic'
 b'avail tassimo kona richest flavor fantast aroma far favorit'
 b'coffe suppos premium tast thin good mayb old sure wast use line bottom sit shoe trash can luggag absorb smell use drink buy']


In [55]:
y=f1.iloc[:,-1].values
y

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'negative'], dtype=object)

In [56]:
#Converting the Target variable into class of 0's and 1's
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)
print(y)

[1 1 1 ... 1 1 0]


In [57]:
x=tf_idf_vect.fit_transform(x)
print(type(x))

<class 'scipy.sparse.csr.csr_matrix'>


In [58]:
#Training and Testing the data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [59]:
#Using Classification Technique for Classifying the text
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(solver="lbfgs")
lr.fit(x_train,y_train)

LogisticRegression()

In [60]:
y_pred=lr.predict(x_train)

In [61]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_pred,y_train))

0.8462155656592671


In [62]:
y1_pred=lr.predict(x_test)

In [63]:
print(accuracy_score(y1_pred,y_test))

0.8404170008019246
