In [2]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
import sqlite3

# Using SQLite to read data table

In [3]:
con = sqlite3.connect('./database.sqlite')

## Filtering Only Positive(4,5) & Negative(1,2) Reviews i.e. not including Rating = 3

In [4]:
filtered_data = pd.read_sql_query('''
SELECT * 
FROM Reviews
WHERE Score != 3 
''',con)

## Differentiating Between Positive and Negative Reviews

In [5]:
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition)
filtered_data['Score'] = positiveNegative

In [6]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [7]:
filtered_data.shape

(525814, 10)

## Data Cleaning : Deduplication

In [8]:
display = pd.read_sql_query('''
SELECT *
FROM Reviews
WHERE Score != 3 AND  UserId = "AR5J8UI46CURR"
ORDER BY ProductId
''', con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


## Sorting Data according to the ProductId in ascending Order

In [9]:
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending = True )

In [10]:
final = sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep= 'first', inplace= False)
final.shape

(364173, 10)

## Checking the amount of data retained

In [11]:
(final['Id'].size*1.0/filtered_data['Id'].size*1.0)*100

69.25890143662969

## Helpfulness

In [12]:
display = pd.read_sql_query('''
SELECT * 
FROM Reviews 
WHERE Score != 3 AND Id = 44737 OR Id = 64422
ORDER BY ProductId
''', con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


**Here we can see that HelpfullnessNumerator(Yes) is greater than HelpfullnessDenominator(Yes+NO)**.
**Therefore, removing such rows**

In [13]:
final = final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
final.shape

(364171, 10)

In [14]:
final['Score'].value_counts()

positive    307061
negative     57110
Name: Score, dtype: int64

# Text Preprocessing : Stemming, Stop-words removal and Lemmatization

**Importing Libraries**

In [15]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

Preprocessing Steps
1. Removing HTML tags
2. Remove punctuations or .,*# 
3. Check if word made up of english or alpha numeric
4. Check if length of word is greater than 2
5. Convert the word to lower case
6. Remove Stop-words

In [16]:
i = 0
for sent in final['Text'].values:
    if (len(re.findall('<.*?>',sent))):
        print(i)
        print(sent)
        break
    i += 1;
        

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [17]:
stop = set(stopwords.words('english'))
sno = nltk.stem.SnowballStemmer('english')

def cleanhtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ' , sentence)
    return cleantext
def cleanpunc(sentence):
    cleaned = re.sub(r'[?|!|\'|"|#]',r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ', cleaned)
    return cleaned
print(stop)
print("***************************")
print(sno.stem('tasty'))

{'once', 'which', 'about', "needn't", 'both', 'yourself', 'and', 'before', 'does', "don't", 'couldn', "hasn't", 'these', 'yours', 'because', 'again', 'if', 'its', 'there', 'herself', "didn't", 'in', "aren't", 'doing', 'of', 'our', 'why', 'will', "hadn't", 'from', 'at', 'yourselves', 'being', 'such', 'are', "weren't", 'did', "haven't", 'can', 'had', 'above', 'further', 'haven', 'o', 'over', 'very', 'hasn', "shouldn't", 'between', 'y', 'that', 'how', 're', 'just', 'aren', 'should', 'm', 'but', 'no', 'isn', 'whom', "mightn't", 'be', 'as', 'his', "won't", 'weren', 'he', 'she', 'same', "she's", 't', "that'll", 'down', 'theirs', 'than', 'this', 'am', 'all', 'is', 'my', 'i', 'now', 'a', 'those', 'their', 'into', "it's", 'they', 'don', 'who', 'during', 's', 'where', 'any', 'few', 've', 'have', "you're", 'not', "doesn't", 'having', 'while', "shan't", 'were', 'own', 'for', "you'll", 'with', 'against', 'until', 'me', 'after', 'won', 'off', 'myself', 'wouldn', "you'd", 'him', 'too', "wasn't", 'und

### Code for implementing step by step the checks mentioned in preprocessing

In [18]:
i = 0
strl = ' '
final_string = []
all_positive_words = []
all_negative_words = []
s = ''
for sent in final['Text'].values:
    filtered_sentence = []
    #print(sent);
    sent = cleanhtml(sent) #remove HTML tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):
                if(cleaned_words.lower() not in stop):
                    s = (sno.stem(cleaned_words.lower()))
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i] == 'positive':
                        all_positive_words.append(s)
                    if (final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s)
                else:
                    continue
            else:
                continue
    #print(filtered_sentence)
    strl = " ".join(filtered_sentence) #final string of cleaned words
    #print('*******************************************')
    final_string.append(strl)
    i+=1

In [19]:
final['CleanedText'] = final_string

In [20]:
final.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witti littl book make son laugh loud recit car...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew read sendak book watch realli rosi movi i...
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn month year learn poem t...


In [21]:
## Store final dataframe as sqlite table for future.
conn = sqlite3.connect('final.sqlite')
c = conn.cursor()
conn.text_factory = str
final.to_sql ('Reviews' , conn, schema = None, if_exists= 'replace')

# Bag Of Words (BoW) : Uni-grams, Bi-Grams, n-grams

In [22]:
freq_dist_positive = nltk.FreqDist(all_positive_words)
freq_dist_negative = nltk.FreqDist(all_negative_words)
print("Most Common Positive words : ", freq_dist_positive.most_common(20))
print("Most Common Negative words : ", freq_dist_negative.most_common(20))

Most Common Positive words :  [('like', 139429), ('tast', 129047), ('good', 112766), ('flavor', 109624), ('love', 107357), ('use', 103888), ('great', 103870), ('one', 96726), ('product', 91033), ('tri', 86791), ('tea', 83888), ('coffe', 78814), ('make', 75107), ('get', 72125), ('food', 64802), ('would', 55568), ('time', 55264), ('buy', 54198), ('realli', 52715), ('eat', 52004)]
Most Common Negative words :  [('tast', 34585), ('like', 32330), ('product', 28218), ('one', 20569), ('flavor', 19575), ('would', 17972), ('tri', 17753), ('use', 15302), ('good', 15041), ('coffe', 14716), ('get', 13786), ('buy', 13752), ('order', 12871), ('food', 12754), ('dont', 11877), ('tea', 11665), ('even', 11085), ('box', 10844), ('amazon', 10073), ('make', 9840)]


**Uni-grams**

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
final_counts = count_vect.fit_transform(final['Text'].values)

In [24]:
type(final_counts)

scipy.sparse.csr.csr_matrix

In [25]:
final_counts.shape

(364171, 115281)

**Bi-grams**

In [26]:
count_vect = CountVectorizer(ngram_range=(1,2))
final_bigram_counts = count_vect.fit_transform(final['Text'].values)

In [27]:
final_bigram_counts.shape

(364171, 2910192)

# TF-IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(ngram_range = (1,2))
final_tfidf = tf_idf.fit_transform(final['Text'].values)

In [29]:
final_tfidf.get_shape()

(364171, 2910192)

In [30]:
features = tf_idf.get_feature_names()
len(features)



2910192

In [31]:
features[100000:100020]

['ales until',
 'ales ve',
 'ales would',
 'ales you',
 'alessandra',
 'alessandra ambrosia',
 'alessi',
 'alessi added',
 'alessi also',
 'alessi and',
 'alessi are',
 'alessi at',
 'alessi brand',
 'alessi breadsticks',
 'alessi caffe',
 'alessi cento',
 'alessi chicken',
 'alessi coarse',
 'alessi coffees',
 'alessi decaf']

**Convert a row in sparsematrix to numpy array**

In [32]:
print(final_tfidf[4,:].toarray()[0])

[0. 0. 0. ... 0. 0. 0.]


In [33]:
def top_tfidf_features(row, features, top_n= 25 ):
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature','tfidf']
    return df
top_tfidf = top_tfidf_features(final_tfidf[1,:].toarray()[0],features, 25)

In [34]:
top_tfidf

Unnamed: 0,feature,tfidf
0,sendak books,0.173437
1,rosie movie,0.173437
2,paperbacks seem,0.173437
3,cover version,0.173437
4,these sendak,0.173437
5,the paperbacks,0.173437
6,pages open,0.173437
7,really rosie,0.168074
8,incorporates them,0.168074
9,paperbacks,0.168074


# Word2Vec

In [35]:
!pip install gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
import numpy



In [36]:
import gensim
from scipy import stats
models = KeyedVectors.load_word2vec_format('E:/Data Science/Projects/Amazon Fine Food Reviews/GoogleNews-vectors-negative300.bin', binary=True, unicode_errors= 'ignore')

In [37]:
models['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [38]:
models.similarity('tasty','delicious')

0.873039

In [39]:
models.most_similar('man')

[('woman', 0.7664012908935547),
 ('boy', 0.6824871301651001),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903203964233),
 ('girl', 0.5921714305877686),
 ('suspected_purse_snatcher', 0.571636438369751),
 ('robber', 0.5585118532180786),
 ('Robbery_suspect', 0.5584409832954407),
 ('teen_ager', 0.5549196004867554),
 ('men', 0.5489763021469116)]

In [40]:
i = 0
list_of_sent = []
for sent in final['Text'].values:
    filtered_sentence = []
    sent = cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if (cleaned_words.isalpha()):
                filtered_sentence.append(cleaned_words.lower())
            else:
                continue
    list_of_sent.append(filtered_sentence)
    

In [63]:
# --NotebookApp.iopub_data_rate_limit=1.0e10
print(final['Text'].values[0])
print('**********************************************************')
print(list_of_sent)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [46]:
w2v_model = gensim.models.Word2Vec(list_of_sent, min_count = 5, workers = 2)

In [64]:
words = list(w2v_model.wv.key_to_index)
print(len(words))

33783


In [66]:
w2v_model.wv.most_similar('tasty')

[('tastey', 0.8797308802604675),
 ('yummy', 0.8160552978515625),
 ('satisfying', 0.7841655015945435),
 ('delicious', 0.7786670923233032),
 ('flavorful', 0.7761139273643494),
 ('filling', 0.7319689989089966),
 ('versatile', 0.695934534072876),
 ('tasteful', 0.6948708891868591),
 ('addicting', 0.6794335246086121),
 ('hardy', 0.6760690808296204)]

In [67]:
w2v_model.wv.most_similar('like')

[('resemble', 0.6237977147102356),
 ('dislike', 0.5969998836517334),
 ('prefer', 0.5630308389663696),
 ('alright', 0.5492289662361145),
 ('love', 0.5468196868896484),
 ('mean', 0.5444397330284119),
 ('enjoy', 0.5367627143859863),
 ('think', 0.5327551364898682),
 ('overpower', 0.5321547389030457),
 ('weird', 0.5311552882194519)]