In [1]:
import string
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook
import sqlite3
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
# nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [2]:
# Open the raw Amazon reviews SQLite database.
conn = sqlite3.connect('../datafiles/amazon_reviews.sqlite')
# Load every row of the Reviews table except those with Score == 3;
# the remaining scores are turned into 'positive'/'negative' labels by scr().
data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score!=3""",conn)

### Data Cleansing

In [3]:
def scr(s):
    """Map a numeric review score to a sentiment label.

    Returns 'positive' when `s` is strictly greater than 3 and 'negative'
    for every other value.
    """
    return 'positive' if s > 3 else 'negative'

In [4]:
data['Score']

0         5
1         1
2         4
3         2
4         5
         ..
525809    5
525810    2
525811    5
525812    5
525813    5
Name: Score, Length: 525814, dtype: int64

In [5]:
data['Score'] = data['Score'].apply(scr)

In [6]:
data.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [7]:
# Treat rows that share the same user, profile name, timestamp and text as the
# same review and keep only the first occurrence. The columns are passed as a
# list (an ordered sequence of labels) rather than an unordered set.
cus_data = data.drop_duplicates(subset=['UserId', 'ProfileName', 'Time', 'Text'], keep='first')
# Keep only rows where HelpfulnessNumerator <= HelpfulnessDenominator (rows
# violating this are presumably data errors). The explicit .copy() makes
# cus_data an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
cus_data = cus_data[cus_data['HelpfulnessNumerator'] <= cus_data['HelpfulnessDenominator']].copy()
cus_data.shape

(364171, 10)

In [8]:
cus_data['Score'].value_counts()

positive    307061
negative     57110
Name: Score, dtype: int64

In [9]:
cus_data.iloc[1500]['Text']

'Aboulutely love Popchips!I first tried these healthy chips at a marathon i did in California. I like this variety pack because i got to try alot of the flavors ive never had.'

>>>### Text Preprocessing

- Begin by removing the HTML tags
- Remove punctuation and a limited set of special characters such as , . # etc.
- Check that the word is made up of English letters only and is not alphanumeric
- Check that the length of the word is greater than 2 (as it was researched that there are no adjectives with only 2 letters)
- Convert the word to lowercase
- Remove stopwords
- Finally, apply Snowball stemming to the word (it was observed to be better than Porter stemming)

After which we collect the words used to describe positive and negative reviews

>- ## cleaning html tags

In [10]:
def cln_html(sen):
    """Return `sen` with every '<...>' tag (shortest match) replaced by a single space."""
    return re.sub(r'<.*?>', r' ', sen)
def cln_punc(sen):
    """Strip punctuation from `sen` in two passes.

    Pass 1 deletes the characters ? ! ' " # (and, because `|` is a literal
    inside a character class, also the pipe character).
    Pass 2 replaces . , ) ( / with a space. Note that `\\|` in that pattern is
    an escaped pipe, so a backslash in the input is left untouched.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sen)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
# English stop words from the NLTK corpus, held in a set for fast membership tests.
stop = set(stopwords.words('english'))
# English Snowball stemmer used to reduce each kept word to its stem.
sno = nltk.stem.SnowballStemmer('english') 

## Do not run the code below, as it takes a long time to run through ~364k reviews.
The required data is stored in the SQL file; whenever it is needed, load it from that SQL file instead.

In [19]:
# Build the cleaned corpus from the raw review text.
#   final_string       - one entry per review: its kept stems joined by spaces (bytes)
#   all_positive_words - every kept stem from reviews labelled 'positive'
#   all_negative_words - every kept stem from reviews labelled 'negative'
final_string = []
all_positive_words = []
all_negative_words = []

# Extract both columns once instead of re-reading cus_data['Score'].values
# for every single word inside the inner loop.
review_texts = cus_data['Text'].values
review_labels = cus_data['Score'].values

for sent, label in zip(review_texts, review_labels):
    filtered_sentence = []
    for w in cln_html(sent).split():
        for cleaned_words in cln_punc(w).split():
            # Keep only purely alphabetic tokens longer than two characters.
            if not (cleaned_words.isalpha() and len(cleaned_words) > 2):
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            # Stems are stored as UTF-8 encoded bytes, which is why the
            # per-review string below is joined with a bytes separator.
            s = sno.stem(lowered).encode('utf8')
            filtered_sentence.append(s)
            if label == 'positive':
                all_positive_words.append(s)
            elif label == 'negative':
                all_negative_words.append(s)
    final_string.append(b" ".join(filtered_sentence))

In [20]:
print(len(final_string))
final_string[1]

364171


b'product arriv label jumbo salt peanut peanut actual small size unsalt sure error vendor intend repres product jumbo'

In [21]:
cus_data['CleanedText']=final_string

- Storing in SQL file

In [22]:
# Write the cleaned reviews to their own SQLite file, replacing any existing
# 'Reviews' table. index=True also stores the DataFrame index as a column.
conn = sqlite3.connect('../datafiles/cus_data.sqlite')
conn.text_factory = str
try:
    cus_data.to_sql('Reviews', conn, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)
    conn.commit()
finally:
    # Always release the database handle, even if the write fails.
    conn.close()

- Checking the stored data

In [23]:
conn = sqlite3.connect('../datafiles/cus_data.sqlite')
temp_data = pd.read_sql_query(""" SELECT * FROM Reviews""",conn)

In [24]:
print(temp_data.shape)
temp_data

(364171, 12)


Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,b'bought sever vital can dog food product foun...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,b'product arriv label jumbo salt peanut peanut...
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,b'confect around centuri light pillowi citrus ...
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,b'look secret ingredi robitussin believ found ...
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...,b'great taffi great price wide assort yummi ta...
...,...,...,...,...,...,...,...,...,...,...,...,...
364166,525809,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,positive,1299628800,Will not do without,Great for sesame chicken..this is a good if no...,b'great sesam chicken good better restur eaten...
364167,525810,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,negative,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...,b'disappoint flavor chocol note especi weak mi...
364168,525811,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,positive,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o...",b'star small give one train session tri train ...
364169,525812,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,positive,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...,b'best treat train reward dog good groom lower...


- Storing the cleaned sentences in a pickle file

In [36]:
# Three objects are written back-to-back into one file, in this order:
# the cleaned DataFrame, the positive stems, the negative stems.
# They must be read back with successive pickle.load() calls in the same order.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('../datafiles/pickles/affr_clnd_sentences','wb') as affr_clnd_sentences:
    pickle.dump(cus_data,affr_clnd_sentences)
    pickle.dump(all_positive_words,affr_clnd_sentences)
    pickle.dump(all_negative_words,affr_clnd_sentences)

- Retrieving the data

In [33]:
# Objects come back in the order they were dumped: first the cleaned
# DataFrame, then the list of positive stems. The third object in the file
# (the negative stems) is not read here.
with open('../datafiles/pickles/affr_clnd_sentences','rb') as temp:
    temp_cnldsentences = pickle.load(temp)
    temp_pos =pickle.load(temp)
# Peek at a small slice of the positive stems as a sanity check.
temp_pos[5:15]

[b'food',
 b'product',
 b'found',
 b'good',
 b'qualiti',
 b'product',
 b'look',
 b'like',
 b'stew',
 b'process']

- Checking the data

In [30]:
tempo = (temp_cnldsentences == cus_data)
tempo.sum()

Id                        364171
ProductId                 364171
UserId                    364171
ProfileName               364171
HelpfulnessNumerator      364171
HelpfulnessDenominator    364171
Score                     364171
Time                      364171
Summary                   364171
Text                      364171
CleanedText               364171
dtype: int64

- ## Bag of words

In [46]:
count_vect = CountVectorizer() # scikit-learn bag-of-words vectorizer with default settings
bow = count_vect.fit_transform(cus_data['Text'].values) # fitted on the raw review text; pandas recommends .to_numpy() over .values

In [47]:
print(bow.dtype,'\n',type(bow),'\n',bow.shape)

int64 
 <class 'scipy.sparse.csr.csr_matrix'> 
 (364171, 115281)


- BOW pickle

In [48]:
with open('../datafiles/pickles/affr_bow','wb') as affr_bow:
    pickle.dump(bow,affr_bow)

- checking pickle

In [49]:
with open('../datafiles/pickles/affr_bow','rb') as affr_bow:
    temp = pickle.load(affr_bow)

In [50]:
temp.shape

(364171, 115281)

- ## n-grams

In [32]:
# Count how often each stem occurs across the positive and the negative reviews.
freq_dist_positive=nltk.FreqDist(all_positive_words)
freq_dist_negative=nltk.FreqDist(all_negative_words)
# Show the 20 most frequent stems for each sentiment.
print("Most Common Positive Words : ",freq_dist_positive.most_common(20))
print("Most Common Negative Words : ",freq_dist_negative.most_common(20))

Most Common Positive Words :  [(b'like', 139426), (b'tast', 129045), (b'good', 112766), (b'flavor', 109628), (b'love', 107357), (b'use', 103886), (b'great', 103871), (b'one', 96723), (b'product', 91033), (b'tri', 86790), (b'tea', 83893), (b'coffe', 78813), (b'make', 75107), (b'get', 72124), (b'food', 64803), (b'would', 55566), (b'time', 55264), (b'buy', 54198), (b'realli', 52714), (b'eat', 52004)]
Most Common Negative Words :  [(b'tast', 34587), (b'like', 32333), (b'product', 28218), (b'one', 20572), (b'flavor', 19571), (b'would', 17974), (b'tri', 17754), (b'use', 15304), (b'good', 15041), (b'coffe', 14717), (b'get', 13787), (b'buy', 13752), (b'order', 12871), (b'food', 12753), (b'dont', 11877), (b'tea', 11660), (b'even', 11088), (b'box', 10843), (b'amazon', 10073), (b'make', 9840)]


In [54]:
n_grm = CountVectorizer(ngram_range=(1,2) )
final_bigram_counts = n_grm.fit_transform(cus_data['Text'].to_numpy())

>> As you can see, we passed (1, 2) as ngram_range, so we get both unigram and bigram features. This reduces the ambiguity seen in the cells above, where both the positive and the negative word lists contain words like 'taste' and 'like'. That gives a misleading picture, because negative reviews may contain phrases such as 'did not like' or 'not tasty', which cannot be seen in unigram features but are captured by bigrams.
But it comes at the cost of dimensionality, i.e. from 115,281 to 2,910,192 features.

In [55]:
final_bigram_counts

<364171x2910192 sparse matrix of type '<class 'numpy.int64'>'
	with 45049660 stored elements in Compressed Sparse Row format>

- Pickling

In [57]:
with open('../datafiles/pickles/affr_bigram_bow','wb') as affr_bigram_bow:
    pickle.dump(final_bigram_counts,affr_bigram_bow)
with open('../datafiles/pickles/affr_bigram_bow','rb') as affr_bigram_bow:
    temp = pickle.load(affr_bigram_bow)

In [58]:
temp.shape

(364171, 2910192)

- ## Tf-Idf

TfidfVectorizer is Equivalent to CountVectorizer followed by TfidfTransformer.

In [61]:
tf_idf = TfidfVectorizer().fit_transform(cus_data['Text'])

In [62]:
tf_idf

<364171x115281 sparse matrix of type '<class 'numpy.float64'>'
	with 19341760 stored elements in Compressed Sparse Row format>

In [63]:
tf_idf2=TfidfVectorizer(ngram_range=(1,2))
tf_idf2_vec =  tf_idf2.fit_transform(cus_data['Text'])

In [64]:
pd.Series(tf_idf2.get_feature_names()).iloc[100000:100010]

100000             ales until
100001                ales ve
100002             ales would
100003               ales you
100004             alessandra
100005    alessandra ambrosia
100006                 alessi
100007           alessi added
100008            alessi also
100009             alessi and
dtype: object

In [65]:
# Cache the uni+bigram feature names once; the list has one entry per column
# of tf_idf2_vec, so rebuilding it just to discard the result is wasteful.
# The trailing ';' keeps the cell from echoing any output.
features = tf_idf2.get_feature_names();

In [66]:
tf_idf2.get_feature_names()[100000:100010]

['ales until',
 'ales ve',
 'ales would',
 'ales you',
 'alessandra',
 'alessandra ambrosia',
 'alessi',
 'alessi added',
 'alessi also',
 'alessi and']

In [68]:
# Convert one row of the sparse matrix to a dense numpy array
print(tf_idf2_vec[3,:].toarray()[0])
print("------------------------------------------------------")
# The same row in sparse (row, column) -> value form
print(tf_idf2_vec[3,:]) 
print("------------------------------------------------------")
# The fitted vectorizer object itself
print(tf_idf2)
print("------------------------------------------------------")
# The whole sparse tf-idf matrix
print(tf_idf2_vec)
print("------------------------------------------------------")
# A bare expression in the middle of a cell is not displayed, so print it.
print(type(tf_idf2_vec))
print("------------------------------------------------------")
tf_idf2_vec.shape

[0. 0. 0. ... 0. 0. 0.]
------------------------------------------------------
  (0, 2746741)	0.20864406459243112
  (0, 1328386)	0.07471461675685441
  (0, 966736)	0.08577072448646748
  (0, 2521977)	0.07525969862030189
  (0, 2305119)	0.16720704162458458
  (0, 519921)	0.1788777240190179
  (0, 2312964)	0.21231435167256238
  (0, 1505491)	0.1452554084765105
  (0, 151277)	0.1203337236728053
  (0, 1092035)	0.09241910984219587
  (0, 2774516)	0.10850684818597585
  (0, 2822510)	0.1119547534739983
  (0, 1805781)	0.2144686151689472
  (0, 900675)	0.23240808025553095
  (0, 305841)	0.23240808025553095
  (0, 2141485)	0.1401240393842999
  (0, 2533496)	0.16061453189794056
  (0, 74015)	0.11043777388007137
  (0, 1267174)	0.11761910694232527
  (0, 2578626)	0.10058011930205456
  (0, 1101627)	0.1133438791277944
  (0, 1336429)	0.12012212218074879
  (0, 1009523)	0.09654312355740105
  (0, 313959)	0.185494900101343
  (0, 2136821)	0.23982407761531543
  :	:
  (0, 313760)	0.09142321641094835
  (0, 2136820)	0.207052

(364171, 2910192)

In [46]:
# source: https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    """Get the top n tf-idf values in a row together with their feature names.

    Parameters
    ----------
    row : 1-D numpy array of tf-idf values, one per feature.
    features : sequence of feature names, indexable by the positions in `row`.
    top_n : maximum number of entries to return (default 25).

    Returns
    -------
    pandas.DataFrame with columns 'feature' and 'tfidf', sorted by descending
    tf-idf value. An empty `row` yields an empty frame with the same columns.
    """
    # argsort returns the indices that sort the data in ascending order;
    # [::-1] reverses them to descending order and [:top_n] keeps the leaders.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor so an empty result still gets the
    # expected column labels instead of failing on a length mismatch.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

top_tfidf = top_tfidf_feats(tf_idf2_vec[1,:].toarray()[0],features,25)
top_tfidf

Unnamed: 0,feature,tfidf
0,as jumbo,0.390489
1,jumbo,0.260971
2,unsalted not,0.201475
3,jumbo salted,0.201475
4,vendor intended,0.201475
5,sized unsalted,0.201475
6,arrived labeled,0.187395
7,peanuts,0.186777
8,actually small,0.184594
9,error or,0.176745


- Pickling

In [69]:
with open('../datafiles/pickles/affr_unigram_tfidf','wb') as affr_unigram_tfidf:
    pickle.dump(tf_idf,affr_unigram_tfidf)
with open('../datafiles/pickles/affr_unigram_tfidf','rb') as affr_unigram_tfidf:
    temp = pickle.load(affr_unigram_tfidf)
temp.shape

(364171, 115281)

In [72]:
with open('../datafiles/pickles/affr_ngram_tfidf','wb') as affr_ngram_tfidf:
    pickle.dump(tf_idf2_vec,affr_ngram_tfidf)
with open('../datafiles/pickles/affr_ngram_tfidf','rb') as affr_ngram_tfidf:
    temp = pickle.load(affr_ngram_tfidf)
temp.shape

(364171, 2910192)

- # word2Vec 

from gensim.models import Word2Vec

from gensim.models import KeyedVectors

import pickle

- ### in this project we are using a pretrained model by google

- ### its 3.3G file, once you load this into your memory 

- ### it occupies ~9Gb, so please do this step only if you have >12G of ram

- ### we will provide a pickle file which contains a dict,

- ### and it contains all our corpus words as keys and model[word] as values

- ### To use this code-snippet, download "GoogleNews-vectors-negative300.bin" 

- ### from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

- ### it's 1.9GB in size.

model = KeyedVectors.load_word2vec_format('../datafiles/GoogleNews-vectors-negative300.bin', binary=True)

model.wv['computer']

model.wv.similarity('woman', 'man')

model.wv.most_similar('woman')

model.wv.most_similar('tasti')  # "tasti" is the stemmed word for tasty, tastful

model.wv.most_similar('tasty') 

model.wv.similarity('tasty', 'tast')

In [None]:
# If you do NOT have RAM >= 12GB, use the code below.
# We don't have this pickle yet; once it is available, the model can be loaded from it and run.

# with open('word2vec_model', 'rb') as handle:
#     model = pickle.load(handle)

In [11]:
# Train your own Word2Vec model using your own text corpus

# Tokenise every review for Word2Vec training: strip HTML tags, split on
# whitespace, strip punctuation, and keep the lowercase form of each purely
# alphabetic token. No stop-word removal or stemming is applied here.
list_of_sent = []
for review in cus_data['Text'].values:
    tokens = [
        piece.lower()
        for chunk in cln_html(review).split()
        for piece in cln_punc(chunk).split()
        if piece.isalpha()
    ]
    list_of_sent.append(tokens)

In [12]:
print(cus_data['Text'].values[0])
print("*****************************************************************")
print(list_of_sent[0])

I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
*****************************************************************
['i', 'have', 'bought', 'several', 'of', 'the', 'vitality', 'canned', 'dog', 'food', 'products', 'and', 'have', 'found', 'them', 'all', 'to', 'be', 'of', 'good', 'quality', 'the', 'product', 'looks', 'more', 'like', 'a', 'stew', 'than', 'a', 'processed', 'meat', 'and', 'it', 'smells', 'better', 'my', 'labrador', 'is', 'finicky', 'and', 'she', 'appreciates', 'this', 'product', 'better', 'than', 'most']


In [None]:
# Train a Word2Vec model on the tokenised reviews: 50-dimensional vectors
# (size=50), words with fewer than 5 occurrences are dropped (min_count=5),
# and 4 worker threads are used (workers=4).
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=4)    


In [None]:
words = list(w2v_model.wv.vocab)
print(len(words))

In [None]:
w2v_model.wv.most_similar('tasty')

In [None]:
w2v_model.wv.most_similar('like')

In [None]:
count_vect_feat = count_vect.get_feature_names() # list of words in the BoW
# Look the column index up instead of hard-coding it, so the cell stays
# correct if the vocabulary (and therefore the index) changes.
like_idx = count_vect_feat.index('like')
print(count_vect_feat[like_idx])

- ## Avg W2V

In [None]:
# average Word2Vec
# compute average word2vec for each review.
# Average Word2Vec: represent each review by the mean of the vectors of its
# words that are present in the trained model's vocabulary.
sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
vec_dim = w2v_model.wv.vector_size  # dimensionality of the trained word vectors
for sent in list_of_sent:  # for each review/sentence
    sent_vec = np.zeros(vec_dim)  # accumulator, starts as the zero vector
    cnt_words = 0  # num of words with a valid vector in the sentence/review
    for word in sent:  # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            # The word has no vector in the model (e.g. dropped by min_count).
            continue
        sent_vec += vec
        cnt_words += 1
    # Reviews without any known word keep the zero vector instead of turning
    # into NaNs through a 0/0 division.
    if cnt_words:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

- ## TfIdf-W2V 

In [None]:
# TF-IDF weighted Word2Vec
# TF-IDF weighted Word2Vec: represent each review by the tf-idf weighted mean
# of its word vectors.
#
# Uses the vectorizer fitted earlier in this notebook (tf_idf2) and its matrix
# (tf_idf2_vec: row = review, column = n-gram, value = tf-idf). Its vocabulary
# contains the unigrams needed here alongside the bigrams.
tfidf_vocab = tf_idf2.vocabulary_  # dict: term -> column index (O(1) lookup)
vec_dim = w2v_model.wv.vector_size  # dimensionality of the trained word vectors

tfidf_sent_vectors = []  # the tfidf-w2v for each sentence/review is stored in this list
for row, sent in enumerate(list_of_sent):  # list_of_sent is aligned with the matrix rows
    sent_vec = np.zeros(vec_dim)  # accumulator, starts as the zero vector
    weight_sum = 0.0  # sum of the tf-idf weights actually applied

    # Pull the non-zero tf-idf entries of this review out of the sparse matrix once.
    tfidf_row = tf_idf2_vec.getrow(row)
    row_weights = dict(zip(tfidf_row.indices.tolist(), tfidf_row.data.tolist()))

    for word in sent:  # for each word in a review/sentence
        # Skip words the vectorizer does not know or that have no weight in this row.
        weight = row_weights.get(tfidf_vocab.get(word))
        if weight is None:
            continue
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            # The word has no vector in the model (e.g. dropped by min_count).
            continue
        sent_vec += vec * weight
        weight_sum += weight

    # Reviews without any weighted word keep the zero vector instead of
    # turning into NaNs through a 0/0 division.
    if weight_sum:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)