In [1]:
import string
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook
from tqdm import tqdm
import sqlite3
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
# nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [2]:
conn = sqlite3.connect('../datafiles/amazon_reviews.sqlite')
data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score!=3""",conn)

### Data Cleansing

In [3]:
def scr(s):
    """Map a numeric review score to a sentiment label.

    Scores strictly greater than 3 become 'positive'; every other score
    becomes 'negative'.
    """
    return 'positive' if s > 3 else 'negative'

In [4]:
data['Score']

0         5
1         1
2         4
3         2
4         5
         ..
525809    5
525810    2
525811    5
525812    5
525813    5
Name: Score, Length: 525814, dtype: int64

In [5]:
data['Score'] = data['Score'].apply(scr)

In [6]:
data.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [7]:
# Two rows count as the same review when user, profile name, timestamp and
# text all match; only the first such row is kept. Afterwards rows whose
# helpfulness numerator exceeds the denominator are discarded.
dedup_keys = ['UserId', 'ProfileName', 'Time', 'Text']
cus_data = (
    data
    .drop_duplicates(subset=dedup_keys, keep='first')
    .loc[lambda df: df['HelpfulnessNumerator'] <= df['HelpfulnessDenominator']]
)
cus_data.shape

(364171, 10)

In [8]:
cus_data['Score'].value_counts()

positive    307061
negative     57110
Name: Score, dtype: int64

In [9]:
cus_data.iloc[1500]['Text']

'Aboulutely love Popchips!I first tried these healthy chips at a marathon i did in California. I like this variety pack because i got to try alot of the flavors ive never had.'

In [10]:
cus_data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
525809,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,positive,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
525810,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,negative,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
525811,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,positive,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
525812,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,positive,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


>>>### Text Preprocessing

- Begin by removing the html tags
- Remove any punctuations or limited set of special characters like , or . or # etc.
- Check if the word is made up of english letters and is not alpha-numeric
- Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
- Convert the word to lowercase
- Remove Stopwords
- Finally, apply Snowball stemming to the word (it was observed to work better than Porter stemming)

After which we collect the words used to describe positive and negative reviews

>- ## cleaning html tags

In [None]:
def cln_html(sen):
    """Return `sen` with every HTML-like tag replaced by a single space.

    A tag is the shortest run of characters that starts with '<' and ends
    with '>' (without crossing a newline).
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub(r' ', sen)
def cln_punc(sen):
    """Remove or blank out a fixed set of punctuation characters in `sen`.

    The characters ? ! ' " # | are deleted, and each of . , ( ) / is replaced
    by a single space. Every other character is left untouched.
    """
    # In a translate table, None deletes the character and ' ' substitutes a space.
    table = {ord(ch): None for ch in '?!\'"#|'}
    table.update({ord(ch): ' ' for ch in '.,()/'})
    return sen.translate(table)
# English Snowball stemmer and English stop-word set; both are read by the
# review-cleaning loop further down.
sno = nltk.stem.SnowballStemmer('english')
stop = set(stopwords.words('english'))

## Do not run the code below unless necessary: it takes a long time to run through all ~364k reviews.
The required data is already stored in the SQLite file; load it from there whenever it is needed.

In [None]:
# Clean every review and, in the same pass, collect the stemmed words seen in
# positive and in negative reviews.
#
# Per review: strip HTML tags, split on whitespace, strip punctuation, keep
# purely alphabetic tokens longer than two characters, lowercase, drop
# stop-words and Snowball-stem what is left. Stems are stored as UTF-8 bytes,
# so each entry of `final_string` is a bytes object.
final_string = []          # one cleaned, space-joined review per row of cus_data
all_positive_words = []    # stems taken from reviews labelled 'positive'
all_negative_words = []    # stems taken from reviews labelled 'negative'

# Read both columns once instead of re-reading the label column for every token.
review_texts = cus_data['Text'].values
review_labels = cus_data['Score'].values

for sent, label in zip(review_texts, review_labels):
    filtered_sentence = []
    for w in cln_html(sent).split():
        for cleaned_words in cln_punc(w).split():
            # Skip tokens that are not purely alphabetic or are too short.
            if not (cleaned_words.isalpha() and len(cleaned_words) > 2):
                continue
            lowered = cleaned_words.lower()
            if lowered in stop:
                continue
            s = sno.stem(lowered).encode('utf8')
            filtered_sentence.append(s)
            if label == 'positive':
                all_positive_words.append(s)
            elif label == 'negative':
                all_negative_words.append(s)
    final_string.append(b" ".join(filtered_sentence))

In [None]:
print(len(final_string))
final_string[1]

In [None]:
cus_data['CleanedText']=final_string

- Storing in SQL file

In [None]:
conn = sqlite3.connect('../datafiles/cus_data.sqlite')
c=conn.cursor()
conn.text_factory = str
cus_data.to_sql('Reviews', conn, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)

- Checking the stored data

In [None]:
conn = sqlite3.connect('../datafiles/cus_data.sqlite')
temp_data = pd.read_sql_query(""" SELECT * FROM Reviews""",conn)

In [None]:
print(temp_data.shape)
temp_data

- Storing the cleaned sentences in a pickle

In [None]:
# Write the cleaned frame and both word lists into a single pickle file.
# The three objects are dumped back-to-back, so reading them requires three
# consecutive pickle.load() calls on the same handle, in this order.
with open('../datafiles/pickles/affr_clnd_sentences','wb') as affr_clnd_sentences:
    pickle.dump(cus_data,affr_clnd_sentences)
    pickle.dump(all_positive_words,affr_clnd_sentences)
    pickle.dump(all_negative_words,affr_clnd_sentences)
    # No explicit close(): leaving the with-block closes the file.

- Retrieving the data

In [None]:
with open('../datafiles/pickles/affr_clnd_sentences','rb') as temp:
    temp_cnldsentences = pickle.load(temp)
    temp_pos =pickle.load(temp)
temp_pos[5:15]

- Checking the data

In [None]:
tempo = (temp_cnldsentences == cus_data)
tempo.sum()

- ## Bag of words

In [None]:
count_vect = CountVectorizer() # scikit-learn bag-of-words vectorizer with default settings
bow = count_vect.fit_transform(cus_data['Text'].values) # .values is not deprecated, but pandas recommends .to_numpy() (used by the n-gram cells)

In [15]:
count_vect_500 = CountVectorizer(max_features=500)
bow_500 = count_vect_500.fit_transform(cus_data['Text'].values)

In [None]:
print(bow.dtype,'\n',type(bow),'\n',bow.shape)

In [None]:
# Vocabulary of the bag-of-words model, ordered by column index.
# NOTE(review): get_feature_names() only exists in older scikit-learn releases;
# newer ones expose get_feature_names_out() instead.
count_vect_feat = count_vect.get_feature_names() # list of words in the BoW
# Look the column up instead of hard-coding it, so the check keeps working
# when the vocabulary (and therefore the column numbering) changes.
like_idx = count_vect_feat.index('like')
print(like_idx, count_vect_feat[like_idx])

- BOW pickle

In [None]:
with open('../datafiles/pickles/affr_bow','wb') as affr_bow:
    pickle.dump(bow,affr_bow)

- checking pickle

In [None]:
with open('../datafiles/pickles/affr_bow','rb') as affr_bow:
    temp = pickle.load(affr_bow)

In [None]:
temp.shape

In [16]:
with open('../datafiles/pickles/affr_bow_500','wb') as affr_bow_500:
    pickle.dump(bow_500,affr_bow_500)
with open('../datafiles/pickles/affr_bow_500','rb') as affr_bow_500:
    temp = pickle.load(affr_bow_500)

- ## n-grams

In [None]:
freq_dist_positive=nltk.FreqDist(all_positive_words)
freq_dist_negative=nltk.FreqDist(all_negative_words)
print("Most Common Positive Words : ",freq_dist_positive.most_common(20))
print("Most Common Negative Words : ",freq_dist_negative.most_common(20))

In [None]:
n_grm = CountVectorizer(ngram_range=(1,2) )
final_bigram_counts = n_grm.fit_transform(cus_data['Text'].to_numpy())

>> As you can see, we pass (1,2) as ngram_range, so we get both unigram and bigram features. This reduces the ambiguity seen in the cells above, where both the positive and the negative word lists contain words like 'taste' and 'like'. Such words give a misleading picture, because negative reviews may contain phrases like 'did not like' or 'not tasty', which unigrams cannot capture but bigrams can.
But it comes at the cost of dimensionality, i.e. from 115,281 to 2,910,192 features.

In [17]:
n_grm_500 = CountVectorizer(ngram_range=(1,2),max_features=500 )
final_bigram_counts_500 = n_grm_500.fit_transform(cus_data['Text'].to_numpy())

In [None]:
final_bigram_counts

- Pickling

In [None]:
with open('../datafiles/pickles/affr_bigram_bow','wb') as affr_bigram_bow:
    pickle.dump(final_bigram_counts,affr_bigram_bow)
with open('../datafiles/pickles/affr_bigram_bow','rb') as affr_bigram_bow:
    temp = pickle.load(affr_bigram_bow)

In [None]:
temp.shape

In [18]:
with open('../datafiles/pickles/affr_bigram_bow_500','wb') as affr_bigram_bow_500:
    pickle.dump(final_bigram_counts_500,affr_bigram_bow_500)
with open('../datafiles/pickles/affr_bigram_bow_500','rb') as affr_bigram_bow_500:
    temp = pickle.load(affr_bigram_bow_500)

- ## Tf-Idf

TfidfVectorizer is Equivalent to CountVectorizer followed by TfidfTransformer.

In [None]:
# tf_idf = TfidfVectorizer().fit_transform(cus_data['Text'])
tf_idf=TfidfVectorizer()
tf_idf_vec =  tf_idf.fit_transform(cus_data['Text'])

In [None]:
tf_idf

In [None]:
tf_idf2=TfidfVectorizer(ngram_range=(1,2))
tf_idf2_vec =  tf_idf2.fit_transform(cus_data['Text'])

In [None]:
pd.Series(tf_idf2.get_feature_names()).iloc[100000:100010]

In [None]:
features = tf_idf2.get_feature_names();
tf_idf2.get_feature_names(); #output is hidden

In [None]:
tf_idf2.get_feature_names()[100000:100010]

In [None]:
# covnert a row in saprsematrix to a numpy array
print(tf_idf2_vec[3,:].toarray()[0])
print("------------------------------------------------------")
print(tf_idf2_vec[3,:]) 
print("------------------------------------------------------")
print(tf_idf2)
print("------------------------------------------------------")
print(tf_idf2_vec)
print("------------------------------------------------------")
type(tf_idf2_vec)
print("------------------------------------------------------")
tf_idf2_vec.shape

In [None]:
# source: https://buhrmann.github.io/tfidf-analysis.html
def top_tfidf_feats(row, features, top_n=25):
    """Return the `top_n` largest tf-idf values of `row` with their feature names.

    Parameters
    ----------
    row : 1-D array of tf-idf values, one entry per feature.
    features : sequence of feature names, indexed like `row`.
    top_n : maximum number of entries to return (default 25).

    Returns
    -------
    pandas.DataFrame with columns 'feature' and 'tfidf', ordered from the
    highest tf-idf value to the lowest. The frame is empty (but still has
    both columns) when there is nothing to report.
    """
    # argsort yields indices in ascending order of value; [::-1] reverses
    # them to descending order and [:top_n] keeps only the best top_n.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Passing the column names to the constructor also works for an empty list.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

top_tfidf = top_tfidf_feats(tf_idf2_vec[1,:].toarray()[0],features,25)
top_tfidf

- Pickling

In [None]:
# Persist the unigram tf-idf matrix and read it back as a sanity check.
# The matrix (tf_idf_vec) is stored, not the fitted vectorizer (tf_idf): the
# vectorizer has no .shape, and the n-gram cell stores its matrix as well.
with open('../datafiles/pickles/affr_unigram_tfidf','wb') as affr_unigram_tfidf:
    pickle.dump(tf_idf_vec,affr_unigram_tfidf)
with open('../datafiles/pickles/affr_unigram_tfidf','rb') as affr_unigram_tfidf:
    temp = pickle.load(affr_unigram_tfidf)
temp.shape

In [None]:
with open('../datafiles/pickles/affr_ngram_tfidf','wb') as affr_ngram_tfidf:
    pickle.dump(tf_idf2_vec,affr_ngram_tfidf)
with open('../datafiles/pickles/affr_ngram_tfidf','rb') as affr_ngram_tfidf:
    temp = pickle.load(affr_ngram_tfidf)
temp.shape

In [19]:
tf_idf2_500=TfidfVectorizer(ngram_range=(1,2),max_features=500)
tf_idf2_vec_500 =  tf_idf2_500.fit_transform(cus_data['Text'])

In [20]:
with open('../datafiles/pickles/affr_ngram_tfidf_500','wb') as affr_ngram_tfidf_500:
    pickle.dump(tf_idf2_vec_500,affr_ngram_tfidf_500)
with open('../datafiles/pickles/affr_ngram_tfidf_500','rb') as affr_ngram_tfidf_500:
    temp = pickle.load(affr_ngram_tfidf_500)

- # word2Vec 

from gensim.models import Word2Vec

from gensim.models import KeyedVectors

import pickle

- ### in this project we are using a pretrained model by google

- ### its 3.3G file, once you load this into your memory 

- ### it occupies ~9Gb, so please do this step only if you have >12G of ram

- ### we will provide a pickle file which contains a dict,

- ### and it contains all our corpus words as keys and model[word] as values

- ### To use this code-snippet, download "GoogleNews-vectors-negative300.bin" 

- ### from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

- ### it's 1.9GB in size.

model = KeyedVectors.load_word2vec_format('../datafiles/GoogleNews-vectors-negative300.bin', binary=True)

model.wv['computer']

model.wv.similarity('woman', 'man')

model.wv.most_similar('woman')

model.wv.most_similar('tasti')  # "tasti" is the stemmed word for tasty, tastful

model.wv.most_similar('tasty') 

model.wv.similarity('tasty', 'tast')

In [None]:
#if you do NOT have RAM >= 12GB, use the code below.
# But we dont have this pickle, when you will get it, then we will be able to run th model.

# with open('word2vec_model', 'rb') as handle:
#     model = pickle.load(handle)

### These are word list of sentences without stop words or stemming

In [None]:
# Train your own Word2Vec model using your own text corpus

# One list of lowercase, purely alphabetic tokens per review. HTML tags and
# punctuation are stripped first; stop-words are kept and nothing is stemmed.
list_of_sent = [
    [
        token.lower()
        for raw_word in cln_html(review).split()
        for token in cln_punc(raw_word).split()
        if token.isalpha()
    ]
    for review in cus_data['Text'].values
]

In [None]:
len(list_of_sent)

In [None]:
with open('../datafiles/pickles/unstoppedunstemmed_word_list_sent','wb') as unstoppedunstemmed_word_list_sent:
    pickle.dump(list_of_sent,unstoppedunstemmed_word_list_sent)
with open('../datafiles/pickles/unstoppedunstemmed_word_list_sent','rb') as unstoppedunstemmed_word_list_sent:
    temp = pickle.load(unstoppedunstemmed_word_list_sent)
len(temp)

In [None]:
print(cus_data['Text'].values[0])
print("*****************************************************************")
print(list_of_sent[0])

## Training our W2v model with list_of_sent

In [None]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=4)

- Saving the trained model

In [None]:
from gensim.test.utils import get_tmpfile  # no longer used in this cell; kept in case other cells rely on the name
# Save straight to the project path. The earlier `path = get_tmpfile(...)`
# result was never passed to save(), so that call has been dropped.
w2v_model.save("../datafiles/pickles/w2v_model.model")

In [None]:
# Reload the model that was saved to disk. The comparison cells below read
# `temp_w2v_model`, so it has to be defined here or they raise NameError.
temp_w2v_model = gensim.models.Word2Vec.load("../datafiles/pickles/w2v_model.model")
# On a fresh kernel where the training cell was skipped, use the reloaded
# model as `w2v_model`; otherwise keep the in-memory model so the comparison
# below is between the trained model and its saved copy.
if 'w2v_model' not in globals():
    w2v_model = temp_w2v_model

- Checking/Comparing the models

In [None]:
words = list(temp_w2v_model.wv.vocab)
print(len(words))
words = list(w2v_model.wv.vocab)
print(len(words))

OBSERVATION : We have 33783 unique words

In [None]:
w2v_model.wv.most_similar('tasty')

In [None]:
temp_w2v_model.wv.most_similar('tasty')

- The code below will not work, as the amount of data is too large for pickle to handle

In [None]:
# with open('../datafiles/pickles/clnd_sentences_w2v','wb') as clnd_sentences_for_w2v:
#     pickle.dump(list_of_sent[:200000],clnd_sentences_for_w2v)
# with open('../datafiles/pickles/clnd_sentences_w2v','rb') as clnd_sentences_for_w2v:
#     temp = pickle.load(clnd_sentences_for_w2v)
# len(temp)

- So instead of pickle we use joblib (imported below as a standalone package)

In [None]:
import joblib
joblib.dump(list_of_sent,"../datafiles/pickles/clnd_sentences_for_w2v.joblib") 

In [None]:
 temp = joblib.load('../datafiles/pickles/clnd_sentences_for_w2v.joblib') 
 len(temp)

//////////////////////////////  some more word surfing //////////////////////////////////

In [None]:
w2v_model.wv.most_similar('like')

In [None]:
print(w2v_model.wv['like'].shape)
w2v_model.wv['like']

In [None]:
w2v_model.wv['sahil']

In [None]:
list_of_sent[0]

- ## Avg W2V

In [None]:
# average Word2Vec
# compute average word2vec for each review.
# Average Word2Vec: each review is represented by the mean of the vectors of
# its words that are present in the trained model's vocabulary.
i = 0  # running total of in-vocabulary words seen across all reviews
vec_dim = w2v_model.wv.vector_size  # read from the model instead of hard-coding 50
sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent:  # for each review/sentence
    sent_vec = np.zeros(vec_dim)  # accumulator, same length as a word vector
    cnt_words = 0  # num of words with a valid vector in the sentence/review
    for word in sent:  # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            # word is not in the model vocabulary (e.g. rarer than min_count)
            continue
        sent_vec += vec
        cnt_words += 1
    # A review without any in-vocabulary word keeps the all-zero vector
    # instead of becoming NaN through a 0/0 division.
    if cnt_words:
        sent_vec /= cnt_words
    i += cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

In [None]:
i

In [None]:
sent_vectors[:5]

- Storing in pickle

In [None]:
with open('../datafiles/pickles/avg_w2v','wb') as avg_w2v:
    pickle.dump(sent_vectors,avg_w2v)
with open('../datafiles/pickles/avg_w2v','rb') as avg_w2v:
    temp = pickle.load(avg_w2v)

In [None]:
len(temp)

- ## TfIdf weighted W2V

In [None]:
# if u dont want to execute the list_of_sent generating function
with open('../datafiles/pickles/unstoppedunstemmed_word_list_sent','rb') as unstoppedunstemmed_word_list_sent:
    list_of_sent = pickle.load(unstoppedunstemmed_word_list_sent)
# and dont want to train the w2v model again
with open('../datafiles/pickles/avg_w2v','rb') as avg_w2v:
    w2v_model = gensim.models.Word2Vec.load("../datafiles/pickles/w2v_model.model")

In [None]:
# TF-IDF weighted Word2Vec
# Each review vector is the tf-idf weighted mean of its word vectors:
#   sum(tfidf(word) * w2v(word)) / sum(tfidf(word))
# tf_idf_vec is the sparse matrix with row = review, col = word, value = tf-idf.

# vocabulary_ maps a term to its column in tf_idf_vec; a dict lookup replaces
# the former list.index() call, which scanned the whole vocabulary per token.
tfidf_vocab = tf_idf.vocabulary_
vec_dim = w2v_model.wv.vector_size  # read from the model instead of hard-coding 50

tfidf_sent_vectors = []  # the tfidf-w2v for each sentence/review is stored in this list
# Only the first 1000 reviews are processed here; `row` indexes the matching
# row of tf_idf_vec.
for row, sent in enumerate(tqdm(list_of_sent[0:1000])):  # for each review/sentence
    sent_vec = np.zeros(vec_dim)  # accumulator, same length as a word vector
    weight_sum = 0  # sum of the tf-idf weights of the words that were used
    for word in sent:  # for each word in a review/sentence
        col = tfidf_vocab.get(word)
        if col is None:
            # word is not part of the tf-idf vocabulary
            continue
        try:
            vec = w2v_model.wv[word]
        except KeyError:
            # word is not part of the word2vec vocabulary
            continue
        # tf-idf weight of this word in this review
        tfidf = tf_idf_vec[row, col]
        sent_vec += (vec * tfidf)
        weight_sum += tfidf
    # A review with no usable word keeps the all-zero vector instead of
    # becoming NaN through a 0/0 division.
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)

In [None]:
with open('../datafiles/pickles/tdifd_weighted_w2v','wb') as tdifd_weighted_w2v:
    pickle.dump(tfidf_sent_vectors,tdifd_weighted_w2v)
with open('../datafiles/pickles/tdifd_weighted_w2v','rb') as tdifd_weighted_w2v:
    temp = pickle.load(tdifd_weighted_w2v)

In [None]:
len(temp),temp[2]