In [1]:
#essentials
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
con = sqlite3.connect('database.sqlite')
raw = pd.read_sql_query("select ProductId,ProfileName, HelpfulnessNumerator, HelpfulnessDenominator, Time, Text, case when Score >= 4 then 1 else 0 end Sentiment from Reviews", con)
con.close()

In [3]:
raw

Unnamed: 0,ProductId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Time,Text,Sentiment
0,B001E4KFG0,delmartian,1,1,1303862400,I have bought several of the Vitality canned d...,1
1,B00813GRG4,dll pa,0,0,1346976000,Product arrived labeled as Jumbo Salted Peanut...,0
2,B000LQOCH0,"Natalia Corres ""Natalia Corres""",1,1,1219017600,This is a confection that has been around a fe...,1
3,B000UA0QIQ,Karl,3,3,1307923200,If you are looking for the secret ingredient i...,0
4,B006K2ZZ7K,"Michael D. Bigham ""M. Wassir""",0,0,1350777600,Great taffy at a great price. There was a wid...,1
...,...,...,...,...,...,...,...
568449,B001EO7N10,Lettie D. Carter,0,0,1299628800,Great for sesame chicken..this is a good if no...,1
568450,B003S1WTCU,R. Sawyer,0,0,1331251200,I'm disappointed with the flavor. The chocolat...,0
568451,B004I613EE,"pksd ""pk_007""",2,2,1329782400,"These stars are small, so you can give 10-15 o...",1
568452,B004I613EE,"Kathy A. Welch ""katwel""",1,1,1331596800,These are the BEST treats for training and rew...,1


# Data Preprocessing

We understand from the previous EDA that we need to handle clean ProfileName and add 3 new variables(user review frequency, product review frequency, spam frequency). This all will be done within the preprocessing step.

We first split the data into 3 parts before preprocessing them. We split them into train data, test data, and validation data.

In [4]:
from sklearn.model_selection import train_test_split
xtrain, xsplit, ytrain, ysplit = train_test_split(raw.drop('Sentiment',axis = 1),
                                                  raw.Sentiment, 
                                                  test_size = 0.3, 
                                                  random_state = 42)
xtest, xval, ytest, yval = train_test_split(xsplit,ysplit,
                                            test_size = 0.5,
                                            random_state = 42)

In [5]:
from gensim.models.word2vec import Word2Vec
#install python-Levenshtein too if you haven't already did

#===========================================================================#
#                         TOKENIZATION FUNCTION                             #
#===========================================================================#

def tokenize(txt):
    import re
    from wordcloud import STOPWORDS
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.stem.wordnet import WordNetLemmatizer
    stpwrd = set(STOPWORDS)
    #we then add frequent irrelevant words discovered before 
    stpwrd.update(['br','href','amazon','product','one','find','taste','flavor','good','buy','make','coffee'])
    removeapos = txt.lower().replace("'",'') #remove any apostrophe
    text = re.sub(r"[^a-zA-Z0-9\s]"," ", removeapos) #sub with space for any weird char
    words = word_tokenize(text)
    lemma = [WordNetLemmatizer().lemmatize(w) for w in words if w not in stpwrd]
    return lemma



#===========================================================================#
#                         PREPROCESS AND ENGINEER                           #
#===========================================================================#

def preprocess_engineer(data,train = True):
    #this is to handle missing values before using the data
    data['ProfileName'] = data['ProfileName'].apply(lambda x: 'Anonymous' if 
                                                (x == 'nan')|(x == 'NaN')|
                                                (x == 'N/A')|(x == '0')|
                                                (x == '')|(x == '-1')|
                                                (x == 'null')|(x == 'Null')|
                                                (x == 'NA')|(x == 'na')|
                                                (x == 'none')|(x == 'unknown') else x)
    #this is to simulate that after data train, we face a number of new reviews
    if train == True:
        countprof = data.ProfileName.value_counts().reset_index()
        countprof.columns = ['ProfileName','ProfReviewCount']
        
        countprod = data.ProductId.value_counts().reset_index()
        countprod.columns = ['ProductId','ProdReviewCount']
        
        countspam = data.groupby(['ProductId','ProfileName']).size().reset_index()
        countspam.columns = ['ProductId','ProfileName','SpamReviewCount']
        
        engineered = pd.merge(
                        pd.merge(
                            pd.merge(data,countprof,how='left',on='ProfileName'),
                            countprod,how = 'left', on = 'ProductId'),
                        countspam, how = 'left',on = ['ProductId','ProfileName'])
        return engineered      
    if train == False:
        #this is from train
        countproftrain = xtrain.ProfileName.value_counts().reset_index()
        countproftrain.columns = ['ProfileName','ProfReviewCounttrain']
        
        countprodtrain = xtrain.ProductId.value_counts().reset_index()
        countprodtrain.columns = ['ProductId','ProdReviewCounttrain']
        
        countspamtrain = xtrain.groupby(['ProductId','ProfileName']).size().reset_index()
        countspamtrain.columns = ['ProductId','ProfileName','SpamReviewCounttrain']
        
        
        
        #this is from the newly introduced data
        countprof = data.ProfileName.value_counts().reset_index()
        countprof.columns = ['ProfileName','ProfReviewCountadd']
        
        countprod = data.ProductId.value_counts().reset_index()
        countprod.columns = ['ProductId','ProdReviewCountadd']
        
        countspam = data.groupby(['ProductId','ProfileName']).size().reset_index()
        countspam.columns = ['ProductId','ProfileName','SpamReviewCountadd']
        
        
        
        #this is to add newly introduced data with train
        #ProfileName
        countproffinal = countprof.merge(countproftrain,how = 'left',on = 'ProfileName')
        countproffinal.fillna(0,inplace = True)
        countproffinal['ProfReviewCount'] = countproffinal.ProfReviewCounttrain + countproffinal.ProfReviewCountadd
        countproffinal.drop(['ProfReviewCounttrain','ProfReviewCountadd'],axis = 1,inplace = True)
    
        #ProductId
        countprodfinal = countprod.merge(countprodtrain,how = 'left',on = 'ProductId')
        countprodfinal.fillna(0,inplace = True)
        countprodfinal['ProdReviewCount'] = countprodfinal.ProdReviewCounttrain + countprodfinal.ProdReviewCountadd
        countprodfinal.drop(['ProdReviewCounttrain','ProdReviewCountadd'],axis = 1,inplace = True)
        
        #SpamReviewCount
        countspamfinal = countspam.merge(countspamtrain,how = 'left',on = ['ProductId','ProfileName'])
        countspamfinal.fillna(0,inplace = True)
        countspamfinal['SpamReviewCount'] = countspamfinal.SpamReviewCounttrain + countspamfinal.SpamReviewCountadd
        countspamfinal.drop(['SpamReviewCounttrain','SpamReviewCountadd'],axis = 1,inplace = True)
        
        engineered = pd.merge(
                        pd.merge(
                            pd.merge(data,countproffinal,how='left',on='ProfileName'),
                            countprodfinal,how = 'left', on = 'ProductId'),
                        countspamfinal, how = 'left',on = ['ProductId','ProfileName'])
        return engineered  
    

In [6]:
xtrainpreprocessed = preprocess_engineer(xtrain,train = True)
xtestpreprocessed = preprocess_engineer(xtest,train = False)
xvalpreprocessed = preprocess_engineer(xval,train = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ProfileName'] = data['ProfileName'].apply(lambda x: 'Anonymous' if


In [7]:
#function to average
def AverageVectors(wordvectmod,oritokenizedtxt):
    todict = dict(zip(wordvectmod.wv.index_to_key,wordvectmod.wv.vectors))
    #if a text is empty we should return a vector of zeros
    #with the same dimensionality as all the other vectors
    dim = len(next(iter(todict.values())))
    return np.array([np.mean([todict[w] for w in words if w in todict]
                              or 
                             [np.zeros(dim)],axis = 0)
                     for words in oritokenizedtxt
                    ])

w2v = Word2Vec(xtrainpreprocessed.Text.apply(tokenize),
                 sg = 0,
                 hs = 1,
                 seed = 42,
                 vector_size = 128)
#convert to vect average
xtrainvect = AverageVectors(w2v,xtrainpreprocessed.Text.apply(tokenize))
xtestvect = AverageVectors(w2v,xtestpreprocessed.Text.apply(tokenize))
xvalvect = AverageVectors(w2v,xvalpreprocessed.Text.apply(tokenize))


In [8]:
#add the remaining features
def vecttodata(aftervec,beforevec):
    df = pd.DataFrame(aftervec)
    df['HelpfulnessNumerator'] = beforevec.reset_index(drop=True)['HelpfulnessNumerator']
    df['HelpfulnessDenominator'] = beforevec.reset_index(drop=True)['HelpfulnessNumerator']
    df['Time'] = beforevec.reset_index(drop=True)['Time']
    df['ProfReviewCount'] = beforevec.reset_index(drop=True)['ProfReviewCount']
    df['ProdReviewCount'] = beforevec.reset_index(drop=True)['ProdReviewCount']
    df['SpamReviewCount'] = beforevec.reset_index(drop=True)['SpamReviewCount'] 
    
    return df
    
xtraindf = vecttodata(xtrainvect,xtrainpreprocessed)
xtestdf = vecttodata(xtestvect,xtestpreprocessed)
xvaldf = vecttodata(xvalvect,xvalpreprocessed)


In [9]:
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight


sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=ytrain
)

xgb = XGBClassifier(booster = 'gbtree',
                    objective = 'binary:logistic',
                    eval_metric='auc',
                    seed = 42, use_label_encoder = False,
                    num_parallel_tree = 10,
                    n_estimators = 50,
                    verbosity = 2)
xgb.fit(xtraindf,ytrain,sample_weight = sample_weights)
pred = xgb.predict(xtestdf)
from sklearn.metrics import confusion_matrix,classification_report
print(classification_report(pred,ytest, target_names = ['Negative','Positive']))

[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:18:20] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 

[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:26:31] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 

[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:34:26] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 

[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=6
[15:42:19] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 124 extra nodes, 0 

[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:50:34] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 

[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 pruned nodes, max_depth=6
[15:58:44] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 126 extra nodes, 0 

[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=6
[16:07:21] INFO: ..\src\tree\updater_prune.cc:101: tree pruning end, 122 extra nodes, 0 

In [10]:
#now to test against purely another unseen data

finalpred = xgb.predict(xvaldf)
print(classification_report(finalpred,yval, target_names = ['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.83      0.59      0.69     26183
    Positive       0.84      0.95      0.89     59086

    accuracy                           0.84     85269
   macro avg       0.84      0.77      0.79     85269
weighted avg       0.84      0.84      0.83     85269



In [11]:
ytrain.value_counts()/len(ytrain)

1    0.779982
0    0.220018
Name: Sentiment, dtype: float64

In [12]:
w2v.wv.most_similar(positive = ['great'])

[('fantastic', 0.7873159646987915),
 ('awesome', 0.7808070182800293),
 ('excellent', 0.7499616742134094),
 ('terrific', 0.7298294901847839),
 ('wonderful', 0.7185574769973755),
 ('perfect', 0.6669865846633911),
 ('nice', 0.666668176651001),
 ('fabulous', 0.6406859755516052),
 ('amazing', 0.6298671364784241),
 ('outstanding', 0.5465767979621887)]

In [13]:
w2v.wv.similarity('great','awesome')

0.780807

In [15]:
#cosine similarity manual
v1 = w2v.wv['great']
v2 = w2v.wv['awesome']
np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

0.7808069

In [16]:
np.corrcoef(v1,v2)

array([[1.        , 0.77698464],
       [0.77698464, 1.        ]])

In [17]:
coba = xtrainpreprocessed.Text.apply(tokenize)

In [18]:
coba

0         [found, store, discontinued, great, alternativ...
1         [tried, 2, 3, alternative, product, senseo, ma...
2         [let, way, regularly, pop, whole, habanero, pe...
3         [absolutely, love, bbq, popchips, favorite, sa...
4         [bar, 5, box, time, feed, 1, 3, year, old, nig...
                                ...                        
397912    [gourmet, one, fair, havent, tried, plain, rec...
397913    [bought, kid, eating, really, unlike, regular,...
397914    [eight, oclock, make, great, balance, decaf, r...
397915    [already, liked, regular, stash, earl, grey, d...
397916    [5, 7, dog, given, time, sometimes, feeding, m...
Name: Text, Length: 397917, dtype: object

In [19]:
xtrainvect.shape

(397917, 128)

In [24]:
w2v.wv.index_to_key

['great',
 'love',
 'food',
 'tea',
 'will',
 'dog',
 'really',
 'time',
 'dont',
 'much',
 'cup',
 'bag',
 'use',
 'little',
 'price',
 'best',
 'tried',
 'im',
 'well',
 'even',
 'ive',
 'better',
 'try',
 'now',
 'day',
 'eat',
 'store',
 'treat',
 'box',
 'cat',
 'chocolate',
 'water',
 'first',
 'drink',
 '2',
 'year',
 'sugar',
 'brand',
 'go',
 'used',
 'sweet',
 'found',
 'way',
 'made',
 'think',
 'give',
 'free',
 'bought',
 '1',
 'thing',
 'two',
 '3',
 'bit',
 'say',
 'order',
 'know',
 'taste',
 'still',
 'favorite',
 'ingredient',
 'mix',
 'lot',
 'chip',
 'snack',
 'got',
 '5',
 'recommend',
 'want',
 'work',
 'bar',
 'many',
 'pack',
 'nice',
 'delicious',
 'add',
 'flavor',
 'cant',
 'never',
 'need',
 'keep',
 'every',
 'something',
 'hot',
 'come',
 'stuff',
 '4',
 'review',
 'didnt',
 'organic',
 'without',
 'always',
 'quality',
 'make',
 'right',
 'healthy',
 'le',
 'milk',
 'take',
 'doesnt',
 'different',
 'enough',
 'back',
 'package',
 'small',
 'old',
 'oil',

In [None]:
testnih = "I'm gonna go to the mall. You're coming?"
testnih.replace("'",'')