In [1]:
#essentials
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
con = sqlite3.connect('database.sqlite')
raw = pd.read_sql_query("select ProductId,ProfileName, HelpfulnessNumerator, HelpfulnessDenominator, Time, Text, case Score when 5 then 1 when 4 then 0 else -1 end Sentiment from Reviews", con)
con.close()

In [3]:
raw

Unnamed: 0,ProductId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Time,Text,Sentiment
0,B001E4KFG0,delmartian,1,1,1303862400,I have bought several of the Vitality canned d...,1
1,B00813GRG4,dll pa,0,0,1346976000,Product arrived labeled as Jumbo Salted Peanut...,-1
2,B000LQOCH0,"Natalia Corres ""Natalia Corres""",1,1,1219017600,This is a confection that has been around a fe...,0
3,B000UA0QIQ,Karl,3,3,1307923200,If you are looking for the secret ingredient i...,-1
4,B006K2ZZ7K,"Michael D. Bigham ""M. Wassir""",0,0,1350777600,Great taffy at a great price. There was a wid...,1
...,...,...,...,...,...,...,...
568449,B001EO7N10,Lettie D. Carter,0,0,1299628800,Great for sesame chicken..this is a good if no...,1
568450,B003S1WTCU,R. Sawyer,0,0,1331251200,I'm disappointed with the flavor. The chocolat...,-1
568451,B004I613EE,"pksd ""pk_007""",2,2,1329782400,"These stars are small, so you can give 10-15 o...",1
568452,B004I613EE,"Kathy A. Welch ""katwel""",1,1,1331596800,These are the BEST treats for training and rew...,1


# Data Preprocessing

We understand from the previous EDA that we need to handle clean ProfileName and add 3 new variables(user review frequency, product review frequency, spam frequency). This all will be done within the preprocessing step.

We first split the data into 3 parts before preprocessing them. We split them into train data, test data, and validation data.

In [4]:
from sklearn.model_selection import train_test_split
xtrain, xsplit, ytrain, ysplit = train_test_split(raw.drop('Sentiment',axis = 1),
                                                  raw.Sentiment, 
                                                  test_size = 0.3, 
                                                  random_state = 42)
xtest, xval, ytest, yval = train_test_split(xsplit,ysplit,
                                            test_size = 0.5,
                                            random_state = 42)

In [5]:
from gensim.models.word2vec import Word2Vec
#install python-Levenshtein too if you haven't already did

#===========================================================================#
#                         TOKENIZATION FUNCTION                             #
#===========================================================================#

def tokenize(txt):
    import re
    from wordcloud import STOPWORDS
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.stem.wordnet import WordNetLemmatizer
    stpwrd = set(STOPWORDS)
    #we then add frequent irrelevant words discovered before 
    stpwrd.update(['br','href','good', 'amazon', 'one', 'taste', 'make', 'flavor','product'])
    text = re.sub(r"[^a-zA-Z0-9'\s]"," ", txt.lower())
    words = word_tokenize(text)
    lemma = [WordNetLemmatizer().lemmatize(w) for w in words if w not in stpwrd]
    return lemma



#===========================================================================#
#                         PREPROCESS AND ENGINEER                           #
#===========================================================================#

def preprocess_engineer(data,train = True):
    #this is to handle missing values before using the data
    data['ProfileName'] = data['ProfileName'].apply(lambda x: 'Anonymous' if 
                                                (x == 'nan')|(x == 'NaN')|
                                                (x == 'N/A')|(x == '0')|
                                                (x == '')|(x == '-1')|
                                                (x == 'null')|(x == 'Null')|
                                                (x == 'NA')|(x == 'na')|
                                                (x == 'none')|(x == 'unknown') else x)
    #this is to simulate that after data train, we face a number of new reviews
    if train == True:
        countprof = data.ProfileName.value_counts().reset_index()
        countprof.columns = ['ProfileName','ProfReviewCount']
        
        countprod = data.ProductId.value_counts().reset_index()
        countprod.columns = ['ProductId','ProdReviewCount']
        
        countspam = data.groupby(['ProductId','ProfileName']).size().reset_index()
        countspam.columns = ['ProductId','ProfileName','SpamReviewCount']
        
        engineered = pd.merge(
                        pd.merge(
                            pd.merge(data,countprof,how='left',on='ProfileName'),
                            countprod,how = 'left', on = 'ProductId'),
                        countspam, how = 'left',on = ['ProductId','ProfileName'])
        return engineered      
    if train == False:
        #this is from train
        countproftrain = xtrain.ProfileName.value_counts().reset_index()
        countproftrain.columns = ['ProfileName','ProfReviewCounttrain']
        
        countprodtrain = xtrain.ProductId.value_counts().reset_index()
        countprodtrain.columns = ['ProductId','ProdReviewCounttrain']
        
        countspamtrain = xtrain.groupby(['ProductId','ProfileName']).size().reset_index()
        countspamtrain.columns = ['ProductId','ProfileName','SpamReviewCounttrain']
        
        
        
        #this is from the newly introduced data
        countprof = data.ProfileName.value_counts().reset_index()
        countprof.columns = ['ProfileName','ProfReviewCountadd']
        
        countprod = data.ProductId.value_counts().reset_index()
        countprod.columns = ['ProductId','ProdReviewCountadd']
        
        countspam = data.groupby(['ProductId','ProfileName']).size().reset_index()
        countspam.columns = ['ProductId','ProfileName','SpamReviewCountadd']
        
        
        
        #this is to add newly introduced data with train
        #ProfileName
        countproffinal = countprof.merge(countproftrain,how = 'left',on = 'ProfileName')
        countproffinal.fillna(0,inplace = True)
        countproffinal['ProfReviewCount'] = countproffinal.ProfReviewCounttrain + countproffinal.ProfReviewCountadd
        countproffinal.drop(['ProfReviewCounttrain','ProfReviewCountadd'],axis = 1,inplace = True)
    
        #ProductId
        countprodfinal = countprod.merge(countprodtrain,how = 'left',on = 'ProductId')
        countprodfinal.fillna(0,inplace = True)
        countprodfinal['ProdReviewCount'] = countprodfinal.ProdReviewCounttrain + countprodfinal.ProdReviewCountadd
        countprodfinal.drop(['ProdReviewCounttrain','ProdReviewCountadd'],axis = 1,inplace = True)
        
        #SpamReviewCount
        countspamfinal = countspam.merge(countspamtrain,how = 'left',on = ['ProductId','ProfileName'])
        countspamfinal.fillna(0,inplace = True)
        countspamfinal['SpamReviewCount'] = countspamfinal.SpamReviewCounttrain + countspamfinal.SpamReviewCountadd
        countspamfinal.drop(['SpamReviewCounttrain','SpamReviewCountadd'],axis = 1,inplace = True)
        
        engineered = pd.merge(
                        pd.merge(
                            pd.merge(data,countproffinal,how='left',on='ProfileName'),
                            countprodfinal,how = 'left', on = 'ProductId'),
                        countspamfinal, how = 'left',on = ['ProductId','ProfileName'])
        return engineered  
    

In [6]:
xtrainpreprocessed = preprocess_engineer(xtrain,train = True)
xtestpreprocessed = preprocess_engineer(xtest,train = False)
xvalpreprocessed = preprocess_engineer(xval,train = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['ProfileName'] = data['ProfileName'].apply(lambda x: 'Anonymous' if


In [7]:
#creating a custom Word2Vec model
class CustomW2V(object):
    def __init__(self,word2vec):
        self.word2vec = word2vec
        #if a text is empty we should return a vector of zeros
        #with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    def fit(self,x):
        return self
    def transform(self,x):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                   or [np.zeros(self.dim)],axis = 0)
            for words in x
        ])

model = Word2Vec(xtrainpreprocessed.Text.apply(tokenize),
                 sg = 0,
                 hs = 1,
                 seed = 42,
                 vector_size = 128)
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))
modelw = CustomW2V(w2v)

#convert to text 
xtrainvect = modelw.transform(xtrainpreprocessed.Text.apply(tokenize))
xtestvect = modelw.transform(xtestpreprocessed.Text.apply(tokenize))
xvalvect = modelw.transform(xvalpreprocessed.Text.apply(tokenize))


In [8]:
#add the remaining features
def vecttodata(aftervec,beforevec):
    df = pd.DataFrame(aftervec)
    df['HelpfulnessNumerator'] = beforevec.reset_index(drop=True)['HelpfulnessNumerator']
    df['HelpfulnessDenominator'] = beforevec.reset_index(drop=True)['HelpfulnessNumerator']
    df['Time'] = beforevec.reset_index(drop=True)['Time']
    df['ProfReviewCount'] = beforevec.reset_index(drop=True)['ProfReviewCount']
    df['ProdReviewCount'] = beforevec.reset_index(drop=True)['ProdReviewCount']
    df['SpamReviewCount'] = beforevec.reset_index(drop=True)['SpamReviewCount'] 
    
    return df
    
xtraindf = vecttodata(xtrainvect,xtrainpreprocessed)
xtestdf = vecttodata(xtestvect,xtestpreprocessed)
xvaldf = vecttodata(xvalvect,xvalpreprocessed)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_sample_weight

scaler = StandardScaler()

sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=ytrain
)

logreg = LogisticRegression(random_state = 42)
logreg.fit(scaler.fit_transform(xtraindf),ytrain,sample_weight = sample_weights)
pred = logreg.predict(scaler.transform(xtestdf))
from sklearn.metrics import confusion_matrix,classification_report
print(classification_report(pred,ytest, target_names = ['Negative','Neutral','Positive']))

In [10]:
#now to test against purely another unseen data

finalpred = logreg.predict(scaler.transform(xvaldf))
print(classification_report(finalpred,yval, target_names = ['Negative','Neutral','Positive']))

              precision    recall  f1-score   support

    Negative       0.63      0.67      0.65     17535
     Neutral       0.84      0.17      0.28     60099
    Positive       0.13      0.95      0.23      7635

    accuracy                           0.34     85269
   macro avg       0.53      0.60      0.39     85269
weighted avg       0.73      0.34      0.35     85269



In [11]:
ytrain.value_counts()/len(ytrain)

 1    0.638322
-1    0.220018
 0    0.141660
Name: Sentiment, dtype: float64

In [12]:
model.wv.most_similar(positive = ['good'])

[('titanium', 0.4521552324295044),
 ('silicon', 0.4399755299091339),
 ('tnr', 0.4061760902404785),
 ('clasico', 0.36855122447013855),
 ('cornmeal', 0.36003872752189636),
 ('vine', 0.3571547567844391),
 ('btwn', 0.35447293519973755),
 ('sulfur', 0.35143882036209106),
 ('sulphur', 0.35117125511169434),
 ('bread', 0.3475218713283539)]

In [13]:
model.wv.similarity('great','awesome')

0.7885308

In [14]:
#cosine similarity manual
v1 = model.wv['great']
v2 = model.wv['awesome']
np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

0.7885309

In [15]:
np.corrcoef(v1,v2)

array([[1.        , 0.78950328],
       [0.78950328, 1.        ]])