In [5]:
import pickle
import re
import spacy
from spacy import displacy
import pandas as pd
import numpy as np
import pickle
import time
nlp = spacy.load("en_core_web_sm")

#=====================================================#
# Function for time
#=====================================================#
def convert(seconds): 
    seconds = seconds % (24 * 3600) 
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
      
    return "%d:%02d:%02d" % (hour, minutes, seconds) 

"""

#=====================================================#
# Extract details from csv file
#=====================================================#
data = pd.read_csv('reviews.csv',index_col=[0])
data = data.fillna('-') # fill none with '-'
data.replace({'\n': ''}, inplace=True, regex=True) # remove break line

"""

#=====================================================#
# Extract details from pkl file
#=====================================================#
data = pd.read_pickle("restaurant_reviews.pkl")
data = data.fillna('-') # fill none with '-'
data.replace({'\n': ''}, inplace=True, regex=True) # remove break line

## Create column for food names per review

In [11]:
#=====================================================#
# Extract food names from text file
#=====================================================#
start = time.time()

with open('food_list_V1.txt',encoding="utf8") as f:
    food_list = [x.strip() for x in f.readlines()]

food_tag_list = []
for i in range(data.shape[0]):
    rev = data['Body'].iloc[i]
    #rev = rev.replace('and ', '').replace('& ', '')
    doc = nlp(rev)
    tag = []  
    food_tag = []
    
    # For each word in the review
    for food in food_list:
        if food in doc.text:
            food_tag.append(food)
    food_tag_list.append(','.join(food_tag))

#=====================================================#
# Store food name into dataframe
#=====================================================#
data['Food'] = food_tag_list

print('Time taken:', convert(time.time()-start))

Time taken: 1:03:11


In [12]:
data

Unnamed: 0,Title,Body,Reviewer,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Food
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,3,9,10m ago,/babasan-by-uncle-kiisu,"beer,fruit"
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,3,10,5d ago,/babasan-by-uncle-kiisu,rice
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,8,958,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,"crispy pork,mee siam,pork,pork belly"
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,7,208,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,"fish,fish roe,rice,roe,seafood,unagi"
0,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,3,15,Oct 25 at 10:13pm,/yobo,"brown rice,cream,pumpkin,rice,salad"
...,...,...,...,...,...,...,...,...
7,Yummy meat galore shabu shabu buffet for the s...,Yummy meat galore shabu shabu buffet for the s...,Alison Ong,7,229,"Dec 16, 2013",/sukiya,"earl,meat,shabu shabu"
8,Our #steamboat #buffet #dinner before watching...,Our #steamboat #buffet #dinner before watching...,Benjamin Yeo,6,170,"Jun 7, 2013",/sukiya,"steamboat,tea"
9,My favorite whenever im here for steamboat! Lo...,,June Lee,8,989,"Aug 29, 2012",/sukiya,
10,Sukiyaki and Kimchi Based Hotpot Buffet,,Jasmine Low,6,187,"Aug 28, 2012",/sukiya,


## Create columns for restaurant and food reviews by splitting full reviews 

In [14]:
#=====================================================#
# Extract restaurant and food reviews
#=====================================================#
start = time.time()

## Function set custom boundaries to break into phases or sentences
def set_custom_boundaries(doc):
    link_words = ['but','however','unlike', 'even though', 'despite', 'in spite of', 'unlike', 'whereas', 'while']
    for token in doc[:-1]:
        if token.text in link_words:
            doc[token.i].is_sent_start = True
    return doc

## Set spacy pipeline
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(set_custom_boundaries, before="parser")

## Get Restaurant and Food reviews
res_rev_list = []
food_rev_list = []
for i in range(data.shape[0]):
    rev = data['Body'].iloc[i]
    rev = re.sub(r'[@#]\w+', '', rev) # remove hashtag
    doc = nlp(rev)
    glist = []
    flist = []
    for sent in doc.sents: # for each phase/sentence
        glist.append(sent.text.lower()) # add all phase/sentence to generic list
        for s in food_list: # for each food name
            if s in sent.text.lower(): # check if food name appear in phase/sentence
                flist.append(sent.text.lower()) # add phase/sentence with food name to food list
                try:
                    glist.remove(sent.text.lower()) # remove phase/sentence with food name from generic list
                except:
                    pass
    flist=list(dict.fromkeys(flist)) # remove duplicates
    glist=list(dict.fromkeys(glist)) # remove duplicates
    food_rev_list.append(' '.join(flist))
    res_rev_list.append(' '.join(glist))


#=====================================================#
# Store segmented reviews into dataframe
#=====================================================#
data['RestaurantReview'] = res_rev_list
data['FoodReview'] = food_rev_list

print('Time taken:', convert(time.time()-start))

data

Time taken: 1:05:16


Unnamed: 0,Title,Body,Reviewer,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Food,RestaurantReview,FoodReview
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,3,9,10m ago,/babasan-by-uncle-kiisu,"beer,fruit",we ordered two appetisers and four main dishes...,kronenbourg blanc beer was very enjoyable too....
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,3,10,5d ago,/babasan-by-uncle-kiisu,rice,boy did it impress! perfect harmonious blend o...,this rice in soup dish is the first one i’ve h...
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,8,958,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,"crispy pork,mee siam,pork,pork belly","the successor to uncle kiisu, babasan’s menu d...",i was seriously wondering how they would pull ...
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,7,208,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,"fish,fish roe,rice,roe,seafood,unagi",this was like a little bomb that exploded with...,i'm not a big fan of the seafoody taste loved ...
0,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,3,15,Oct 25 at 10:13pm,/yobo,"brown rice,cream,pumpkin,rice,salad",but i really enjoyed the broccoli puree served...,most salad/grain bowls serve up similar stuff ...
...,...,...,...,...,...,...,...,...,...,...
7,Yummy meat galore shabu shabu buffet for the s...,Yummy meat galore shabu shabu buffet for the s...,Alison Ong,7,229,"Dec 16, 2013",/sukiya,"earl,meat,shabu shabu",,yummy meat galore shabu shabu buffet for the s...
8,Our #steamboat #buffet #dinner before watching...,Our #steamboat #buffet #dinner before watching...,Benjamin Yeo,6,170,"Jun 7, 2013",/sukiya,"steamboat,tea",our before watching,
9,My favorite whenever im here for steamboat! Lo...,,June Lee,8,989,"Aug 29, 2012",/sukiya,,,
10,Sukiyaki and Kimchi Based Hotpot Buffet,,Jasmine Low,6,187,"Aug 28, 2012",/sukiya,,,


## Extract features using spaCy- POS tag and Dependency parser
spaCy is a free open-source library for Natural Language Processing in Python. It features NER, POS tagging, dependency parsing, word vectors and more.
- https://spacy.io/usage/linguistic-features#pos-tagging
- https://spacy.io/usage/linguistic-features#dependency-parse

In [15]:
#=====================================================#
# Extract features from reviews
#=====================================================#

# Get list of words to be removed from reviews
stop_words = spacy.lang.en.stop_words.STOP_WORDS
remove_words = ["n't","'m","'s","'ll","'ve","'d","'re","n’t","’m","’s","’ll","’ve","’d","’re"]

def features_spacy(col_name):
    # For each word in the review
    feature_list = []
    text_list = []
    pos_list = []
    dep_list = []
    #dep_head_list=[]
    #dep_child_list=[]
    #dep_head_pos_list=[]

    for i in range(data.shape[0]):
        rev = data[col_name].iloc[i]
        rev = rev.replace('and ', '').replace('& ', '')
        doc = nlp(rev)
        feature = []    

        # For each word in the review
        for w in doc:

            # Check if word not from stop words
            if w.text.lower() not in stop_words and w.text.lower() not in remove_words:
                extract = None
                text_list.append(w.text.lower())
                pos_list.append(w.pos_)
                dep_list.append(w.dep_)
                #dep_head_list.append(w.head.text)
                #dep_head_pos_list.append(w.head.pos_)
                #dep_child_list.append([child for child in w.children])

                # Check for more than two words    
                if len(text_list) > 2: 

                    # Rule 1 => e.g. 'affordable', 'appetising'
                    if w.pos_ in ['ADJ', 'VERB'] or w.dep_ == "amod": #and w.head.pos_ in ['NOUN']:
                        extract = text_list[-1]

                    # Rule 2 => e.g. 'not good', 'really affordable'
                    if w.pos_ in ['ADJ', 'VERB'] and pos_list[-2] in ['ADV']: # and w.head.pos_ in ['NOUN']:
                        extract = text_list[-2] + ' ' + text_list[-1]

                    # Rule 3 => e.g. 'not the best'
                    if w.pos_ in ['ADJ', 'VERB'] and pos_list[-2] in ['DET'] and pos_list[-3] in ['ADV']: # and w.head.pos_ in ['NOUN']:
                        extract = text_list[-3] + ' ' + text_list[-2] + ' ' + text_list[-1]

                    # Rule 4 => e.g. 'not too good'
                    if w.pos_ in ['ADJ', 'VERB'] and pos_list[-2] in ['ADV'] and pos_list[-3] in ['ADV']: # and w.head.pos_ in ['NOUN']:
                        extract = text_list[-3] + ' ' + text_list[-2] + ' ' + text_list[-1]

                    if extract != None:
                        feature.append(extract)
                        
        feature = list(dict.fromkeys(feature)) # remove duplicates
        feature_list.append(','.join(feature)) # create list of sentiment feature for each review
        
    return feature_list


#=====================================================#
# Store features into dataframe
#=====================================================#
start = time.time()

data['RestaurantFeature(spacy)'] = features_spacy('RestaurantReview')
data['FoodFeature(spacy)'] = features_spacy('FoodReview')

print('Time taken:', convert(time.time()-start))

data

Time taken: 0:16:18


Unnamed: 0,Title,Body,Reviewer,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Food,RestaurantReview,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy)
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,3,9,10m ago,/babasan-by-uncle-kiisu,"beer,fruit",we ordered two appetisers and four main dishes...,kronenbourg blanc beer was very enjoyable too....,"main,unique,boasting,manifested,creative,strong","enjoyable,sweet,mild"
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,3,10,5d ago,/babasan-by-uncle-kiisu,rice,boy did it impress! perfect harmonious blend o...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good"
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,8,958,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,"crispy pork,mee siam,pork,pork belly","the successor to uncle kiisu, babasan’s menu d...",i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender..."
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,7,208,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,"fish,fish roe,rice,roe,seafood,unagi",this was like a little bomb that exploded with...,i'm not a big fan of the seafoody taste loved ...,"little,exploded,eating,try,super worth","big,loved,flying,smooth"
0,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,3,15,Oct 25 at 10:13pm,/yobo,"brown rice,cream,pumpkin,rice,salad",but i really enjoyed the broccoli puree served...,most salad/grain bowls serve up similar stuff ...,"enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste..."
...,...,...,...,...,...,...,...,...,...,...,...,...
7,Yummy meat galore shabu shabu buffet for the s...,Yummy meat galore shabu shabu buffet for the s...,Alison Ong,7,229,"Dec 16, 2013",/sukiya,"earl,meat,shabu shabu",,yummy meat galore shabu shabu buffet for the s...,,"early,b4"
8,Our #steamboat #buffet #dinner before watching...,Our #steamboat #buffet #dinner before watching...,Benjamin Yeo,6,170,"Jun 7, 2013",/sukiya,"steamboat,tea",our before watching,,watching,
9,My favorite whenever im here for steamboat! Lo...,,June Lee,8,989,"Aug 29, 2012",/sukiya,,,,,
10,Sukiyaki and Kimchi Based Hotpot Buffet,,Jasmine Low,6,187,"Aug 28, 2012",/sukiya,,,,,


# Sentiment Class prediction on Burpple reviews

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

#=====================================================#
# Load the model from disk
#=====================================================#
filename = 'lr_model.sav'
loaded_model = joblib.load(filename) #load model 


#=====================================================#
# Extract details from csv file
#=====================================================#
#data = pd.read_csv('reviews.csv',index_col=[0])
#data = data.fillna('-') # fill none with '-'
#data.replace({'\n': ''}, inplace=True, regex=True) # remove break line

#X = data['Review']
X = data['Body']
data['SentimentClass'] = loaded_model.predict(X)
y = data['SentimentClass']

#=====================================================#
# Train the model (after sentiment score only)
#=====================================================#
tfidf = TfidfVectorizer(stop_words = 'english') #for converting reviews to matrix of TF-IDF features
lr = LogisticRegression(random_state=0, solver='lbfgs') #using logistic regression as prediction model
clf_pipeline = Pipeline([('tfidf', tfidf), ('lr', lr)]) #create pipeline of vectorizing and prediction steps
clf_pipeline.fit(X, y) #fit training data into pipeline


#=====================================================#
# Extract postive and negative sentiments
#=====================================================#
num = 100 #number of features

word = tfidf.get_feature_names() #get sentiment words/ features
coef = clf_pipeline.named_steps['lr'].coef_.tolist()[0] #get coefficient of features (feature importances) 
coeff_df = pd.DataFrame({'Word' : word, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient'], ascending=[False]) #sort descending by coefficient.

print('\n'+'Top {} positive sentiments'.format(num))
print(coeff_df.head(num).to_string(index=False))
print('\n'+'Top {} negative sentiments'.format(num))   
print(coeff_df.tail(num).to_string(index=False))



Top 100 positive sentiments
         Word  Coefficient
         good     5.394372
         nice     4.664474
         best     4.602491
         love     4.413196
        great     4.370333
    delicious     4.316622
        yummy     3.623320
      perfect     3.196048
       crispy     3.189819
   definitely     2.982767
      amazing     2.850342
        fresh     2.796694
       smooth     2.768042
      overall     2.734124
         rich     2.709265
        tasty     2.668799
    favourite     2.658364
        right     2.600073
   refreshing     2.551020
        loved     2.439449
       served     2.406420
        lunch     2.390151
          tad     2.295448
         eggs     2.246962
       dishes     2.233829
       tender     2.224399
      enjoyed     2.116993
          bit     2.110187
       highly     2.052429
        comes     2.045065
      awesome     2.008492
         soft     1.982405
        spicy     1.879361
   affordable     1.870659
    singapore     1.862325

In [34]:
#=====================================================#
# Export to csv
#=====================================================#
data.to_csv('restaurant_reviews_withsentiment.csv',index=False)

## Extract features using VADER (Valence Aware Dictionary and sEntiment Reasoner)

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. Support for emoji recognition (UTF-8 encoded).
- https://github.com/cjhutto/vaderSentiment

In [18]:
"""
#=====================================================#
# Extract features from reviews
#=====================================================#
import nltk
#nltk.download('punkt')
#nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize, RegexpTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# load analyzer and update lexicon 
analyzer = SentimentIntensityAnalyzer()
analyzer.lexicon.update({u'affordable': 2,
                         u'appetising': 2,
                         u'tasteless': -2})

# Get list of words to be removed from reviews
stop_words = spacy.lang.en.stop_words.STOP_WORDS
remove_words = ["n't","'m","'s","'ll","'ve","'d","'re","n’t","’m","’s","’ll","’ve","’d","’re"]

def features_vader(col_name):
    # For each word in the review
    feature_list = []
    score_list = []
    text_list = []

    for i in range(data.shape[0]):
        rev = data[col_name].iloc[i]
        #rev = rev.replace('and ', '').replace('& ', '')
        doc = nlp(rev)
        feature = []    

        # For each word in the review
        for w in doc:

            # Check if word not from stop words
            if w.text.lower() not in stop_words and w.text.lower() not in remove_words:
                extract = None
                text_list.append(w.text.lower())

                # Check for more than two words    
                if len(text_list) > 2: 

                    # Rule 1
                    if (analyzer.polarity_scores(w.text)['compound']) >= 0.1 or (analyzer.polarity_scores(w.text)['compound']) <= -0.1:
                        extract = text_list[-1]

                    if extract != None:
                        feature.append(extract)
                        
        feature = list(dict.fromkeys(feature)) # remove duplicates
        feature_list.append(','.join(feature)) # create list of sentiment feature for each review

        score = analyzer.polarity_scores(' '.join(rev)) # get sentiment score
        score_list.append(score['compound']) # create list of sentiment score for each review
        
        '''
        The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according
        to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive).
        '''
             
    return feature_list, score_list


#=====================================================#
# Store features into dataframe
#=====================================================#

start = time.time()

data['RestaurantFeature(vader)'], data['RestaurantScore(vader)'] = features_vader('RestaurantReview')
data['FoodFeature(vader)'], data['FoodScore(vader)']  = features_vader('FoodReview')

print('Time taken:', convert(time.time()-start))


#=====================================================#
# Export to csv
#=====================================================#
data.to_csv('restaurant_reviews_withsentiment.csv')

data

"""