In [1]:
import pickle
import re
import spacy
from spacy import displacy
import pandas as pd
import numpy as np
import pickle
import time
nlp = spacy.load("en_core_web_sm")

#=====================================================#
# Function for time
#=====================================================#
def convert(seconds): 
    seconds = seconds % (24 * 3600) 
    hour = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
      
    return "%d:%02d:%02d" % (hour, minutes, seconds) 

"""

#=====================================================#
# Extract details from csv file
#=====================================================#
data = pd.read_csv('reviews.csv',index_col=[0])
data = data.fillna('-') # fill none with '-'
data.replace({'\n': ''}, inplace=True, regex=True) # remove break line

"""

#=====================================================#
# Extract details from pkl file
#=====================================================#
data = pd.read_pickle("restaurant_reviews.pkl")
data = data.fillna('-') # fill none with '-'
data.replace({'\n': ''}, inplace=True, regex=True) # remove break line
data['Review'] = data[['Title', 'Body']].apply(lambda x: '. '.join(x.astype(str)), axis=1) # join text from title and body
data.head()


## Create column for food names per review

In [4]:
#=====================================================#
# Extract food names from text file
#=====================================================#
start = time.time()

with open('food_list_V2.txt',encoding="utf8") as f:
    food_list = [x.strip() for x in f.readlines()]

food_tag_list = []
for i in range(data.shape[0]):
    rev = data['Review'].iloc[i]
    #rev = rev.replace('and ', '').replace('& ', '')
    doc = nlp(rev)
    tag = []  
    food_tag = []
    
    # For each word in the review
    for food in food_list:
        if food in doc.text.lower():
            food_tag.append(food)
    food_tag_list.append(','.join(food_tag))

#=====================================================#
# Store food name into dataframe
#=====================================================#
data['Food'] = food_tag_list

print('Time taken:', convert(time.time()-start))

data.head()

Time taken: 1:14:27


Unnamed: 0,Title,Body,Reviewer,Reviewer_ID,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,RestaurantReview,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy),RestaurantFeature(online),FoodFeature(online),RestaurantSentimentProba,FoodSentimentProba,OverallSentimentClass,Cuisine
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,U0001,3.0,9.0,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",glad it’s back!. we ordered two appetisers and...,kronenbourg blanc beer was very enjoyable too....,"ordered,main,unique,boasting,manifested,creati...","enjoyable,sweet,mild",unique,"enjoyable,fruity,mild,strong,sweet",0.98154,0.80515,1,
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,U0002,3.0,10.0,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,yuzu ochazuke ($24). boy did it impress! perfe...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good","first,good,great,style","fresh,overpowering",0.96336,0.993274,1,
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,U0003,8.0,958.0,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",belly roll mee ($15). the successor to uncle k...,i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender...","affordable,good,heavy","crispy,heavy,icy,rich,spicy,stuffed,tangy,tast...",0.986915,0.959834,1,
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,U0004,7.0,208.0,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,kueh pie tee,pie,rice,roe,seafoo...",tee. this was like a little bomb that exploded...,samurai kueh pie i'm not a big fan of the seaf...,"little,exploded,eating,try,super worth","big,loved,flying,smooth","big,little","smooth,texture",0.95946,0.964552,1,
4,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,U0005,3.0,15.0,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",build your own bowl (small). but i really enjo...,most salad/grain bowls serve up similar stuff ...,"build,small,enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste...","good,small","creamy,roasted",0.915978,0.97214,1,


## Create columns for restaurant and food reviews by splitting full reviews 

In [4]:
#=====================================================#
# Extract restaurant and food reviews
#=====================================================#
start = time.time()

## Function set custom boundaries to break into phases or sentences
def set_custom_boundaries(doc):
    link_words = ['but','however','unlike', 'even though', 'despite', 'in spite of', 'unlike', 'whereas', 'while']
    for token in doc[:-1]:
        if token.text in link_words:
            doc[token.i].is_sent_start = True
    return doc

## Set spacy pipeline
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(set_custom_boundaries, before="parser")

## Get Restaurant and Food reviews
res_rev_list = []
food_rev_list = []
for i in range(data.shape[0]):
    rev = data['Review'].iloc[i]
    rev = re.sub(r'[@#]\w+', '', rev) # remove hashtag
    doc = nlp(rev)
    glist = []
    flist = []
    for sent in doc.sents: # for each phase/sentence
        glist.append(sent.text.lower()) # add all phase/sentence to generic list
        for s in food_list: # for each food name
            if s in sent.text.lower(): # check if food name appear in phase/sentence
                flist.append(sent.text.lower()) # add phase/sentence with food name to food list
                try:
                    glist.remove(sent.text.lower()) # remove phase/sentence with food name from generic list
                except:
                    pass
    flist=list(dict.fromkeys(flist)) # remove duplicates
    glist=list(dict.fromkeys(glist)) # remove duplicates
    food_rev_list.append(' '.join(flist))
    res_rev_list.append(' '.join(glist))


#=====================================================#
# Store segmented reviews into dataframe
#=====================================================#
data['RestaurantReview'] = res_rev_list
data['FoodReview'] = food_rev_list

print('Time taken:', convert(time.time()-start))

data.head()

Time taken: 1:13:14


Unnamed: 0,Title,Body,Reviewer,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,RestaurantReview,FoodReview
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,3,9,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",glad it’s back!. we ordered two appetisers and...,kronenbourg blanc beer was very enjoyable too....
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,3,10,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,yuzu ochazuke ($24). boy did it impress! perfe...,this rice in soup dish is the first one i’ve h...
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,8,958,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",belly roll mee ($15). the successor to uncle k...,i was seriously wondering how they would pull ...
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,7,208,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,rice,roe,seafood,unagi",tee. this was like a little bomb that exploded...,samurai kueh pie i'm not a big fan of the seaf...
0,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,3,15,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",build your own bowl (small). but i really enjo...,most salad/grain bowls serve up similar stuff ...


## Extract features using spaCy- POS tag and Dependency parser
spaCy is a free open-source library for Natural Language Processing in Python. It features NER, POS tagging, dependency parsing, word vectors and more.
- https://spacy.io/usage/linguistic-features#pos-tagging
- https://spacy.io/usage/linguistic-features#dependency-parse

In [5]:
#=====================================================#
# Extract features from reviews
#=====================================================#

# Get list of words to be removed from reviews
stop_words = spacy.lang.en.stop_words.STOP_WORDS
remove_words = ["n't","'m","'s","'ll","'ve","'d","'re","n’t","’m","’s","’ll","’ve","’d","’re"]

def features_spacy(col_name):
    # For each word in the review
    feature_list = []
    text_list = []
    pos_list = []
    dep_list = []
    #dep_head_list=[]
    #dep_child_list=[]
    #dep_head_pos_list=[]

    for i in range(data.shape[0]):
        rev = data[col_name].iloc[i]
        rev = rev.replace('and ', '').replace('& ', '')
        doc = nlp(rev)
        feature = []    

        # For each word in the review
        for w in doc:

            # Check if word not from stop words
            if w.text.lower() not in stop_words and w.text.lower() not in remove_words:
                extract = None
                text_list.append(w.text.lower())
                pos_list.append(w.pos_)
                dep_list.append(w.dep_)
                #dep_head_list.append(w.head.text)
                #dep_head_pos_list.append(w.head.pos_)
                #dep_child_list.append([child for child in w.children])

                # Check for more than two words    
                if len(text_list) > 2: 

                    # Rule 1 => e.g. 'affordable', 'appetising'
                    if w.pos_ in ['ADJ', 'VERB'] or w.dep_ == "amod": #and w.head.pos_ in ['NOUN']:
                        extract = text_list[-1]

                    # Rule 2 => e.g. 'not good', 'really affordable'
                    if w.pos_ in ['ADJ', 'VERB'] and pos_list[-2] in ['ADV']: # and w.head.pos_ in ['NOUN']:
                        extract = text_list[-2] + ' ' + text_list[-1]

                    # Rule 3 => e.g. 'not the best'
                    if w.pos_ in ['ADJ', 'VERB'] and pos_list[-2] in ['DET'] and pos_list[-3] in ['ADV']: # and w.head.pos_ in ['NOUN']:
                        extract = text_list[-3] + ' ' + text_list[-2] + ' ' + text_list[-1]

                    # Rule 4 => e.g. 'not too good'
                    if w.pos_ in ['ADJ', 'VERB'] and pos_list[-2] in ['ADV'] and pos_list[-3] in ['ADV']: # and w.head.pos_ in ['NOUN']:
                        extract = text_list[-3] + ' ' + text_list[-2] + ' ' + text_list[-1]

                    if extract != None:
                        feature.append(extract)
                        
        feature = list(dict.fromkeys(feature)) # remove duplicates
        feature_list.append(','.join(feature)) # create list of sentiment feature for each review
        
    return feature_list


#=====================================================#
# Store features into dataframe
#=====================================================#
start = time.time()

data['RestaurantFeature(spacy)'] = features_spacy('RestaurantReview')
data['FoodFeature(spacy)'] = features_spacy('FoodReview')

print('Time taken:', convert(time.time()-start))

data.head()

Time taken: 0:18:19


Unnamed: 0,Title,Body,Reviewer,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,RestaurantReview,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy)
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,3,9,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",glad it’s back!. we ordered two appetisers and...,kronenbourg blanc beer was very enjoyable too....,"ordered,main,unique,boasting,manifested,creati...","enjoyable,sweet,mild"
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,3,10,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,yuzu ochazuke ($24). boy did it impress! perfe...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good"
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,8,958,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",belly roll mee ($15). the successor to uncle k...,i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender..."
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,7,208,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,rice,roe,seafood,unagi",tee. this was like a little bomb that exploded...,samurai kueh pie i'm not a big fan of the seaf...,"little,exploded,eating,try,super worth","big,loved,flying,smooth"
0,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,3,15,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",build your own bowl (small). but i really enjo...,most salad/grain bowls serve up similar stuff ...,"build,small,enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste..."


# Sentiment Class prediction on Burpple reviews

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

#=====================================================#
# Load the model from disk
#=====================================================#
filename = 'lr_model.sav'
loaded_model = joblib.load(filename) #load model 


#=====================================================#
# Extract details from csv file
#=====================================================#
#data = pd.read_csv('reviews.csv',index_col=[0])
#data = data.fillna('-') # fill none with '-'
#data.replace({'\n': ''}, inplace=True, regex=True) # remove break line

data['RestaurantSentimentProba'] = loaded_model.predict_proba(data['RestaurantReview'])[:,1] #probability of the label being 1 (positive)
data['FoodSentimentProba'] = loaded_model.predict_proba(data['FoodReview'])[:,1] #probability of the label being 1 (positive)

#=====================================================#
# Train the model (after sentiment score only)
#=====================================================#
X = data['Review']
data['OverallSentimentClass'] = loaded_model.predict(X) # predict overall sentiment class
y = data['OverallSentimentClass']
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words = 'english') #for converting reviews to matrix of TF-IDF features
lr = LogisticRegression(random_state=0, solver='lbfgs') #using logistic regression as prediction model
clf_pipeline = Pipeline([('tfidf', tfidf), ('lr', lr)]) #create pipeline of vectorizing and prediction steps
clf_pipeline.fit(X, y) #fit training data into pipeline


#=====================================================#
# Extract postive and negative sentiments
#=====================================================#
num = 100 #number of features

word = tfidf.get_feature_names() #get sentiment words/ features
coef = clf_pipeline.named_steps['lr'].coef_.tolist()[0] #get coefficient of features (feature importances) 
coeff_df = pd.DataFrame({'Word' : word, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient'], ascending=[False]) #sort descending by coefficient.

print('\n'+'Top {} positive sentiments'.format(num))
print(coeff_df.head(num).to_string(index=False))
print('\n'+'Top {} negative sentiments'.format(num))   
print(coeff_df.tail(num).to_string(index=False))

data.head()




Top 100 positive sentiments
        Word  Coefficient
        good     4.751683
        love     3.838034
        best     3.794704
       great     3.302394
   delicious     3.067751
        nice     2.985088
       yummy     2.687829
       lunch     2.582941
  definitely     2.484152
      crispy     2.465541
        soft     2.230553
       fresh     2.210423
     perfect     2.155860
       spicy     2.096354
       tasty     2.046983
   favourite     2.046473
       sweet     2.003551
      tender     1.949123
        eggs     1.827858
   ice cream     1.811026
     amazing     1.748107
         mee     1.727016
       loved     1.715905
       fried     1.711677
   singapore     1.709236
      smooth     1.681581
      dinner     1.680362
        rich     1.674044
     grilled     1.667852
       cream     1.581107
  refreshing     1.537940
         ice     1.446009
       right     1.430101
     awesome     1.405452
       comes     1.395758
       light     1.391762
   availa

Unnamed: 0,Title,Body,Reviewer,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,RestaurantReview,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy),RestaurantSentimentProba,FoodSentimentProba,OverallSentimentClass
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,3,9,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",glad it’s back!. we ordered two appetisers and...,kronenbourg blanc beer was very enjoyable too....,"ordered,main,unique,boasting,manifested,creati...","enjoyable,sweet,mild",0.98154,0.80515,1
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,3,10,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,yuzu ochazuke ($24). boy did it impress! perfe...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good",0.96336,0.993274,1
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,8,958,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",belly roll mee ($15). the successor to uncle k...,i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender...",0.986915,0.959834,1
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,7,208,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,rice,roe,seafood,unagi",tee. this was like a little bomb that exploded...,samurai kueh pie i'm not a big fan of the seaf...,"little,exploded,eating,try,super worth","big,loved,flying,smooth",0.95946,0.964552,1
0,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,3,15,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",build your own bowl (small). but i really enjo...,most salad/grain bowls serve up similar stuff ...,"build,small,enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste...",0.915978,0.97214,1


## Extract features using online sources

#### Restaurant
- http://adjective1.com/for-restaurants/
- http://adjective1.com/for-ambience/
- http://adjective1.com/for-dining/

#### Food
- https://kathytemean.wordpress.com/2009/04/25/101-descriptive-words-for-foods/
- https://www.webstaurantstore.com/article/53/how-to-write-a-menu.html

In [5]:
#=====================================================#
# Extract details from csv file
#=====================================================#

#data = pd.read_csv('restaurant_reviews_withsentiment_null.csv')


#=====================================================#
# Extract restaurant descriptive words from text file
#=====================================================#
start = time.time()

with open('restaurant_descriptive.txt') as f:
    descriptive_list = [x.strip() for x in f.readlines()]

restaurant_feature_list = []
for i in range(data.shape[0]):
    rev = data['Review'].iloc[i]
    #rev = rev.replace('and ', '').replace('& ', '')
    doc = nlp(rev)
    tag = []  
    restaurant_feature = []
    
    # For each word in the review
    for feature in descriptive_list:
        if feature in doc.text.lower():
            restaurant_feature.append(feature)
    restaurant_feature_list.append(','.join(restaurant_feature))

#=====================================================#
# Store restaurant features into dataframe
#=====================================================#
data['RestaurantFeature(online)'] = restaurant_feature_list

print('Time taken:', convert(time.time()-start))

data.head()

Time taken: 0:21:03


Unnamed: 0,Title,Body,Reviewer,Reviewer_ID,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,RestaurantReview,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy),RestaurantFeature(online),FoodFeature(online),RestaurantSentimentProba,FoodSentimentProba,OverallSentimentClass,Cuisine
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,U0001,3.0,9.0,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",glad it’s back!. we ordered two appetisers and...,kronenbourg blanc beer was very enjoyable too....,"ordered,main,unique,boasting,manifested,creati...","enjoyable,sweet,mild",unique,"enjoyable,fruity,mild,strong,sweet",0.98154,0.80515,1,
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,U0002,3.0,10.0,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,yuzu ochazuke ($24). boy did it impress! perfe...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good","good,great,style","fresh,overpowering",0.96336,0.993274,1,
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,U0003,8.0,958.0,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",belly roll mee ($15). the successor to uncle k...,i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender...","affordable,good","crispy,heavy,icy,rich,spicy,stuffed,tangy,tast...",0.986915,0.959834,1,
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,U0004,7.0,208.0,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,kueh pie tee,pie,rice,roe,seafoo...",tee. this was like a little bomb that exploded...,samurai kueh pie i'm not a big fan of the seaf...,"little,exploded,eating,try,super worth","big,loved,flying,smooth","big,little","smooth,texture",0.95946,0.964552,1,
4,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,U0005,3.0,15.0,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",build your own bowl (small). but i really enjo...,most salad/grain bowls serve up similar stuff ...,"build,small,enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste...","good,small","creamy,roasted",0.915978,0.97214,1,


In [6]:
#=====================================================#
# Extract details from csv file
#=====================================================#

#data = pd.read_csv('restaurant_reviews_withsentiment_null.csv')


#=====================================================#
# Extract food descriptive words from text file
#=====================================================#
start = time.time()

with open('food_descriptive.txt') as f:
    descriptive_list = [x.strip() for x in f.readlines()]

food_feature_list = []
for i in range(data.shape[0]):
    rev = data['Review'].iloc[i]
    #rev = rev.replace('and ', '').replace('& ', '')
    doc = nlp(rev)
    tag = []  
    food_feature = []
    
    # For each word in the review
    for feature in descriptive_list:
        if feature in doc.text.lower():
            food_feature.append(feature)
    food_feature_list.append(','.join(food_feature))

#=====================================================#
# Store food features into dataframe
#=====================================================#
data['FoodFeature(online)'] = food_feature_list

print('Time taken:', convert(time.time()-start))

data.head()

Time taken: 0:33:56


Unnamed: 0,Title,Body,Reviewer,Reviewer_ID,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,RestaurantReview,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy),RestaurantFeature(online),FoodFeature(online),RestaurantSentimentProba,FoodSentimentProba,OverallSentimentClass,Cuisine
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,U0001,3.0,9.0,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",glad it’s back!. we ordered two appetisers and...,kronenbourg blanc beer was very enjoyable too....,"ordered,main,unique,boasting,manifested,creati...","enjoyable,sweet,mild",unique,"enjoyable,fruity,mild,strong,sweet",0.98154,0.80515,1,
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,U0002,3.0,10.0,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,yuzu ochazuke ($24). boy did it impress! perfe...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good","good,great,style","fresh,overpowering",0.96336,0.993274,1,
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,U0003,8.0,958.0,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",belly roll mee ($15). the successor to uncle k...,i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender...","affordable,good","crispy,heavy,icy,rich,spicy,stuffed,tangy,tast...",0.986915,0.959834,1,
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,U0004,7.0,208.0,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,kueh pie tee,pie,rice,roe,seafoo...",tee. this was like a little bomb that exploded...,samurai kueh pie i'm not a big fan of the seaf...,"little,exploded,eating,try,super worth","big,loved,flying,smooth","big,little","smooth,texture",0.95946,0.964552,1,
4,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,U0005,3.0,15.0,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",build your own bowl (small). but i really enjo...,most salad/grain bowls serve up similar stuff ...,"build,small,enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste...","good,small","creamy,roasted",0.915978,0.97214,1,


In [19]:
#=====================================================#
# Extract details from csv file
#=====================================================#

#data = pd.read_csv('restaurant_reviews_withsentiment_null.csv')


#=====================================================#
# Export data with null/ empty fields to csv
#=====================================================#
print("Total reviews: ",len(data),"\n") #count rows

print("Number of null/ empty fields:")
#print(data.iloc[:, 0:].eq("").sum(axis=0),"\n") #count empty fields for each column
print(data.isnull().sum(axis = 0),"\n") #count null values for each column

print("Number of unique values:")
print(data.apply(pd.Series.nunique),"\n") #count unique values for each column

#data.to_csv('restaurant_reviews_withsentiment_null.csv',index=False) #export to csv

Total reviews:  63921 

Number of null/ empty fields:
Title                            0
Body                          5513
Reviewer                         0
Reviewer_ID                      0
Reviewer_Level                 825
Reviewer_NumReviews            825
ReviewDateTime                   0
Restaurant                       0
Review                           0
Food                             0
RestaurantReview              9466
FoodReview                    6017
RestaurantFeature(spacy)     16822
FoodFeature(spacy)           10030
RestaurantFeature(online)        0
FoodFeature(online)              0
RestaurantSentimentProba         0
FoodSentimentProba               0
OverallSentimentClass            0
Cuisine                       4063
RestaurantName                   0
dtype: int64 

Number of unique values:
Title                        57256
Body                         57726
Reviewer                      6037
Reviewer_ID                   6037
Reviewer_Level                 

In [23]:
#=====================================================#
# Remove rows with empty entries
#=====================================================#

"""
# Drop columns
data.drop(['Title','Body'], axis=1, inplace=True)
data.head()
"""

data_cleaned = data[(data['FoodFeature(online)'] != '') & (data['FoodFeature(online)'].notnull())]
data_cleaned = data_cleaned[(data_cleaned['RestaurantFeature(online)'] != '') & (data_cleaned['RestaurantFeature(online)'].notnull())]
data_cleaned = data_cleaned[(data_cleaned['FoodReview'] != '') & (data_cleaned['FoodReview'].notnull())]
data_cleaned = data_cleaned[(data_cleaned['RestaurantReview'] != '') & (data_cleaned['RestaurantReview'].notnull())]
#data_cleaned = data_cleaned[(data_cleaned['Food'] != '') & (data_cleaned['Food'].notnull())]

print("Total reviews: ",len(data_cleaned),"\n") #count rows

print("Number of null/ empty fields:")
#print(data_cleaned.iloc[:, 0:].eq("").sum(axis=0),"\n") #count empty fields for each column
print(data_cleaned.isnull().sum(axis = 0),"\n") #count null values for each column

print("Number of unique values:")
print(data.apply(pd.Series.nunique),"\n") #count unique values for each column


#=====================================================#
# Export data without null/ empty fields to csv
#=====================================================#
data_cleaned.to_csv('restaurant_reviews_withsentiment.csv',index=False)

Total reviews:  34229 

Number of null/ empty fields:
Title                           0
Body                          200
Reviewer                        0
Reviewer_ID                     0
Reviewer_Level                643
Reviewer_NumReviews           643
ReviewDateTime                  0
Restaurant                      0
Review                          0
Food                            0
RestaurantReview                0
FoodReview                      0
RestaurantFeature(spacy)     2634
FoodFeature(spacy)            724
RestaurantFeature(online)       0
FoodFeature(online)             0
RestaurantSentimentProba        0
FoodSentimentProba              0
OverallSentimentClass           0
Cuisine                      1915
RestaurantName                  0
dtype: int64 

Number of unique values:
Title                        57256
Body                         57726
Reviewer                      6037
Reviewer_ID                   6037
Reviewer_Level                  10
Reviewer_NumRevie

## Anonymize Users

In [56]:
import pandas as pd
from more_itertools import unique_everseen

#=====================================================#
# Extract details from csv file
#=====================================================#

data = pd.read_csv('restaurant_reviews_withsentiment_null.csv')


#=====================================================#
# Anonymize Users
#=====================================================#

# Get list of unique users
userlist = data.Reviewer.tolist()
unique_userlist = list(dict.fromkeys(userlist))
print("Total users: ", len(unique_userlist))

# Create unique id for each user
userdict = {k: "U"+str(i+1).zfill(len(str(len(unique_userlist)))) for i, k in enumerate(filter(lambda x: x not in "<", unique_everseen(unique_userlist)))}
print("User Dict: ", userdict)

Total users:  6037
User Dict:  {'Sofina Ng': 'U0001', 'Jan L': 'U0002', 'Dex Neo': 'U0003', 'Komal Salve': 'U0004', 'Nom Nom': 'U0005', 'Ray L': 'U0006', 'shirley L': 'U0007', 'Burpple Guides': 'U0008', 'Zac Lee': 'U0009', 'Gan Zhang Xun': 'U0010', 'Miss Ha ~': 'U0011', 'I makan Sg': 'U0012', 'Gwen Cheng': 'U0013', 'Wuu Yyiizzhhoouu': 'U0014', 'aggs & xi :D': 'U0015', 'Thechoyalicious :)': 'U0016', 'Cindy Chiah': 'U0017', 'Khaw Han Chung': 'U0018', 'serene goh': 'U0019', 'Shaun Sim': 'U0020', 'Nalene Lyhor': 'U0021', 'Fi T': 'U0022', 'Christine Ang': 'U0023', 'Cecil Dulam': 'U0024', 'The RantingPanda': 'U0025', 'Emma Fangs': 'U0026', 'Victoria Hii': 'U0027', 'Wei Zhi Chiang': 'U0028', 'James Ang': 'U0029', 'Joe Yang': 'U0030', 'Valerie Wong': 'U0031', 'Jonathan Lim': 'U0032', 'Grace Lxy': 'U0033', 'Vanessa Kou': 'U0034', 'ee jean': 'U0035', 'Triffany Lim': 'U0036', 'SheEats SheCooks': 'U0037', 'Seanna Lim': 'U0038', 'Jessica Soh': 'U0039', 'Jayson Yeo': 'U0040', 'Claire Chan': 'U0041',

In [57]:
# Get list of unique id (with duplicates)
userlist = data.Reviewer.tolist()

id_list = []
for name in userlist:
    id = userdict.get(name)
    id_list.append(id)
    
data['Reviewer_ID'] = id_list
data

Unnamed: 0,Title,Body,Reviewer,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,RestaurantReview,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy),RestaurantFeature(online),FoodFeature(online),RestaurantSentimentProba,FoodSentimentProba,OverallSentimentClass,Reviewer_ID
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,3.0,9.0,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",glad it’s back!. we ordered two appetisers and...,kronenbourg blanc beer was very enjoyable too....,"ordered,main,unique,boasting,manifested,creati...","enjoyable,sweet,mild",unique,"enjoyable,fruity,mild,strong,sweet",0.981540,0.805150,1,U0001
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,3.0,10.0,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,yuzu ochazuke ($24). boy did it impress! perfe...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good","first,good,great,style","fresh,overpowering",0.963360,0.993274,1,U0002
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,8.0,958.0,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",belly roll mee ($15). the successor to uncle k...,i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender...","affordable,good,heavy","crispy,heavy,icy,rich,spicy,stuffed,tangy,tast...",0.986915,0.959834,1,U0003
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,7.0,208.0,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,rice,roe,seafood,unagi",tee. this was like a little bomb that exploded...,samurai kueh pie i'm not a big fan of the seaf...,"little,exploded,eating,try,super worth","big,loved,flying,smooth","big,little","smooth,texture",0.959460,0.964552,1,U0004
4,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,3.0,15.0,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",build your own bowl (small). but i really enjo...,most salad/grain bowls serve up similar stuff ...,"build,small,enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste...","good,small","creamy,roasted",0.915978,0.972140,1,U0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63916,Yummy meat galore shabu shabu buffet for the s...,Yummy meat galore shabu shabu buffet for the s...,Alison Ong,7.0,229.0,16-Dec-13,/sukiya,Yummy meat galore shabu shabu buffet for the s...,"earl,meat,shabu shabu",,yummy meat galore shabu shabu buffet for the s...,,"early,b4",,yummy,0.760298,0.947195,1,U2269
63917,Our #steamboat #buffet #dinner before watching...,Our #steamboat #buffet #dinner before watching...,Benjamin Yeo,6.0,170.0,7-Jun-13,/sukiya,Our #steamboat #buffet #dinner before watching...,"steamboat,tea",our before watching . our before watchi...,,watching,,,,0.932189,0.760298,1,U5766
63918,My favorite whenever im here for steamboat! Lo...,,June Lee,8.0,989.0,29-Aug-12,/sukiya,My favorite whenever im here for steamboat! Lo...,"steamboat,tea",look @ the oozing goodness 👍😃.,my favorite whenever im here for steamboat!,"look,oozing","favorite,m","favorite,good",oozing,0.581917,0.957029,1,U2273
63919,Sukiyaki and Kimchi Based Hotpot Buffet,,Jasmine Low,6.0,187.0,28-Aug-12,/sukiya,Sukiyaki and Kimchi Based Hotpot Buffet.,,,sukiyaki and kimchi based hotpot buffet.,,based,,hot,0.760298,0.617251,1,U2087


## Add cuisine in a column

In [58]:
#=====================================================#
# Extract details from csv file
#=====================================================#

cuisine = pd.read_csv('labels.csv')


#=====================================================#
# Left outer join on Restaurant name
#=====================================================#

cuisine['Restaurant'] = "/" + cuisine['Restaurant']
data = pd.merge(data,cuisine, on='Restaurant', how='left')

data

Unnamed: 0,Title,Body,Reviewer,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,RestaurantReview,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy),RestaurantFeature(online),FoodFeature(online),RestaurantSentimentProba,FoodSentimentProba,OverallSentimentClass,Reviewer_ID,Cuisine
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,3.0,9.0,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",glad it’s back!. we ordered two appetisers and...,kronenbourg blanc beer was very enjoyable too....,"ordered,main,unique,boasting,manifested,creati...","enjoyable,sweet,mild",unique,"enjoyable,fruity,mild,strong,sweet",0.981540,0.805150,1,U0001,
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,3.0,10.0,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,yuzu ochazuke ($24). boy did it impress! perfe...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good","first,good,great,style","fresh,overpowering",0.963360,0.993274,1,U0002,
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,8.0,958.0,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",belly roll mee ($15). the successor to uncle k...,i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender...","affordable,good,heavy","crispy,heavy,icy,rich,spicy,stuffed,tangy,tast...",0.986915,0.959834,1,U0003,
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,7.0,208.0,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,rice,roe,seafood,unagi",tee. this was like a little bomb that exploded...,samurai kueh pie i'm not a big fan of the seaf...,"little,exploded,eating,try,super worth","big,loved,flying,smooth","big,little","smooth,texture",0.959460,0.964552,1,U0004,
4,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,3.0,15.0,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",build your own bowl (small). but i really enjo...,most salad/grain bowls serve up similar stuff ...,"build,small,enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste...","good,small","creamy,roasted",0.915978,0.972140,1,U0005,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63916,Yummy meat galore shabu shabu buffet for the s...,Yummy meat galore shabu shabu buffet for the s...,Alison Ong,7.0,229.0,16-Dec-13,/sukiya,Yummy meat galore shabu shabu buffet for the s...,"earl,meat,shabu shabu",,yummy meat galore shabu shabu buffet for the s...,,"early,b4",,yummy,0.760298,0.947195,1,U2269,
63917,Our #steamboat #buffet #dinner before watching...,Our #steamboat #buffet #dinner before watching...,Benjamin Yeo,6.0,170.0,7-Jun-13,/sukiya,Our #steamboat #buffet #dinner before watching...,"steamboat,tea",our before watching . our before watchi...,,watching,,,,0.932189,0.760298,1,U5766,
63918,My favorite whenever im here for steamboat! Lo...,,June Lee,8.0,989.0,29-Aug-12,/sukiya,My favorite whenever im here for steamboat! Lo...,"steamboat,tea",look @ the oozing goodness 👍😃.,my favorite whenever im here for steamboat!,"look,oozing","favorite,m","favorite,good",oozing,0.581917,0.957029,1,U2273,
63919,Sukiyaki and Kimchi Based Hotpot Buffet,,Jasmine Low,6.0,187.0,28-Aug-12,/sukiya,Sukiyaki and Kimchi Based Hotpot Buffet.,,,sukiyaki and kimchi based hotpot buffet.,,based,,hot,0.760298,0.617251,1,U2087,


## Add restaurant names in a column

In [17]:
#=====================================================#
# Extract details from csv file
#=====================================================#

restaurant_names = pd.read_csv('restaurant_names.csv')


#=====================================================#
# Left outer join on Restaurant name
#=====================================================#

data = pd.merge(data, restaurant_names, on='Restaurant', how='left')

data

Unnamed: 0,Title,Body,Reviewer,Reviewer_ID,Reviewer_Level,Reviewer_NumReviews,ReviewDateTime,Restaurant,Review,Food,...,FoodReview,RestaurantFeature(spacy),FoodFeature(spacy),RestaurantFeature(online),FoodFeature(online),RestaurantSentimentProba,FoodSentimentProba,OverallSentimentClass,Cuisine,RestaurantName
0,Glad It’s Back!,We ordered two appetisers and four main dishes...,Sofina Ng,U0001,3.0,9.0,10m ago,/babasan-by-uncle-kiisu,Glad It’s Back!. We ordered two appetisers and...,"beer,fruit",...,kronenbourg blanc beer was very enjoyable too....,"ordered,main,unique,boasting,manifested,creati...","enjoyable,sweet,mild",unique,"enjoyable,fruity,mild,strong,sweet",0.981540,0.805150,1,,Babasan by Uncle Kiisu
1,Yuzu Ochazuke ($24),This rice in soup dish is the first one I’ve h...,Jan L,U0002,3.0,10.0,5d ago,/babasan-by-uncle-kiisu,Yuzu Ochazuke ($24). This rice in soup dish is...,rice,...,this rice in soup dish is the first one i’ve h...,"impress,perfect,harmonious,fresh,generous,serv...","great,good","good,great,style","fresh,overpowering",0.963360,0.993274,1,,Babasan by Uncle Kiisu
2,Belly Roll Mee ($15),"The successor to Uncle Kiisu, Babasan’s menu d...",Dex Neo,U0003,8.0,958.0,Oct 24 at 5:42pm,/babasan-by-uncle-kiisu,Belly Roll Mee ($15). The successor to Uncle K...,"crispy pork,mee siam,pork,pork belly",...,i was seriously wondering how they would pull ...,"dreamt,good,sounds,heavy,provided,cut,amazingl...","seriously wondering,pull,crispy,stuffed,tender...","affordable,good","crispy,heavy,icy,rich,spicy,stuffed,tangy,tast...",0.986915,0.959834,1,,Babasan by Uncle Kiisu
3,Samurai Kueh Pie Tee,This was like a little bomb that exploded with...,Komal Salve,U0004,7.0,208.0,Oct 23 at 10:06am,/babasan-by-uncle-kiisu,Samurai Kueh Pie Tee. This was like a little b...,"fish,fish roe,kueh pie tee,pie,rice,roe,seafoo...",...,samurai kueh pie i'm not a big fan of the seaf...,"little,exploded,eating,try,super worth","big,loved,flying,smooth","big,little","smooth,texture",0.959460,0.964552,1,,Babasan by Uncle Kiisu
4,Build Your Own Bowl (Small),Most salad/grain bowls serve up similar stuff ...,Nom Nom,U0005,3.0,15.0,Oct 25 at 10:13pm,/yobo,Build Your Own Bowl (Small). Most salad/grain ...,"brown rice,cream,pumpkin,rice,salad",...,most salad/grain bowls serve up similar stuff ...,"build,small,enjoyed,served","salad,serve,similar,creamy,paired,brown,roaste...","good,small","creamy,roasted",0.915978,0.972140,1,,YoBo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63916,Yummy meat galore shabu shabu buffet for the s...,Yummy meat galore shabu shabu buffet for the s...,Alison Ong,U2269,7.0,229.0,16-Dec-13,/sukiya,Yummy meat galore shabu shabu buffet for the s...,"earl,meat,shabu shabu",...,yummy meat galore shabu shabu buffet for the s...,,"early,b4",,yummy,0.760298,0.947195,1,,Suki-Ya (I12 Katong)
63917,Our #steamboat #buffet #dinner before watching...,Our #steamboat #buffet #dinner before watching...,Benjamin Yeo,U5766,6.0,170.0,07-Jun-13,/sukiya,Our #steamboat #buffet #dinner before watching...,"steamboat,tea",...,,watching,,,,0.932189,0.760298,1,,Suki-Ya (I12 Katong)
63918,My favorite whenever im here for steamboat! Lo...,,June Lee,U2273,8.0,989.0,29-Aug-12,/sukiya,My favorite whenever im here for steamboat! Lo...,"steamboat,tea",...,my favorite whenever im here for steamboat!,"look,oozing","favorite,m","favorite,good",oozing,0.581917,0.957029,1,,Suki-Ya (I12 Katong)
63919,Sukiyaki and Kimchi Based Hotpot Buffet,,Jasmine Low,U2087,6.0,187.0,28-Aug-12,/sukiya,Sukiyaki and Kimchi Based Hotpot Buffet.,hotpot,...,sukiyaki and kimchi based hotpot buffet.,,based,,hot,0.760298,0.617251,1,,Suki-Ya (I12 Katong)


In [18]:
#=====================================================#
# Export data with Reviewer_ID and Cuisine to csv
#=====================================================#
data.to_csv('restaurant_reviews_withsentiment_null.csv',index=False)

## Extract features using VADER (Valence Aware Dictionary and sEntiment Reasoner)

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. Support for emoji recognition (UTF-8 encoded).
- https://github.com/cjhutto/vaderSentiment

In [None]:
"""
#=====================================================#
# Extract features from reviews
#=====================================================#
import nltk
#nltk.download('punkt')
#nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize, RegexpTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# load analyzer and update lexicon 
analyzer = SentimentIntensityAnalyzer()
analyzer.lexicon.update({u'affordable': 2,
                         u'appetising': 2,
                         u'tasteless': -2})

# Get list of words to be removed from reviews
stop_words = spacy.lang.en.stop_words.STOP_WORDS
remove_words = ["n't","'m","'s","'ll","'ve","'d","'re","n’t","’m","’s","’ll","’ve","’d","’re"]

def features_vader(col_name):
    # For each word in the review
    feature_list = []
    score_list = []
    text_list = []

    for i in range(data.shape[0]):
        rev = data[col_name].iloc[i]
        #rev = rev.replace('and ', '').replace('& ', '')
        doc = nlp(rev)
        feature = []    

        # For each word in the review
        for w in doc:

            # Check if word not from stop words
            if w.text.lower() not in stop_words and w.text.lower() not in remove_words:
                extract = None
                text_list.append(w.text.lower())

                # Check for more than two words    
                if len(text_list) > 2: 

                    # Rule 1
                    if (analyzer.polarity_scores(w.text)['compound']) >= 0.1 or (analyzer.polarity_scores(w.text)['compound']) <= -0.1:
                        extract = text_list[-1]

                    if extract != None:
                        feature.append(extract)
                        
        feature = list(dict.fromkeys(feature)) # remove duplicates
        feature_list.append(','.join(feature)) # create list of sentiment feature for each review

        score = analyzer.polarity_scores(' '.join(rev)) # get sentiment score
        score_list.append(score['compound']) # create list of sentiment score for each review
        
        '''
        The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according
        to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive).
        '''
             
    return feature_list, score_list


#=====================================================#
# Store features into dataframe
#=====================================================#

start = time.time()

data['RestaurantFeature(vader)'], data['RestaurantScore(vader)'] = features_vader('RestaurantReview')
data['FoodFeature(vader)'], data['FoodScore(vader)']  = features_vader('FoodReview')

print('Time taken:', convert(time.time()-start))


#=====================================================#
# Export to csv
#=====================================================#
data.to_csv('restaurant_reviews_withsentiment.csv')

data

"""