In [92]:
import pandas as pd
import numpy as npu

### Read in Aggregated Data

In [193]:
# Read the three pre-processed splits from the working directory, then stack
# train and validation row-wise into a single frame used for lexicon tuning.
train, val, test = map(
    pd.read_csv,
    (
        "train_newpreproc_emoticon.csv",
        "val_newpreproc_emoticon.csv",
        "test_newpreproc_emoticon.csv",
    ),
)
trainval = pd.concat([train, val])

FileNotFoundError: [Errno 2] No such file or directory: 'train_newpreproc_emoticon.csv'

In [175]:
# (rows, columns) of the combined train+validation frame and of the test frame.
tuple(frame.shape for frame in (trainval, test))

((2486, 9), (775, 9))

### VADER

Developed in 2014, VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon- and rule-based sentiment analysis tool tuned to sentiment expressed in social media. It evaluates the text of a message and assesses not just whether it is positive or negative, but also the intensity of that emotion.

It uses a dictionary of terms that it can evaluate. From the GitHub repository this includes examples like:

- Negations - a modifier that reverses the meaning of a phrase ("not great").
- Contractions - negations, but more complex ("wasn’t great").
- Punctuation - increased intensity ("It’s great!!!").
- Slang - variations of slang words such as "kinda", "sux", or "hella".
- It's even able to understand acronyms ("lol") and emoji (❤).



In [176]:
# Fetch the VADER lexicon through NLTK's downloader and build the analyzer
# instance (`sid`) that every scoring cell below shares.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon")

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/xinminaw/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [177]:
# Spot-check a single word: polarity_scores returns a dict with
# 'neg', 'neu', 'pos' and 'compound' entries.
sid.polarity_scores("fresh")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.3182}

In [178]:
# Score every phrase with VADER. Each cell of the new column holds the full
# score dict returned by polarity_scores for that phrase.
trainval["polarity_scores"] = trainval["phrase"].map(sid.polarity_scores)
test["polarity_scores"] = test["phrase"].map(sid.polarity_scores)

# Preview the first test rows with the new column attached.
test.head()

Unnamed: 0.1,Unnamed: 0,restaurant_code,review_title,account_name,new_aspect_1,phrase,phrase_lemma,phrase_stem,label,polarity_scores
0,0,109-teochew-yong-tau-foo,Ampang YTF!,Pearlyn Chua,ambience,airconditioned price point loving place ca wai...,airconditioned price point loving place ca wai...,aircondit price point love place ca wait visit,1,"{'neg': 0.0, 'neu': 0.642, 'pos': 0.358, 'comp..."
1,1,109-teochew-yong-tau-foo,Ampang YTF!,Pearlyn Chua,food,soup dry laksa fried rice fried chicken wings ...,soup dry laksa fried rice fried chicken wing b...,soup dri laksa fri rice fri chicken wing brown...,1,"{'neg': 0.063, 'neu': 0.663, 'pos': 0.274, 'co..."
2,2,109-teochew-yong-tau-foo,Ampang YTF!,Pearlyn Chua,time,snaking queue intimidated long cos moves fairl...,snaking queue intimidated long co move fairly ...,snake queue intimid long co move fairli fast,-1,"{'neg': 0.293, 'neu': 0.707, 'pos': 0.0, 'comp..."
3,3,109-teochew-yong-tau-foo,Delicious Yong Tau Foo,Simple Foodie,food,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,4,109-teochew-yong-tau-foo,Delicious Yong Tau Foo,Simple Foodie,time,clear queue,clear queue,clear queue,1,"{'neg': 0.0, 'neu': 0.278, 'pos': 0.722, 'comp..."


In [179]:
# Pull the single 'compound' value out of each score dict into its own column.
trainval["compound"] = trainval["polarity_scores"].apply(
    lambda scores: scores["compound"]
)
test["compound"] = test["polarity_scores"].apply(
    lambda scores: scores["compound"]
)

In [180]:
# Turn the compound score into a class label:
# positive -> 1, exactly zero -> 0, anything else -> -1.
trainval["prediction"] = trainval["compound"].apply(
    lambda score: 1 if score > 0 else (0 if score == 0 else -1)
)
test["prediction"] = test["compound"].apply(
    lambda score: 1 if score > 0 else (0 if score == 0 else -1)
)

In [181]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall/F1 of the untuned VADER predictions, first on
# train+validation and then on test. One print call with a newline separator
# emits the same lines as five consecutive prints.
print(
    "VADER Sentiment Analysis Model",
    "TrainingValidation Data",
    classification_report(trainval["label"], trainval["prediction"], digits=4),
    "Test Data",
    classification_report(test["label"], test["prediction"], digits=4),
    sep="\n",
)

VADER Sentiment Analysis Model
TrainingValidation Data
              precision    recall  f1-score   support

          -1     0.4886    0.3213    0.3877       333
           0     0.6111    0.6510    0.6304       917
           1     0.7496    0.7824    0.7656      1236

    accuracy                         0.6722      2486
   macro avg     0.6164    0.5849    0.5946      2486
weighted avg     0.6635    0.6722    0.6651      2486

Test Data
              precision    recall  f1-score   support

          -1     0.5132    0.3750    0.4333       104
           0     0.5436    0.7156    0.6178       218
           1     0.8083    0.7351    0.7699       453

    accuracy                         0.6813       775
   macro avg     0.6217    0.6086    0.6070       775
weighted avg     0.6942    0.6813    0.6820       775



In [182]:
# Confusion matrices for the untuned predictions. `label` is passed as the
# true class and `prediction` as the predicted class (sklearn's y_true, y_pred
# argument order).
print(
    "TrainingValidation Data",
    confusion_matrix(trainval["label"], trainval["prediction"]),
    "Test Data",
    confusion_matrix(test["label"], test["prediction"]),
    sep="\n",
)

TrainingValidation Data
[[107 154  72]
 [ 69 597 251]
 [ 43 226 967]]
Test Data
[[ 39  33  32]
 [ 15 156  47]
 [ 22  98 333]]


In [183]:
# Phrases labelled negative (-1) that the baseline prediction did not mark as
# negative; used to pick words for the custom lexicon.
missed_negative = trainval["label"].eq(-1) & trainval["prediction"].ne(-1)
trainval.loc[missed_negative, "phrase"].values

array(['laksa approx small kind laksa eat',
       'expect laksa taste good not exceptional sadly waste money taste good not exceptional sadly waste money',
       'small portion', 'felt kinda pricey kinda pricey portion',
       'small small bowl', 'air conditioning coild barely felt',
       'katong laksa bowl mediocre katong laksa', 'laksa laksa small',
       'expensive icecream',
       'sharing different scoops ice cream waffle buttermilk matcha red velvet bud confused sharing different scoops ice cream',
       'pricey cake larger',
       'toastvisited popular coffee stall high expectations tad stale dry kaya slightly sweet',
       'shiok price increase abit sian',
       'floss banh mistill hungry serving portion small',
       'soup salty oily beef dry surprising pho vietnamese tasted better',
       'tasting platter definitely not worth og price barely',
       'truffle fries tasted bland salmon fishy overcooked steak okay sauces meh burpple burrplesg surfandturf steak',
  

##### Update Lexicon Dictionary (Round 1)

In [184]:
# Round-1 custom lexicon entries, one dict per review aspect. Each maps a term
# to a valence score (presumably on VADER's -4..+4 scale — confirm), and the
# dicts are merged into `sid.lexicon` by the following cell.
#
# NOTE(review): NLTK's VADER appears to look up whitespace-separated tokens one
# at a time, so the multi-word keys ("long queue", "not worth",
# "value for money") probably never match — confirm against
# nltk.sentiment.vader before relying on them.
new_food = {
    **dict.fromkeys(["tender", "fresh"], 4),
    **dict.fromkeys(["soggy", "jelat", "oily", "overcooked", "disappointed"], -4),
    "dry": -2,
}

new_time = {
    **dict.fromkeys(["long queue", "queue", "slow", "crowd"], -4),
    "wait": -2,
}

new_price = {
    **dict.fromkeys(["cheap", "worth", "value for money"], 4),
    **dict.fromkeys(["pricey", "expensive", "overpriced", "not worth"], -4),
}

new_portion = {
    **dict.fromkeys(["large", "generous"], 4),
    "small": -4,
}

In [185]:
# Merge the four aspect dicts into the analyzer's lexicon in one call. Later
# dicts win on a shared key, the same precedence as four sequential updates.
# This mutates `sid` in place, so every later polarity_scores call sees the
# new entries.
sid.lexicon.update({**new_food, **new_time, **new_price, **new_portion})

In [186]:
# Re-score both frames after the round-1 lexicon update. Results go into
# columns suffixed "1" so the baseline columns stay available for comparison.
# Label rule: compound > 0 -> 1, compound == 0 -> 0, otherwise -1.
trainval["polarity_scores1"] = trainval["phrase"].map(sid.polarity_scores)
trainval["compound1"] = trainval["polarity_scores1"].apply(
    lambda scores: scores["compound"]
)
trainval["prediction1"] = trainval["compound1"].apply(
    lambda score: 1 if score > 0 else (0 if score == 0 else -1)
)

test["polarity_scores1"] = test["phrase"].map(sid.polarity_scores)
test["compound1"] = test["polarity_scores1"].apply(
    lambda scores: scores["compound"]
)
test["prediction1"] = test["compound1"].apply(
    lambda score: 1 if score > 0 else (0 if score == 0 else -1)
)

In [187]:
from sklearn.metrics import classification_report, confusion_matrix

# Same report as the baseline, now on the round-1 predictions (`prediction1`).
print(
    "VADER Sentiment Analysis Model (Tuned with new words)",
    "TrainingValidation Data",
    classification_report(trainval["label"], trainval["prediction1"], digits=4),
    "Test Data",
    classification_report(test["label"], test["prediction1"], digits=4),
    sep="\n",
)

VADER Sentiment Analysis Model (Tuned with new words)
TrainingValidation Data
              precision    recall  f1-score   support

          -1     0.5443    0.6276    0.5830       333
           0     0.6726    0.6183    0.6443       917
           1     0.7641    0.7783    0.7711      1236

    accuracy                         0.6991      2486
   macro avg     0.6603    0.6748    0.6661      2486
weighted avg     0.7009    0.6991    0.6992      2486

Test Data
              precision    recall  f1-score   support

          -1     0.5664    0.6154    0.5899       104
           0     0.5846    0.6972    0.6360       218
           1     0.8333    0.7395    0.7836       453

    accuracy                         0.7110       775
   macro avg     0.6614    0.6840    0.6698       775
weighted avg     0.7275    0.7110    0.7161       775



In [188]:
# Phrases labelled positive (1) that the round-1 prediction did not mark as
# positive; used to pick further words for the lexicon.
missed_positive = trainval["label"].eq(1) & trainval["prediction1"].ne(1)
trainval.loc[missed_positive, "phrase"].values

array(['instead went pig congee lo mai kai crystal dumplings total damage food photo rice normal sticky glutinous like not salty lomaikai porridge time stick congee smooth criminal bonus not salty',
       'laksadelicious laksa', 'cravings laksa goes signature katong',
       'laksa onsen egglovely edition egg', 'big laksa',
       'waiting time no waiting waiting time no waiting',
       'friday pig intestines big', 'affordable utilising burpple',
       'clay pot medium sauce dried chili slices ginger cloves mushy garlic heavenly supposed soggy decent dish overall',
       'attentive', 'reasonably priced portion', 'small price',
       'affordable option affordable option', 'runny beef patty juicy',
       'west burger patty little bit overdone rings crispy left long time addictive ordinary left long time addictive finish eating ingredients addictive finish eating',
       'came returned steak', 'tangy appetising',
       'goodnesstaste double boiled soup boiled soup',
       'chi so

##### Update Lexicon Dictionary (Round 2)

In [189]:
# Round-2 custom lexicon entries: the round-1 terms re-weighted to +/-2, plus
# additional terms. These dicts replace the round-1 variables of the same name
# and are merged into `sid.lexicon` by the following cell.
#
# NOTE(review): NLTK's VADER appears to look up whitespace-separated tokens one
# at a time, so the multi-word keys ("cravings satisfied", "long queue",
# "no waiting time", "not worth", "value for money") probably never match —
# confirm against nltk.sentiment.vader before relying on them.
new_food = {
    **dict.fromkeys(
        ["tender", "fresh", "cravings satisfied", "crispy", "sinful"], 2
    ),
    **dict.fromkeys(
        ["soggy", "jelat", "oily", "overcooked", "dry", "disappointed", "tough", "cold"],
        -2,
    ),
}

new_time = {
    **dict.fromkeys(["long queue", "queue", "wait", "slow", "crowd", "crowded"], -2),
    **dict.fromkeys(["no waiting time", "fast"], 2),
}

new_price = {
    **dict.fromkeys(["pricey", "expensive", "overpriced", "not worth"], -2),
    **dict.fromkeys(
        [
            "cheap",
            "worth",
            "value for money",
            "reasonable",
            "reasonably",
            "affordable",
            "steal",
        ],
        2,
    ),
}

new_portion = {
    "small": -2,
    **dict.fromkeys(["large", "generous"], 2),
    **dict.fromkeys(["sufficient", "enough"], 1),
}

In [190]:
# Merge the round-2 dicts into the analyzer's lexicon in one call, with the
# same precedence as four sequential updates. The update is applied to the
# same `sid` object, so keys already set in round 1 are overwritten and any
# other existing entries are kept.
sid.lexicon.update({**new_food, **new_time, **new_price, **new_portion})

In [191]:
# Re-score both frames after the round-2 lexicon update. Results go into
# columns suffixed "2", leaving the baseline and round-1 columns untouched.
# Label rule: compound > 0 -> 1, compound == 0 -> 0, otherwise -1.
trainval["polarity_scores2"] = trainval["phrase"].map(sid.polarity_scores)
trainval["compound2"] = trainval["polarity_scores2"].apply(
    lambda scores: scores["compound"]
)
trainval["prediction2"] = trainval["compound2"].apply(
    lambda score: 1 if score > 0 else (0 if score == 0 else -1)
)

test["polarity_scores2"] = test["phrase"].map(sid.polarity_scores)
test["compound2"] = test["polarity_scores2"].apply(
    lambda scores: scores["compound"]
)
test["prediction2"] = test["compound2"].apply(
    lambda score: 1 if score > 0 else (0 if score == 0 else -1)
)

In [192]:
from sklearn.metrics import classification_report, confusion_matrix

# Same report as before, now on the round-2 predictions (`prediction2`).
print(
    "VADER Sentiment Analysis Model (Tuned with new words 2)",
    "TrainingValidation Data",
    classification_report(trainval["label"], trainval["prediction2"], digits=4),
    "Test Data",
    classification_report(test["label"], test["prediction2"], digits=4),
    sep="\n",
)

VADER Sentiment Analysis Model (Tuned with new words 2)
TrainingValidation Data
              precision    recall  f1-score   support

          -1     0.5568    0.6036    0.5793       333
           0     0.6903    0.6052    0.6450       917
           1     0.7585    0.8107    0.7837      1236

    accuracy                         0.7072      2486
   macro avg     0.6685    0.6732    0.6693      2486
weighted avg     0.7063    0.7072    0.7052      2486

Test Data
              precision    recall  f1-score   support

          -1     0.6300    0.6058    0.6176       104
           0     0.6107    0.6835    0.6450       218
           1     0.8306    0.7903    0.8100       453

    accuracy                         0.7355       775
   macro avg     0.6904    0.6932    0.6909       775
weighted avg     0.7418    0.7355    0.7378       775



In [112]:
# Support-weighted mean of three per-class scores (2978 samples in total).
# NOTE(review): these numbers do not appear in any report shown in this
# notebook — presumably copied from another model's classification report;
# confirm which metric and which model they belong to.
class_support_n2978 = (212, 1754, 1012)
class_score_n2978 = (0.4626, 0.8022, 0.6738)
total_support_n2978 = sum(class_support_n2978)

# Terms are added left to right in the original order so the floating-point
# result is unchanged.
weighted_score_n2978 = (
    (class_support_n2978[0] / total_support_n2978) * class_score_n2978[0]
    + (class_support_n2978[1] / total_support_n2978) * class_score_n2978[1]
    + (class_support_n2978[2] / total_support_n2978) * class_score_n2978[2]
)
weighted_score_n2978

0.7343907320349228

In [17]:
# Support-weighted mean of three per-class scores (383 samples in total).
# NOTE(review): these numbers do not appear in any report shown in this
# notebook — presumably copied from another model's classification report;
# confirm which metric and which model they belong to.
class_support_n383 = (34, 104, 245)
class_score_n383 = (0.5063, 0.5830, 0.7845)
total_support_n383 = sum(class_support_n383)

# Terms are added left to right in the original order so the floating-point
# result is unchanged.
weighted_score_n383 = (
    (class_support_n383[0] / total_support_n383) * class_score_n383[0]
    + (class_support_n383[1] / total_support_n383) * class_score_n383[1]
    + (class_support_n383[2] / total_support_n383) * class_score_n383[2]
)
weighted_score_n383

0.7050879895561357