In [1]:
import pandas as pd
import numpy as np

In [64]:
#train = pd.read_csv("train_newpreproc.csv",index_col=0)
#val = pd.read_csv("val_newpreproc.csv",index_col=0)
#test = pd.read_csv("test_newpreproc.csv",index_col=0)
train = pd.read_csv("train_oldpreproc.csv",index_col=0)
val = pd.read_csv("val_oldpreproc.csv",index_col=0)
test = pd.read_csv("test_oldpreproc.csv",index_col=0)
trainval = pd.concat([train,val],axis=0)

### VADER

Developed in 2014, VADER (Valence Aware Dictionary and sEntiment Reasoner) is a pre-trained model that uses rule-based values tuned to sentiments from social media. It evaluates the text of a message and gives you an assessment of not just positive and negative, but the intensity of that emotion as well.

It uses a dictionary of terms that it can evaluate. From the GitHub repository this includes examples like:

Negations - a modifier that reverses the meaning of a phrase ("not great").
Contractions - negations, but more complex ("wasn’t great").
Punctuation - increased intensity ("It’s great!!!").
Slang - variations of slang words such as "kinda", "sux", or "hella".
It's even able to understand acronyms ("lol") and emoji (❤).



In [65]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/xinminaw/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [66]:
trainval["polarity_scores"] = trainval.phrase.map(lambda phrase : sid.polarity_scores(phrase))
test["polarity_scores"] = test.phrase.map(lambda phrase : sid.polarity_scores(phrase))
test.head()

Unnamed: 0,restaurant_code,review_title,account_name,new_aspect_1,phrase,phrase_lemma,phrase_stem,label,polarity_scores
0,328-katong-laksa-united-square,328 Katong Laksa; Now you don't have to travel...,Qing Xiang,food,katong laksa defeated gordon ramsay version la...,katong laksa defeated gordon ramsay version la...,katong laksa defeat gordon ramsay version laks...,1,"{'neg': 0.163, 'neu': 0.684, 'pos': 0.153, 'co..."
1,328-katong-laksa-united-square,Best Laksa In Singapore,Clara Choo,food,best laksa prawns fish cakes,best laksa prawn fish cake,best laksa prawn fish cake,1,"{'neg': 0.0, 'neu': 0.488, 'pos': 0.512, 'comp..."
2,328-katong-laksa-united-square,Decided to settle at [328 KATONG LAKSA] for lu...,Natalie Tan,food,nowhere comparable original east coast rd got ...,nowhere comparable original east coast rd got ...,nowher compar origin east coast rd got ta admi...,0,"{'neg': 0.125, 'neu': 0.511, 'pos': 0.364, 'co..."
3,7thheavenktvandcafe,For 1-for-1 Main Dish (save ~$20),Burpple Guides,food,sing heart content dining hearty western pulle...,sing heart content dining hearty western pulle...,sing heart content dine hearti western pull po...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,7thheavenktvandcafe,Grilled Chicken With Truffled Mash,E S,food,dine th heaven nicely grilled juicy tender,dine th heaven nicely grilled juicy tender,dine th heaven nice grill juici tender,1,"{'neg': 0.0, 'neu': 0.446, 'pos': 0.554, 'comp..."


In [67]:
trainval["compound"] = trainval["polarity_scores"].map(lambda score_dict : score_dict["compound"])
test["compound"] = test["polarity_scores"].map(lambda score_dict : score_dict["compound"])

In [68]:
trainval["prediction"] = trainval["compound"].map(lambda c: 1 if c >0 else 0 if c == 0 else -1)
test["prediction"] = test["compound"].map(lambda c: 1 if c >0 else 0 if c == 0 else -1)

In [69]:
from sklearn.metrics import classification_report,confusion_matrix

print("VADER Sentiment Analysis Model")
print("TrainingValidation Data")
print(classification_report(trainval.label,trainval.prediction))
print("Test Data")
print(classification_report(test.label,test.prediction))

VADER Sentiment Analysis Model
TrainingValidation Data
              precision    recall  f1-score   support

          -1       0.23      0.21      0.22       125
           0       0.64      0.64      0.64       677
           1       0.72      0.73      0.73       762

    accuracy                           0.65      1564
   macro avg       0.53      0.53      0.53      1564
weighted avg       0.65      0.65      0.65      1564

Test Data
              precision    recall  f1-score   support

          -1       0.44      0.44      0.44        34
           0       0.47      0.63      0.54       104
           1       0.82      0.70      0.76       245

    accuracy                           0.66       383
   macro avg       0.58      0.59      0.58       383
weighted avg       0.69      0.66      0.67       383



In [70]:
print("TrainingValidation Data")
print(confusion_matrix(trainval.label,trainval.prediction))
print("Test Data")
print(confusion_matrix(test.label,test.prediction))

TrainingValidation Data
[[ 26  71  28]
 [ 56 432 189]
 [ 29 173 560]]
Test Data
[[ 15  11   8]
 [  8  66  30]
 [ 11  62 172]]


In [71]:
# Examine wrong class -1
trainval.loc[(trainval.label == -1) & (trainval.prediction != -1)].phrase.values

array(['popular coffee stall high expectations tad stale dry kaya slightly sweet',
       'set came pretty small pot soup rice', 'tender chicken meat dry',
       'promising taste cheese looking forward hoping nice stick ice cream',
       'pricey size',
       'got ala carte piece garlicky chicken time tad salty liking fried chicken time tad salty liking',
       'long waiting time', 'longer time', 'ham',
       'bound gripe waiting time pretty long min gripe waiting time pretty long min',
       'little', 'not half', 'minimal protein curries stews',
       'extra seven bucks', 'small bowl', 'gripe portion',
       'little bit sweeter preferred',
       'tasted greasy tasted like ketchup unnaturally flavourful soup nice sweet savoury sauce drizzled rice alongside',
       'small portions', 'plain plus sauce garnish', 'price',
       'look queue basic ingredients', 'little pricey',
       'portion small small', 'slight min wait ish min wait',
       'pork belly plus peppered chilli pow

##### Update Lexicon Dictionary (Round 1)

In [72]:
new_food = {
    "tender" : 4,
    "fresh" : 4,
    "soggy" : -4,
    "jelat" : -4,
    "oily" : -4,
    "overcooked" :-4,
    "dry" : -2,
    "disappointed" : -4  
}

new_time = {
    "long queue" : -4,
    "queue" : -4,
    "wait" : -2,
    "slow" : -4,
    "crowd" : -4
}

new_price = {
    "pricey" : -4,
    "expensive" : -4,
    "cheap" : 4,
    "worth" : 4,
    "overpriced" : -4,
    "not worth" : -4,
    "value for money" : 4
    
}

new_portion = {
    "small" : -4,
    "large" : 4,
    "generous" : 4,
}

In [73]:
sid.lexicon.update(new_food)
sid.lexicon.update(new_time)
sid.lexicon.update(new_price)
sid.lexicon.update(new_portion)

In [74]:
trainval["polarity_scores1"] = trainval.phrase.map(lambda phrase : sid.polarity_scores(phrase))
test["polarity_scores1"] = test.phrase.map(lambda phrase : sid.polarity_scores(phrase))
trainval["compound1"] = trainval["polarity_scores1"].map(lambda score_dict : score_dict["compound"])
test["compound1"] = test["polarity_scores1"].map(lambda score_dict : score_dict["compound"])
trainval["prediction1"] = trainval["compound1"].map(lambda c: 1 if c >0 else 0 if c == 0 else -1)
test["prediction1"] = test["compound1"].map(lambda c: 1 if c >0 else 0 if c == 0 else -1)

In [75]:
from sklearn.metrics import classification_report,confusion_matrix

print("VADER Sentiment Analysis Model (Tuned with new words)")
print("TrainingValidation Data")
print(classification_report(trainval.label,trainval.prediction1))
print("Test Data")
print(classification_report(test.label,test.prediction1))

VADER Sentiment Analysis Model (Tuned with new words)
TrainingValidation Data
              precision    recall  f1-score   support

          -1       0.36      0.50      0.41       125
           0       0.68      0.60      0.64       677
           1       0.72      0.75      0.73       762

    accuracy                           0.67      1564
   macro avg       0.59      0.62      0.60      1564
weighted avg       0.68      0.67      0.67      1564

Test Data
              precision    recall  f1-score   support

          -1       0.42      0.59      0.49        34
           0       0.52      0.62      0.57       104
           1       0.82      0.71      0.76       245

    accuracy                           0.67       383
   macro avg       0.59      0.64      0.61       383
weighted avg       0.71      0.67      0.68       383



In [76]:
# Examine wrong class 1
trainval.loc[(trainval.label == 1) & (trainval.prediction1 != 1)].phrase.values

array(['went pig congee lo mai kai crystal dumplings total damage food photo rice normal sticky glutinous sourness smooth not salty sghawker sgfoodie time stick congee smooth not salty',
       'sizzling pop chicken available limited order tom yum chicken combined soy garlic hot sauce options form wingettes drumettes boneless bites friedchicken fingerlickinggood fourfingers',
       'stark resembelance packaging papa pahelta version ones espresso league',
       'big piece not',
       'black sauce chilli flavorful skin nicely crisped yellow rice touch dry',
       'fragrant feel generally refreshing serves quality craft beer',
       'affordable option',
       'scooping oil chili dark soya bowl ungracefully stained liquid taste way prawn noodles unassuming',
       'fried potato cubes nice bite oily duck meat',
       'omurice look photos reviews fried rice tomato disappointed serving not look photos',
       'affordable', 'sufficient portion sufficient portion',
       'spicy chipot

##### Update Lexicon Dictionary (Round 2)

In [77]:
new_food = {
    "tender" : 2,
    "fresh" : 2,
    "soggy" : -2,
    "jelat" : -2,
    "oily" : -2,
    "overcooked" :-2,
    "dry" : -2,
    "disappointed" : -2,
    "cravings satisfied" : 2,
    "crispy" : 2,
    "sinful" : 2,
    "tough" : -2,
    "cold" : -2
}

new_time = {
    "long queue" : -2,
    "queue" : -2,
    "wait" : -2,
    "slow" : -2,
    "crowd" : -2,
    "crowded" : -2,
    "no waiting time" : 2,
    "fast" : 2,
}

new_price = {
    "pricey" : -2,
    "expensive" : -2,
    "cheap" : 2,
    "worth" : 2,
    "overpriced" : -2,
    "not worth" : -2,
    "value for money" : 2,
    "reasonable" : 2,
    "reasonably" : 2,
    "affordable" : 2,
    "steal" : 2
    
}

new_portion = {
    "small" : -2,
    "large" : 2,
    "generous" : 2,
    "sufficient" : 1,
    "enough" : 1
}

In [78]:
sid.lexicon.update(new_food)
sid.lexicon.update(new_time)
sid.lexicon.update(new_price)
sid.lexicon.update(new_portion)

In [79]:
trainval["polarity_scores2"] = trainval.phrase.map(lambda phrase : sid.polarity_scores(phrase))
test["polarity_scores2"] = test.phrase.map(lambda phrase : sid.polarity_scores(phrase))
trainval["compound2"] = trainval["polarity_scores2"].map(lambda score_dict : score_dict["compound"])
test["compound2"] = test["polarity_scores2"].map(lambda score_dict : score_dict["compound"])
trainval["prediction2"] = trainval["compound2"].map(lambda c: 1 if c >0 else 0 if c == 0 else -1)
test["prediction2"] = test["compound2"].map(lambda c: 1 if c >0 else 0 if c == 0 else -1)

In [80]:
from sklearn.metrics import classification_report,confusion_matrix

print("VADER Sentiment Analysis Model (Tuned with new words 2)")
print("TrainingValidation Data")
print(classification_report(trainval.label,trainval.prediction2))
print("Test Data")
print(classification_report(test.label,test.prediction2))

VADER Sentiment Analysis Model (Tuned with new words 2)
TrainingValidation Data
              precision    recall  f1-score   support

          -1       0.36      0.48      0.41       125
           0       0.70      0.59      0.64       677
           1       0.72      0.78      0.75       762

    accuracy                           0.68      1564
   macro avg       0.60      0.62      0.60      1564
weighted avg       0.69      0.68      0.68      1564

Test Data
              precision    recall  f1-score   support

          -1       0.44      0.59      0.51        34
           0       0.55      0.62      0.58       104
           1       0.83      0.74      0.78       245

    accuracy                           0.70       383
   macro avg       0.61      0.65      0.62       383
weighted avg       0.72      0.70      0.71       383



In [81]:
(34/383)*0.51 + (104/383)*0.58 + (245/383)*0.78

0.7017232375979112