In [1]:
import pandas as pd
import numpy as np
import utils

### Read in Aggregated Data

In [3]:
# Load the pre-split phrase tables (train / validation / test).
train = pd.read_csv("new_labels/test_newpreproc_emoticon/train_newpreproc_emoticon.csv")
val = pd.read_csv("new_labels/test_newpreproc_emoticon/val_newpreproc_emoticon.csv")
test = pd.read_csv("new_labels/test_newpreproc_emoticon/test_newpreproc_emoticon.csv")

# Stack train and validation rows into one frame; original row labels are kept,
# so index values repeat across the two parts.
trainval = pd.concat((train, val), axis="index")

In [4]:
# Sanity check: (rows, columns) of the combined train+val frame and of the test frame.
trainval.shape,test.shape

((2486, 15), (775, 15))

### VADER

Developed in 2014, VADER (Valence Aware Dictionary and sEntiment Reasoner) is a pre-trained model that uses rule-based values tuned to sentiments from social media. It evaluates the text of a message and gives you an assessment of not just positive and negative, but the intensity of that emotion as well.

It uses a dictionary of terms that it can evaluate. From the GitHub repository this includes examples like:

- Negations - a modifier that reverses the meaning of a phrase ("not great").
- Contractions - negations, but more complex ("wasn’t great").
- Punctuation - increased intensity ("It’s great!!!").
- Slang - variations of slang words such as "kinda", "sux", or "hella".
It's even able to understand acronyms ("lol") and emoji (❤).



In [5]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Make sure the VADER lexicon resource is available before building the analyzer.
nltk.download("vader_lexicon")

# Single shared analyzer; later cells extend its lexicon in place.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/xinminaw/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Spot-check one word with the analyzer as constructed, before the custom lexicon entries below are added.
sid.polarity_scores("fresh")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.3182}

In [7]:
# Baseline: attach VADER's score dict (neg / neu / pos / compound) to every phrase.
for frame in (trainval, test):
    frame["polarity_scores"] = frame["phrase_emoticon_generic"].map(sid.polarity_scores)
test.head()

Unnamed: 0.1,Unnamed: 0,restaurant_code,review_title,account_name,new_aspect_1,phrase,phrase_lemma,phrase_stem,phrase_emoticon_generic,phrase_emoticon_unique,phrase_stem_emoticon_generic,phrase_lemma_emoticon_generic,phrase_stem_emoticon_unique,phrase_lemma_emoticon_unique,label,polarity_scores
0,0,109-teochew-yong-tau-foo,Ampang YTF!,Pearlyn Chua,ambience,airconditioned price point loving place ca wai...,airconditioned price point loving place ca wai...,aircondit price point love place ca wait visit,airconditioned price point loving place ca wai...,airconditioned price point loving place ca wai...,aircondit price point love place ca wait visit,airconditioned price point loving place ca wai...,aircondit price point love place ca wait visit,airconditioned price point loving place ca wai...,1.0,"{'neg': 0.0, 'neu': 0.642, 'pos': 0.358, 'comp..."
1,1,109-teochew-yong-tau-foo,Ampang YTF!,Pearlyn Chua,food,soup dry laksa fried rice fried chicken wings ...,soup dry laksa fried rice fried chicken wing b...,soup dri laksa fri rice fri chicken wing brown...,soup dry laksa fried rice fried chicken wings ...,soup dry laksa fried rice fried chicken wings ...,soup dri laksa fri rice fri chicken wing brown...,soup dry laksa fried rice fried chicken wing b...,soup dri laksa fri rice fri chicken wing brown...,soup dry laksa fried rice fried chicken wing b...,1.0,"{'neg': 0.063, 'neu': 0.663, 'pos': 0.274, 'co..."
2,2,109-teochew-yong-tau-foo,Ampang YTF!,Pearlyn Chua,time,snaking queue intimidated long cos moves fairl...,snaking queue intimidated long co move fairly ...,snake queue intimid long co move fairli fast,snaking queue intimidated long cos moves fairl...,snaking queue intimidated long cos moves fairl...,snake queue intimid long co move fairli fast,snaking queue intimidated long co move fairly ...,snake queue intimid long co move fairli fast,snaking queue intimidated long co move fairly ...,-1.0,"{'neg': 0.293, 'neu': 0.707, 'pos': 0.0, 'comp..."
3,3,109-teochew-yong-tau-foo,Delicious Yong Tau Foo,Simple Foodie,food,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,yong tau foohidden circular road smack town sh...,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,4,109-teochew-yong-tau-foo,Delicious Yong Tau Foo,Simple Foodie,time,clear queue,clear queue,clear queue,clear queue,clear queue,clear queue,clear queue,clear queue,clear queue,1.0,"{'neg': 0.0, 'neu': 0.278, 'pos': 0.722, 'comp..."


In [8]:
# Pull the scalar "compound" score out of each score dict.
for frame in (trainval, test):
    frame["compound"] = frame["polarity_scores"].map(lambda scores: scores["compound"])

In [9]:
# Threshold the compound score at zero: positive -> 1, exactly zero -> 0, anything else -> -1.
for frame in (trainval, test):
    frame["prediction"] = frame["compound"].map(
        lambda value: 1 if value > 0 else (0 if value == 0 else -1)
    )

In [10]:
from sklearn.metrics import classification_report,confusion_matrix

# Per-class precision / recall / F1 of the untuned analyzer on each split.
print("VADER Sentiment Analysis Model")
for heading, frame in (("TrainingValidation Data", trainval), ("Test Data", test)):
    print(heading)
    print(classification_report(frame.label, frame.prediction, digits=4))

VADER Sentiment Analysis Model
TrainingValidation Data
              precision    recall  f1-score   support

        -1.0     0.4911    0.3303    0.3950       333
         0.0     0.6190    0.6467    0.6325       917
         1.0     0.7500    0.7913    0.7701      1236

    accuracy                         0.6762      2486
   macro avg     0.6200    0.5894    0.5992      2486
weighted avg     0.6670    0.6762    0.6691      2486

Test Data
              precision    recall  f1-score   support

        -1.0     0.5263    0.3846    0.4444       104
         0.0     0.5532    0.7156    0.6240       218
         1.0     0.8129    0.7483    0.7793       453

    accuracy                         0.6903       775
   macro avg     0.6308    0.6162    0.6159       775
weighted avg     0.7014    0.6903    0.6907       775



In [11]:
# Confusion matrices (rows = true label, columns = predicted label) for each split.
for heading, frame in (("TrainingValidation Data", trainval), ("Test Data", test)):
    print(heading)
    print(confusion_matrix(frame.label, frame.prediction))

TrainingValidation Data
[[110 150  73]
 [ 71 593 253]
 [ 43 215 978]]
Test Data
[[ 40  33  31]
 [ 15 156  47]
 [ 21  93 339]]


In [23]:
# Look at the first ten phrases labelled -1 that the baseline did NOT predict as -1.
trainval.loc[trainval.label.eq(-1) & trainval.prediction.ne(-1), "phrase"].values[:10]

array(['laksa approx small kind laksa eat',
       'laksa lemak rice noodles cut small pieces feels weird genius bowl otah otah laksa',
       'expect laksa taste good not exceptional sadly waste money taste good not exceptional sadly waste money',
       'small portion', 'felt kinda pricey kinda pricey portion',
       'small small bowl', 'air conditioning coild barely felt',
       'katong laksa bowl mediocre katong laksa', 'laksa laksa small',
       'expensive icecream'], dtype=object)

##### Update Lexicon Dictionary (Round 1)

In [13]:
# Domain-specific valence overrides, grouped by review aspect.
# NOTE(review): VADER appears to look up whitespace-split tokens one at a time, so the
# multi-word keys ("long queue", "not worth", "value for money") are presumably never
# matched -- verify against nltk.sentiment.vader before relying on them.

new_food = {
    # praise
    "tender": 4, "fresh": 4,
    # complaints
    "soggy": -4, "jelat": -4, "oily": -4, "overcooked": -4,
    "dry": -2,
    "disappointed": -4,
}

new_time = {
    "long queue": -4, "queue": -4,
    "wait": -2,
    "slow": -4, "crowd": -4,
}

new_price = {
    # too costly
    "pricey": -4, "expensive": -4,
    # good value
    "cheap": 4, "worth": 4,
    "overpriced": -4, "not worth": -4,
    "value for money": 4,
}

new_portion = {
    "small": -4,
    "large": 4, "generous": 4,
}

In [14]:
# Merge the round-1 overrides into the shared analyzer's lexicon (mutates `sid` in place).
for extra_terms in (new_food, new_time, new_price, new_portion):
    sid.lexicon.update(extra_terms)

In [15]:
# Re-score both splits with the round-1 lexicon; results go to the "...1" columns.
for frame in (trainval, test):
    frame["polarity_scores1"] = frame["phrase_emoticon_generic"].map(sid.polarity_scores)
    frame["compound1"] = frame["polarity_scores1"].map(lambda scores: scores["compound"])
    # compound > 0 -> 1, compound == 0 -> 0, otherwise -1
    frame["prediction1"] = frame["compound1"].map(
        lambda value: 1 if value > 0 else (0 if value == 0 else -1)
    )

In [16]:
from sklearn.metrics import classification_report,confusion_matrix

# Same report as the baseline, now on the round-1 predictions.
print("VADER Sentiment Analysis Model (Tuned with new words)")
for heading, frame in (("TrainingValidation Data", trainval), ("Test Data", test)):
    print(heading)
    print(classification_report(frame.label, frame.prediction1, digits=4))

VADER Sentiment Analysis Model (Tuned with new words)
TrainingValidation Data
              precision    recall  f1-score   support

        -1.0     0.5450    0.6366    0.5873       333
         0.0     0.6824    0.6140    0.6464       917
         1.0     0.7649    0.7872    0.7759      1236

    accuracy                         0.7031      2486
   macro avg     0.6641    0.6793    0.6699      2486
weighted avg     0.7050    0.7031    0.7029      2486

Test Data
              precision    recall  f1-score   support

        -1.0     0.5766    0.6154    0.5953       104
         0.0     0.5961    0.6972    0.6427       218
         1.0     0.8337    0.7528    0.7912       453

    accuracy                         0.7187       775
   macro avg     0.6688    0.6885    0.6764       775
weighted avg     0.7324    0.7187    0.7231       775



In [18]:
# Look at the first ten phrases labelled 1 that the round-1 lexicon did NOT predict as 1.
trainval.loc[trainval.label.eq(1) & trainval.prediction1.ne(1), "phrase"].values[:10]

array(['instead went pig congee lo mai kai crystal dumplings total damage food photo rice normal sticky glutinous like not salty lomaikai porridge time stick congee smooth criminal bonus not salty',
       'laksadelicious laksa', 'cravings laksa goes signature katong',
       'laksa onsen egglovely edition egg', 'big laksa',
       'waiting time no waiting waiting time no waiting',
       'friday pig intestines big', 'good coffee sg good coffee sg',
       'affordable utilising burpple',
       'clay pot medium sauce dried chili slices ginger cloves mushy garlic heavenly supposed soggy decent dish overall'],
      dtype=object)

##### Update Lexicon Dictionary (Round 2)

In [19]:
# Round-2 overrides: same aspects as round 1, with more terms and smaller magnitudes.
# NOTE(review): VADER appears to look up whitespace-split tokens one at a time, so the
# multi-word keys ("cravings satisfied", "long queue", "no waiting time", "not worth",
# "value for money") are presumably never matched -- verify against nltk.sentiment.vader.

new_food = {
    **dict.fromkeys(["tender", "fresh", "cravings satisfied", "crispy", "sinful"], 2),
    **dict.fromkeys(
        ["soggy", "jelat", "oily", "overcooked", "dry", "disappointed", "tough", "cold"], -2
    ),
}

new_time = {
    **dict.fromkeys(["long queue", "queue", "wait", "slow", "crowd", "crowded"], -2),
    **dict.fromkeys(["no waiting time", "fast"], 2),
}

new_price = {
    **dict.fromkeys(["pricey", "expensive", "overpriced", "not worth"], -2),
    **dict.fromkeys(
        ["cheap", "worth", "value for money", "reasonable", "reasonably", "affordable", "steal"], 2
    ),
}

new_portion = {
    "small": -2,
    **dict.fromkeys(["large", "generous"], 2),
    **dict.fromkeys(["sufficient", "enough"], 1),
}

In [20]:
# Merge the round-2 overrides into the shared analyzer; keys already present are overwritten.
for extra_terms in (new_food, new_time, new_price, new_portion):
    sid.lexicon.update(extra_terms)

In [21]:
# Re-score both splits with the round-2 lexicon; results go to the "...2" columns.
for frame in (trainval, test):
    frame["polarity_scores2"] = frame["phrase_emoticon_generic"].map(sid.polarity_scores)
    frame["compound2"] = frame["polarity_scores2"].map(lambda scores: scores["compound"])
    # compound > 0 -> 1, compound == 0 -> 0, otherwise -1
    frame["prediction2"] = frame["compound2"].map(
        lambda value: 1 if value > 0 else (0 if value == 0 else -1)
    )

In [22]:
from sklearn.metrics import classification_report,confusion_matrix

# Same report again, now on the round-2 predictions.
print("VADER Sentiment Analysis Model (Tuned with new words 2)")
for heading, frame in (("TrainingValidation Data", trainval), ("Test Data", test)):
    print(heading)
    print(classification_report(frame.label, frame.prediction2, digits=4))

VADER Sentiment Analysis Model (Tuned with new words 2)
TrainingValidation Data
              precision    recall  f1-score   support

        -1.0     0.5562    0.6096    0.5817       333
         0.0     0.7009    0.6031    0.6483       917
         1.0     0.7605    0.8196    0.7889      1236

    accuracy                         0.7116      2486
   macro avg     0.6725    0.6774    0.6730      2486
weighted avg     0.7111    0.7116    0.7093      2486

Test Data
              precision    recall  f1-score   support

        -1.0     0.6400    0.6154    0.6275       104
         0.0     0.6234    0.6835    0.6521       218
         1.0     0.8349    0.8035    0.8189       453

    accuracy                         0.7445       775
   macro avg     0.6994    0.7008    0.6995       775
weighted avg     0.7492    0.7445    0.7463       775



## KFold CV For Stacking

In [25]:
def custom_k_fold_VADER(column, data, model_name, n_folds=5, out_dir="kfold", analyzer=None):
    """Score each fold with VADER and write one feature CSV per fold for stacking.

    VADER is rule-based, so nothing is fitted: for every entry in ``data`` the
    first frame of the pair is scored and its ``pos`` / ``neg`` proportions are
    written out together with the ``label`` and ``new_aspect_1`` columns.

    Parameters
    ----------
    column : str
        Name of the text column passed to ``polarity_scores``.
    data : iterable of (DataFrame, DataFrame)
        One pair per fold. Only the first element of each pair is read; it must
        contain ``column``, ``label`` and ``new_aspect_1``.
    model_name : str
        Prefix for the output column names and file names.
    n_folds : int, default 5
        Entries 1..n_folds are written to ``<model_name>_fold<k>.csv``; every
        later entry is written to ``<model_name>_test.csv``.
    out_dir : str, default "kfold"
        Directory the CSV files are written to; created if it does not exist.
    analyzer : object with ``polarity_scores(text) -> dict``, optional
        Defaults to the notebook-level ``sid`` analyzer (with whatever lexicon
        updates have been applied to it at call time).

    Returns
    -------
    None
        Results are only written to disk.
    """
    import os  # kept local so the function does not rely on the notebook's import cell

    if analyzer is None:
        analyzer = sid

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)

    pos_col = model_name + '_prob_pos'
    neg_col = model_name + '_prob_neg'

    for fold_num, tf_combi in enumerate(data, start=1):
        frame = tf_combi[0]

        # Score each phrase once; the input frame itself is left untouched.
        scores = frame[column].map(analyzer.polarity_scores)

        # .to_numpy() drops the source index so rows are written positionally.
        df = pd.DataFrame({
            pos_col: scores.map(lambda score_dict: score_dict["pos"]).to_numpy(),
            neg_col: scores.map(lambda score_dict: score_dict["neg"]).to_numpy(),
            "label": frame["label"].to_numpy(),
            "aspect": frame["new_aspect_1"].to_numpy(),
        })

        if fold_num <= n_folds:
            filename = model_name + '_fold' + str(fold_num) + '.csv'
        else:
            filename = model_name + '_test.csv'

        df.to_csv(os.path.join(out_dir, filename), index=False)

In [26]:
# Held-out part of each CV split -- these are the frames VADER scores.
fold1 = pd.read_csv('stacking_folds/fold1.csv')
fold2 = pd.read_csv('stacking_folds/fold2.csv')
fold3 = pd.read_csv('stacking_folds/fold3.csv')
fold4 = pd.read_csv('stacking_folds/fold4.csv')
fold5 = pd.read_csv('stacking_folds/fold5.csv')

# Training part of each split -- loaded but not needed by VADER (nothing is fitted).
train1 = pd.read_csv('stacking_folds/train1.csv')
train2 = pd.read_csv('stacking_folds/train2.csv')
train3 = pd.read_csv('stacking_folds/train3.csv')
train4 = pd.read_csv('stacking_folds/train4.csv')
train5 = pd.read_csv('stacking_folds/train5.csv')

# Full training set -- likewise not needed by VADER.
train_all = pd.read_csv('stacking_folds/train_all.csv')
# NOTE(review): this rebinds `test`, replacing the test frame loaded at the top of the notebook.
test = pd.read_csv('stacking_folds/test.csv')

# One (train, predict_on) pair per output file; the same frame fills both slots.
data = [(frame, frame) for frame in (fold1, fold2, fold3, fold4, fold5, test)]

column = "phrase_emoticon_generic"

In [27]:
# With the six entries in `data`, this writes VADER_fold1..5.csv and VADER_test.csv under kfold/.
custom_k_fold_VADER(column=column, data=data, model_name="VADER")

### Train on all data

In [312]:
# Score every labelled phrase and save the pos / neg proportions.
# Relies on `column` and `sid` defined in earlier cells.
full_df = pd.read_csv("new_labels/ALL_LABELLED_DATA.csv")

full_df["polarity_scores"] = full_df[column].map(sid.polarity_scores)
full_df["pos"] = full_df["polarity_scores"].map(lambda scores: scores["pos"])
full_df["neg"] = full_df["polarity_scores"].map(lambda scores: scores["neg"])

model_name = "VADER"
# Output columns, already in their final order: positive, negative, label.
ordered_cols = [model_name+'_prob_pos',model_name+'_prob_neg',"label"]
df = pd.DataFrame(data=full_df[["pos","neg","label"]].values, columns=ordered_cols)

df.to_csv("fold_predictions/VADER/VADER_all.csv", index=False)