In [None]:
import pandas as pd
import numpy as np
import utils

### Read in Aggregated Data

In [None]:
# Load the three pre-split, emoticon-preprocessed labelled datasets in one pass.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/labelled_data/train_newpreproc_emoticon.csv",
        "data/labelled_data/val_newpreproc_emoticon.csv",
        "data/labelled_data/test_newpreproc_emoticon.csv",
    )
)
# Stack train and validation row-wise; each frame keeps its own row index.
trainval = pd.concat((train, val))

In [None]:
# Display the (rows, columns) shape of the combined train+validation frame and of the test frame.
trainval.shape,test.shape

### VADER

Developed in 2014, VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon- and rule-based sentiment analysis tool tuned to sentiment expressed in social media. It evaluates the text of a message and reports not just whether it is positive or negative, but also how intense that sentiment is.

It uses a dictionary of terms that it can evaluate. From the GitHub repository this includes examples like:

- Negations - a modifier that reverses the meaning of a phrase ("not great").
- Contractions - negations, but more complex ("wasn’t great").
- Punctuation - increased intensity ("It’s great!!!").
- Slang - variations of slang words such as "kinda", "sux", or "hella".
- It's even able to understand acronyms ("lol") and emoji (❤).



In [None]:
import nltk
# Fetch the 'vader_lexicon' resource before the analyzer is constructed below.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared analyzer instance: later cells score phrases with it and extend its `lexicon`
# in place, so every score depends on which lexicon-update cells have already run.
sid = SentimentIntensityAnalyzer()

In [None]:
# Sanity check: score a single word. "fresh" is one of the words given a custom
# valence in the lexicon-update cells further down, so this shows its score beforehand.
sid.polarity_scores("fresh")

In [None]:
# Baseline VADER: attach the full score dict returned by the analyzer to every phrase.
for frame in (trainval, test):
    frame["polarity_scores"] = frame["phrase_emoticon_generic"].map(sid.polarity_scores)
test.head()

In [None]:
def _compound_of(score_dict):
    """Return the "compound" entry of one VADER score dict."""
    return score_dict["compound"]


# Pull the compound score out of each stored score dict into its own column.
for frame in (trainval, test):
    frame["compound"] = frame["polarity_scores"].map(_compound_of)

In [None]:
def _polarity_class(c):
    """Map a compound score to a class: 1 if positive, 0 if exactly zero, otherwise -1."""
    if c > 0:
        return 1
    if c == 0:
        return 0
    return -1


# Turn the compound score into a three-way class prediction.
for frame in (trainval, test):
    frame["prediction"] = frame["compound"].map(_polarity_class)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Per-class precision/recall/F1 of the untuned VADER predictions on each split.
print("VADER Sentiment Analysis Model")
for split_name, frame in (("TrainingValidation Data", trainval), ("Test Data", test)):
    print(split_name)
    print(classification_report(frame.label, frame.prediction, digits=4))

In [None]:
# Confusion matrices (true label vs. prediction) for the untuned VADER predictions.
for split_name, frame in (("TrainingValidation Data", trainval), ("Test Data", test)):
    print(split_name)
    print(confusion_matrix(frame.label, frame.prediction))

In [None]:
# Examine wrong class -1
trainval.loc[(trainval.label == -1) & (trainval.prediction != -1)].phrase.values[:10]

##### Update Lexicon Dictionary (Round 1)

In [None]:
# Round 1 custom valences, grouped by aspect and then by score.
# NOTE(review): VADER appears to look up one whitespace-delimited token at a time,
# so multi-word keys ("long queue", "not worth", "value for money") likely never
# match during scoring - verify against the NLTK VADER implementation.

# Food quality terms.
new_food = {
    **dict.fromkeys(["tender", "fresh"], 4),
    **dict.fromkeys(["soggy", "jelat", "oily", "overcooked", "disappointed"], -4),
    "dry": -2,
}

# Waiting / queueing terms.
new_time = {
    **dict.fromkeys(["long queue", "queue", "slow", "crowd"], -4),
    "wait": -2,
}

# Price terms.
new_price = {
    **dict.fromkeys(["pricey", "expensive", "overpriced", "not worth"], -4),
    **dict.fromkeys(["cheap", "worth", "value for money"], 4),
}

# Portion-size terms.
new_portion = {
    "small": -4,
    **dict.fromkeys(["large", "generous"], 4),
}

In [None]:
# Merge the Round 1 aspect dictionaries into the analyzer's lexicon (in place).
for extra_terms in (new_food, new_time, new_price, new_portion):
    sid.lexicon.update(extra_terms)

In [None]:
def _class_from_compound1(c):
    """Map a compound score to a class: 1 if positive, 0 if exactly zero, otherwise -1."""
    if c > 0:
        return 1
    if c == 0:
        return 0
    return -1


# Re-score both splits with the Round 1 lexicon; results go into the *1 columns so
# the baseline columns stay available for comparison.
for frame in (trainval, test):
    frame["polarity_scores1"] = frame["phrase_emoticon_generic"].map(sid.polarity_scores)
    frame["compound1"] = frame["polarity_scores1"].map(lambda scores: scores["compound"])
    frame["prediction1"] = frame["compound1"].map(_class_from_compound1)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Per-class metrics after the Round 1 lexicon update.
print("VADER Sentiment Analysis Model (Tuned with new words)")
for split_name, frame in (("TrainingValidation Data", trainval), ("Test Data", test)):
    print(split_name)
    print(classification_report(frame.label, frame.prediction1, digits=4))

In [None]:
# Examine wrong class 1
trainval.loc[(trainval.label == 1) & (trainval.prediction1 != 1)].phrase.values[:10]

##### Update Lexicon Dictionary (Round 2)

In [None]:
# Round 2 custom valences, grouped by aspect and then by score. Keys already present
# in the analyzer's lexicon are overwritten when these dicts are passed to `update`.
# NOTE(review): VADER appears to look up one whitespace-delimited token at a time,
# so multi-word keys ("cravings satisfied", "long queue", "no waiting time",
# "not worth", "value for money") likely never match during scoring - verify
# against the NLTK VADER implementation.

# Food quality terms.
new_food = {
    **dict.fromkeys(["tender", "fresh", "cravings satisfied", "crispy", "sinful"], 2),
    **dict.fromkeys(
        ["soggy", "jelat", "oily", "overcooked", "dry", "disappointed", "tough", "cold"],
        -2,
    ),
}

# Waiting / queueing terms.
new_time = {
    **dict.fromkeys(["long queue", "queue", "wait", "slow", "crowd", "crowded"], -2),
    **dict.fromkeys(["no waiting time", "fast"], 2),
}

# Price terms.
new_price = {
    **dict.fromkeys(["pricey", "expensive", "overpriced", "not worth"], -2),
    **dict.fromkeys(
        ["cheap", "worth", "value for money", "reasonable", "reasonably", "affordable", "steal"],
        2,
    ),
}

# Portion-size terms.
new_portion = {
    "small": -2,
    **dict.fromkeys(["large", "generous"], 2),
    **dict.fromkeys(["sufficient", "enough"], 1),
}

In [None]:
# Merge the Round 2 aspect dictionaries into the analyzer's lexicon (in place),
# replacing the values of any keys that are already present.
for extra_terms in (new_food, new_time, new_price, new_portion):
    sid.lexicon.update(extra_terms)

In [None]:
def _class_from_compound2(c):
    """Map a compound score to a class: 1 if positive, 0 if exactly zero, otherwise -1."""
    if c > 0:
        return 1
    if c == 0:
        return 0
    return -1


# Re-score both splits with the Round 2 lexicon; results go into the *2 columns so
# the earlier rounds stay available for comparison.
for frame in (trainval, test):
    frame["polarity_scores2"] = frame["phrase_emoticon_generic"].map(sid.polarity_scores)
    frame["compound2"] = frame["polarity_scores2"].map(lambda scores: scores["compound"])
    frame["prediction2"] = frame["compound2"].map(_class_from_compound2)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Per-class metrics after the Round 2 lexicon update.
print("VADER Sentiment Analysis Model (Tuned with new words 2)")
for split_name, frame in (("TrainingValidation Data", trainval), ("Test Data", test)):
    print(split_name)
    print(classification_report(frame.label, frame.prediction2, digits=4))

## KFold CV For Stacking

In [None]:
def custom_k_fold_VADER(column, data, model_name, analyzer=None,
                        output_dir="data/fold_predictions/VADER/", n_folds=5):
    """Score each stacking fold with VADER and write the features to CSV.

    VADER is rule-based, so nothing is fitted here: the first frame of every
    pair in `data` is scored directly and its "pos" / "neg" proportions are
    written out as stacking features next to the label and aspect.

    Parameters
    ----------
    column : str
        Name of the text column to score.
    data : iterable of (DataFrame, DataFrame)
        One pair per output file. Only the first frame of each pair is used;
        it must contain `column`, "label" and "new_aspect_1". The input
        frames are not modified.
    model_name : str
        Prefix for the output column names and file names.
    analyzer : object, optional
        Anything exposing `polarity_scores(text)` that returns a dict with
        "pos" and "neg" keys. Defaults to the notebook-level `sid`.
    output_dir : str, optional
        Directory that receives the CSV files; created if it does not exist.
    n_folds : int, optional
        Number of leading entries of `data` written as
        `<model_name>_fold<i>.csv`. Every later entry is written to
        `<model_name>_test.csv`, so if more than one entry follows the folds
        the last one wins.

    Returns
    -------
    None
        The results are the CSV files, each with the columns
        `<model_name>_prob_pos`, `<model_name>_prob_neg`, "label", "aspect".
    """
    import os  # local import so the cell does not depend on the notebook's import cell

    if analyzer is None:
        analyzer = sid  # notebook-level SentimentIntensityAnalyzer

    # `to_csv` does not create missing directories.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    pos_col = model_name + '_prob_pos'
    neg_col = model_name + '_prob_neg'

    for fold_num, tf_combi in enumerate(data, start=1):
        frame = tf_combi[0]  # the second element of the pair is not needed for VADER

        scores = frame[column].map(analyzer.polarity_scores)

        # Build the output column by column (as plain arrays) so every column
        # keeps its own dtype and no index alignment is involved.
        df = pd.DataFrame({
            pos_col: scores.map(lambda score_dict: score_dict["pos"]).to_numpy(),
            neg_col: scores.map(lambda score_dict: score_dict["neg"]).to_numpy(),
            "label": frame["label"].to_numpy(),
            "aspect": frame["new_aspect_1"].to_numpy(),
        })

        if fold_num <= n_folds:
            file_name = model_name + '_fold' + str(fold_num) + '.csv'
        else:
            file_name = model_name + '_test.csv'

        df.to_csv(os.path.join(output_dir, file_name), index=False)

In [None]:
# Import Data
fold1 = pd.read_csv('data/stacking_folds/fold1.csv')
fold2 = pd.read_csv('data/stacking_folds/fold2.csv')
fold3 = pd.read_csv('data/stacking_folds/fold3.csv')
fold4 = pd.read_csv('data/stacking_folds/fold4.csv')
fold5 = pd.read_csv('data/stacking_folds/fold5.csv')

train1 = pd.read_csv('data/stacking_folds/train1.csv')
train2 = pd.read_csv('data/stacking_folds/train2.csv')
train3 = pd.read_csv('data/stacking_folds/train3.csv')
train4 = pd.read_csv('data/stacking_folds/train4.csv')
train5 = pd.read_csv('data/stacking_folds/train5.csv')

train_all = pd.read_csv('data/stacking_folds/train_all.csv')
test = pd.read_csv('data/stacking_folds/test.csv')

# store in suitable data structure
data = [(fold1, fold1), (fold2, fold2),(fold3, fold3), (fold4, fold4), (fold5, fold5), (test, test)]

column = "phrase_emoticon_generic"

In [None]:
# Score the five folds plus the test set; with these six entries the function writes
# VADER_fold1.csv ... VADER_fold5.csv and VADER_test.csv under data/fold_predictions/VADER/.
custom_k_fold_VADER(column=column, data=data, model_name="VADER")

### Train on all data

In [None]:
# save full model
full_df = pd.read_csv("data/stacking_folds/all_labelled_data.csv")

full_df["polarity_scores"] = full_df[column].map(lambda phrase : sid.polarity_scores(phrase))
full_df["pos"] = full_df["polarity_scores"].map(lambda score_dict : score_dict["pos"])
full_df["neg"] = full_df["polarity_scores"].map(lambda score_dict : score_dict["neg"])

model_name = "VADER"
# Create Dataframe and output
df = pd.DataFrame(data=full_df[["neg","pos","label"]].values, columns = [model_name+'_prob_neg', model_name+'_prob_pos',"label"])
# df.drop(columns= [model_name+'_prob_neu'])
ordered_cols = [model_name+'_prob_pos',model_name+'_prob_neg',"label"]
df=df[ordered_cols]

df.to_csv("data/fold_predictions/VADER/VADER_all.csv", index=False)