## Computing sentiment and emotion vectors

### Language Model based approach: FEEL-IT

In [11]:
# sanity check

from transformers import pipeline
import pandas as pd
import re

# Load the two FEEL-IT checkpoints as text-classification pipelines.
# top_k matches the number of score columns collected for each model further
# down in this cell (four emotions, two sentiment polarities).
emo_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/feel-it-italian-emotion",
    top_k=4,
)
sent_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/feel-it-italian-sentiment",
    top_k=2,
)

# Hand-written probe sentences used to eyeball the classifier outputs.
# NOTE(review): the last two entries contain mis-encoded characters (e.g. "Ã¨"
# where "è" is presumably intended), and the final one also spans several lines
# and contains parentheses. This looks deliberate, to exercise clean_text on
# messy input; confirm before "fixing" the strings.
text_examples = [
    "sono felice",
    "La deforestazione è un male",
    "Questo è spaventoso",
    "Questa è una mela marcia",
    "mostro nero?",
    "lei è una bellezza terribile",
    "L'Cleanup Ocean Ã¨ un movimento internazionale che si Ã¨ formato per combattere la pollution del mare.",
    """La (tigre) 

    bianca Ã¨ una rara variante genetica della tigre reale (Panthera tigris), 
    
    caratterizzata da una particolare colorazione del mantello causata da una mutazione genetica. Questi bellissimi felini sono spesso oggetto di ammirazione e curiositÃ  per la loro singolare bellezza. Vive principalmente in India e in alcune parti del sud-est asiatico."""
]

def clean_text(text):
    """Normalise a raw text snippet before it is passed to the classifiers.

    The following steps are applied in order:
      1. every run of newlines (each optionally preceded by one whitespace
         character) becomes a single ".";
      2. every run of tabs becomes a single space;
      3. every character that is not a word character, "-", ".", "'", "!",
         "?" or a round/curly bracket becomes a space;
      4. all remaining brackets are deleted;
      5. runs of two or more whitespace characters collapse to one space;
      6. leading and trailing whitespace is stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings throughout: the earlier non-raw patterns relied on invalid
    # escape sequences such as "\s" and "\w", which Python warns about.
    text = re.sub(r"(?:\s?\n)+", ".", text)
    text = re.sub(r"\t+", " ", text)
    # Round and curly brackets are kept at this stage so that the next step
    # deletes them without leaving a space behind; square brackets are not in
    # the set and therefore turn into spaces here.
    text = re.sub(r"[^\w\-.'!?(){}]", " ", text)
    text = re.sub(r"[()\[\]{}]", "", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

# Clean every probe sentence, then score the cleaned versions with both models.
cleaned_text_examples = list(map(clean_text, text_examples))
emo_scores = emo_classifier(cleaned_text_examples)
sent_scores = sent_classifier(cleaned_text_examples)

# One list per output column. The label columns are filled by indexing with the
# label strings that the classifiers return.
EP_semantic_results = {
    column: []
    for column in (
        "text", "cleaned_text",
        "positive", "negative", "sent_sum",
        "joy", "sadness", "anger", "fear", "emo_sum",
    )
}

for raw, cleaned, sent_preds, emo_preds in zip(
    text_examples, cleaned_text_examples, sent_scores, emo_scores
):
    EP_semantic_results["text"].append(raw)
    EP_semantic_results["cleaned_text"].append(cleaned)
    # Label columns hold the score scaled by 100; the *_sum columns hold the
    # sum of the unscaled scores of one model for this text.
    for sum_column, preds in (("sent_sum", sent_preds), ("emo_sum", emo_preds)):
        total = 0
        for pred in preds:
            EP_semantic_results[pred["label"]].append(pred["score"] * 100)
            total += pred["score"]
        EP_semantic_results[sum_column].append(total)

df = pd.DataFrame(EP_semantic_results)
df

Device set to use cpu
Device set to use cpu


Unnamed: 0,text,cleaned_text,positive,negative,sent_sum,joy,sadness,anger,fear,emo_sum
0,sono felice,sono felice,99.9726,0.027401,1.0,99.90108,0.041839,0.017075,0.04,1.0
1,La deforestazione è un male,La deforestazione è un male,0.021573,99.978429,1.0,0.020934,92.929733,6.870111,0.179221,1.0
2,Questo è spaventoso,Questo è spaventoso,0.02175,99.97825,1.0,0.100756,0.10274,0.093424,99.703085,1.0
3,Questa è una mela marcia,Questa è una mela marcia,0.02188,99.978119,1.0,0.04351,70.56402,29.189795,0.202674,1.0
4,mostro nero?,mostro nero?,0.023057,99.976939,1.0,0.033823,3.354666,94.834334,1.777177,1.0
5,lei è una bellezza terribile,lei è una bellezza terribile,0.022565,99.97744,1.0,99.864703,0.107149,0.014439,0.013711,1.0
6,L'Cleanup Ocean Ã¨ un movimento internazionale...,L'Cleanup Ocean Ã un movimento internazionale ...,73.182118,26.817882,1.0,18.575227,48.435813,0.285598,32.70337,1.0
7,La (tigre) \n\n bianca Ã¨ una rara variante...,La tigre. bianca Ã una rara variante genetica ...,99.962687,0.037313,1.0,99.923551,0.029315,0.010889,0.036246,1.0


In [12]:
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm
import re

# Tokenizer options forwarded to the pipelines by get_semantic_EP.
tokenizer_kwargs = dict(padding=True, truncation=True, max_length=512)

# FEEL-IT emotion and sentiment checkpoints, loaded as text-classification
# pipelines.
emo_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/feel-it-italian-emotion",
    top_k=4,
)
sent_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/feel-it-italian-sentiment",
    top_k=2,
)


def clean_text(text):
    """Normalise a raw text snippet before it is passed to the classifiers.

    The following steps are applied in order:
      1. every run of newlines (each optionally preceded by one whitespace
         character) becomes a single ".";
      2. every run of tabs becomes a single space;
      3. every character that is not a word character, "-", ".", "'", "!",
         "?" or a round/curly bracket becomes a space;
      4. all remaining brackets are deleted;
      5. runs of two or more whitespace characters collapse to one space;
      6. leading and trailing whitespace is stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings throughout: the earlier non-raw patterns relied on invalid
    # escape sequences such as "\s" and "\w", which Python warns about.
    text = re.sub(r"(?:\s?\n)+", ".", text)
    text = re.sub(r"\t+", " ", text)
    # Round and curly brackets are kept at this stage so that the next step
    # deletes them without leaving a space behind; square brackets are not in
    # the set and therefore turn into spaces here.
    text = re.sub(r"[^\w\-.'!?(){}]", " ", text)
    text = re.sub(r"[()\[\]{}]", "", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

def get_semantic_EP(texts, text_name):
    """Score texts with the FEEL-IT sentiment and emotion classifiers.

    Each text is cleaned with ``clean_text`` and then passed to the
    module-level ``sent_classifier`` and ``emo_classifier`` pipelines. Every
    score is multiplied by 100 before it is stored.

    Args:
        texts: iterable of strings to score.
        text_name: base name of the text column; the cleaned text is stored
            under ``text_name + "_cleaned"``.

    Returns:
        A ``(cleaned_text_name, df)`` tuple. ``df`` has one row per input text
        and the columns ``cleaned_text_name``, "positive", "negative", "joy",
        "sadness", "anger" and "fear".

    Raises:
        KeyError: if a classifier returns a label that is not one of the
            score columns listed above.
    """
    cleaned_texts = [clean_text(text) for text in texts]
    cleaned_text_name = text_name + "_cleaned"

    # Pass the tokenizer options on the first call. Previously a bare `except`
    # caught *any* failure (including KeyboardInterrupt) and re-ran both
    # classifiers over the whole input with these options. Asking for
    # truncation up front yields the same scores in a single pass and lets
    # genuine errors surface.
    emo_scores = emo_classifier(cleaned_texts, **tokenizer_kwargs)
    sent_scores = sent_classifier(cleaned_texts, **tokenizer_kwargs)

    EP_semantic_results = {
        cleaned_text_name: [],
        "positive": [],
        "negative": [],
        "joy": [],
        "sadness": [],
        "anger": [],
        "fear": [],
    }

    # Inference has already finished at this point; the progress bar only
    # tracks the collation of the results into columns.
    for cleaned_text, sent_score, emo_score in tqdm(
        zip(cleaned_texts, sent_scores, emo_scores), total=len(cleaned_texts)
    ):
        EP_semantic_results[cleaned_text_name].append(cleaned_text)
        for sent_dict in sent_score:
            EP_semantic_results[sent_dict["label"]].append(sent_dict["score"] * 100)
        for emo_dict in emo_score:
            EP_semantic_results[emo_dict["label"]].append(emo_dict["score"] * 100)

    df = pd.DataFrame.from_dict(EP_semantic_results)
    return cleaned_text_name, df

Device set to use cpu
Device set to use cpu


In [None]:
# Score the InsideOut responses and write the table to ../Results.
InsideOut = pd.read_csv("../Data/InsideOutData_compiled.csv")
col_name, IO_df = get_semantic_EP(InsideOut["Resp"].tolist(), "Resp")

# Metadata copied over from the input file; "Task Sentiment" contains a space,
# so the new columns are passed to assign() as a dict.
IO_metadata = {
    "QID": InsideOut["QID"],
    "rank": InsideOut["Rank"],
    "Task Sentiment": InsideOut["Task Sentiment"],
    "IAS": ["Bing"] * len(InsideOut),
}
IO_columns = [
    "QID", "Task Sentiment", "IAS", "rank", col_name,
    "positive", "negative", "joy", "sadness", "anger", "fear",
]
IO_df = IO_df.assign(**IO_metadata)[IO_columns]

IO_df.to_csv("../Results/FEELIT_queryWise_InsidOutEP.csv", index=False)
IO_df.head()

  0%|          | 0/1739 [00:00<?, ?it/s]

Unnamed: 0,QID,Task Sentiment,IAS,rank,Resp_cleaned,positive,negative,joy,sadness,anger,fear
0,qGEN1,,Bing,1,Tornado - Wikipedia Tornado - Wikipedia Tornad...,99.909377,0.090624,11.563355,86.946112,0.490834,0.999694
1,qGEN2,,Bing,2,Extreme up-close video of tornado near Wray CO...,99.975449,0.024549,99.937147,0.027813,0.013232,0.021811
2,qGEN3,,Bing,3,Tornado facts and information. Tornadoes are v...,99.969923,0.030083,99.896598,0.048017,0.022982,0.032408
3,qGEN4,,Bing,4,Tornado Central - weather.com. Tornado Season ...,0.666457,99.333537,2.168188,84.036607,0.040767,13.754435
4,qGEN5,,Bing,5,Tornadoes Ready.gov. Tornadoes can destroy bui...,0.089091,99.910909,0.272096,0.116199,99.456948,0.154748


In [None]:
# Name of the query CSV inside ../Data/. It is left blank here and must be
# filled in before this cell is run.
query_file_name = ""
if not query_file_name:
    # Fail fast with an actionable message instead of letting pd.read_csv
    # choke on the bare "../Data/" directory path.
    raise ValueError(
        "query_file_name is empty: set it to the query CSV file inside ../Data/"
    )
query_file_path = "../Data/"+query_file_name

# Score the queries and write the table to ../Results.
queries = pd.read_csv(query_file_path)
col_name, queries_df = get_semantic_EP(queries["Query"].tolist(), "Query")
queries_df["QID"] = queries["QID"]
queries_df = queries_df.loc[:, ["QID", col_name, "positive", "negative", "joy", "sadness", "anger", "fear"]]
queries_df.to_csv("../Results/FEELIT_queryWise_queryEP.csv", index=False)
queries_df.head()

  0%|          | 0/293 [00:00<?, ?it/s]

Unnamed: 0,QID,Query_cleaned,positive,negative,joy,sadness,anger,fear
0,qGEN1,tornado,0.058053,99.941945,0.071032,99.501771,0.37492,0.05228
1,qGEN2,tornado,0.058053,99.941945,0.071032,99.501771,0.37492,0.05228
2,qGEN3,piramide egizia,0.035509,99.964488,18.79262,1.731412,11.087692,68.388271
3,qGEN4,piramidi,99.916172,0.083829,94.845456,2.796574,0.477019,1.88095
4,qGEN5,qual Ã la piramide egizia piÃ¹ alta,99.960083,0.03992,99.873513,0.039468,0.023573,0.063449


In [None]:
# Score the Gemma responses and write the table to ../Results.
gemma_resp = pd.read_csv("../Data/gemma_resp.csv")
col_name, gemma_df = get_semantic_EP(gemma_resp["Resp"].tolist(), "Resp")

gemma_columns = [
    "QID", "IAS", col_name,
    "positive", "negative", "joy", "sadness", "anger", "fear",
]
gemma_df = gemma_df.assign(
    QID=gemma_resp["QID"],
    IAS=["Gemma"] * len(gemma_resp),
)[gemma_columns]

gemma_df.to_csv("../Results/FEELIT_queryWise_gemmaEP.csv", index=False)
gemma_df.head()

  0%|          | 0/293 [00:00<?, ?it/s]

Unnamed: 0,QID,IAS,Resp_cleaned,positive,negative,joy,sadness,anger,fear
0,qGEN1,Gemma,Il tornado è un'enorme tempesta atmosferica co...,18.078068,81.921935,83.850652,5.296346,0.019608,10.833394
1,qGEN2,Gemma,Il tornado è un'enorme tempesta atmosferica co...,18.078068,81.921935,83.850652,5.296346,0.019608,10.833394
2,qGEN3,Gemma,La piramide egizia è un monumento funerario di...,99.94092,0.059085,93.032485,2.6862,0.483063,3.798253
3,qGEN4,Gemma,Le piramidi sono strutture monumentali che si ...,99.890006,0.109995,99.629623,0.133649,0.037645,0.199084
4,qGEN5,Gemma,La piramide più alta dell'antico Egitto è la P...,99.952531,0.047469,99.833399,0.034575,0.03203,0.1


In [None]:
# Score the GPT responses and write the table to ../Results.
gpt_resp = pd.read_csv("../Data/gpt_resp.csv")
col_name, gpt_df = get_semantic_EP(gpt_resp["Resp"].tolist(), "Resp")

gpt_columns = [
    "QID", "IAS", col_name,
    "positive", "negative", "joy", "sadness", "anger", "fear",
]
gpt_df = gpt_df.assign(
    QID=gpt_resp["QID"],
    IAS=["GPT"] * len(gpt_resp),
)[gpt_columns]

gpt_df.to_csv("../Results/FEELIT_queryWise_gptEP.csv", index=False)
gpt_df.head()

  0%|          | 0/293 [00:00<?, ?it/s]

Unnamed: 0,QID,IAS,Resp_cleaned,positive,negative,joy,sadness,anger,fear
0,qGEN1,GPT,Un tornado è un violento vortice d'aria in rot...,0.023756,99.976248,0.056199,0.318655,0.10993,99.515212
1,qGEN2,GPT,Il termine tornado si riferisce a un violento ...,0.074121,99.925882,0.098063,0.185901,0.073591,99.64245
2,qGEN3,GPT,Le piramidi egizie sono monumenti funerari cos...,99.842525,0.15748,99.892104,0.040411,0.013365,0.054116
3,qGEN4,GPT,Le piramidi sono strutture architettoniche di ...,99.95808,0.041924,99.918371,0.033479,0.011711,0.03643
4,qGEN5,GPT,La piramide egizia più alta è la Grande Pirami...,99.972552,0.027448,99.938476,0.025319,0.012664,0.023536


In [None]:
# Score the Bing results and write the table to ../Results.
bing_resp = pd.read_csv("../Data/bing_resp.csv")
col_name, bing_df = get_semantic_EP(bing_resp["Resp"].tolist(), "Resp")

bing_columns = [
    "QID", "IAS", "rank", col_name,
    "positive", "negative", "joy", "sadness", "anger", "fear",
]
bing_df = bing_df.assign(
    QID=bing_resp["QID"],
    rank=bing_resp["Rank"],
    IAS=["Bing"] * len(bing_resp),
)[bing_columns]

bing_df.to_csv("../Results/FEELIT_queryWise_bingEP.csv", index=False)
bing_df.head()

  0%|          | 0/2925 [00:00<?, ?it/s]

Unnamed: 0,QID,IAS,rank,Resp_cleaned,positive,negative,joy,sadness,anger,fear
0,qGEN1,Bing,1,Tromba d'aria - Wikipedia. Una tromba d'aria o...,0.043484,99.956518,0.647116,5.918216,0.050854,93.383813
1,qGEN1,Bing,2,Cos'è un tornado? Come si forma? - Ilmeteo.net...,0.026361,99.973637,0.169292,0.259403,0.040697,99.530607
2,qGEN1,Bing,3,Che cosa sono e come nascono i tornado? - Focu...,0.029268,99.970728,0.216355,0.614346,0.035964,99.133331
3,qGEN1,Bing,4,Trombe d aria e tornado cosa sono come si svil...,0.219933,99.780065,98.734915,0.069436,0.023516,1.172134
4,qGEN1,Bing,5,Trombe d aria e tornado cosa sono perché nasco...,0.020265,99.97974,0.018684,94.916797,2.67479,2.389724
