In [None]:
# Fetch the HypEmo repository. Later cells read its CSV files from
# /content/HypEmo (assumes the working directory is /content, as on Colab — confirm).
!git clone https://github.com/hapy-ditto/HypEmo.git

Cloning into 'HypEmo'...
remote: Enumerating objects: 106, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 106 (delta 4), reused 1 (delta 0), pack-reused 96[K
Receiving objects: 100% (106/106), 3.14 MiB | 22.67 MiB/s, done.
Resolving deltas: 100% (20/20), done.


In [None]:
import pandas as pd
# Model predictions for the test split; the columns are 'predict' and 'output'.
output = pd.read_csv('/content/HypEmo/output.csv')

In [None]:
# GoEmotions test split shipped with the repository (text, aug_text, label).
test_data = pd.read_csv('/content/HypEmo/data/go_emotion/test.csv')

In [None]:
# Preview the first rows of the test split.
test_data.head()

Unnamed: 0,text,aug_text,label
0,i’m really sorry about your situation :( altho...,"i'm very sorry about your situation, though i ...",25
1,it's wonderful because it's awful. at not with.,it's wonderful because it's scary.,0
2,"kings fan here, good luck to you guys! will be...","good luck, king's fans!",13
3,"i didn't know that, thank you for teaching me ...","i don't know, thanks for teaching me something...",15
4,thank you for asking questions and recognizing...,and i admit that there are things you may not ...,15


In [None]:
# (rows, columns) of the test split.
test_data.shape

(2984, 3)

In [None]:
# (rows, columns) of the predictions; the row count should equal test_data's,
# because the two frames are combined row by row in the next step.
output.shape

(2984, 2)

In [None]:
# Attach the source sentences to the model predictions. Rows are paired by
# position, so both frames get a fresh 0..n-1 index *before* concatenating:
# pd.concat(axis=1) aligns on index labels, and resetting the index afterwards
# cannot repair a misalignment.
assert len(test_data) == len(output), "test set and predictions differ in length"

# Selecting only the prediction columns keeps this cell idempotent: running it
# again does not stack a second copy of 'text'/'aug_text' onto `output`.
output = pd.concat(
    [
        test_data[['text', 'aug_text']].reset_index(drop=True),
        output[['predict', 'output']].reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Preview the combined frame: text columns followed by 'predict' and 'output'.
output.head()

Unnamed: 0,text,aug_text,predict,output
0,i’m really sorry about your situation :( altho...,"i'm very sorry about your situation, though i ...",24,25
1,it's wonderful because it's awful. at not with.,it's wonderful because it's scary.,0,0
2,"kings fan here, good luck to you guys! will be...","good luck, king's fans!",13,13
3,"i didn't know that, thank you for teaching me ...","i don't know, thanks for teaching me something...",15,15
4,thank you for asking questions and recognizing...,and i admit that there are things you may not ...,15,15


## Error Analysis

In [None]:
# List the columns available for the error analysis.
output.columns

Index(['text', 'aug_text', 'predict', 'output'], dtype='object')

In [None]:
# Save the combined text + prediction table to the current working directory.
# to_csv writes the row index as an unnamed first column by default.
# NOTE(review): the label-name columns are added in a later cell, so they are
# not part of this file — confirm that is intended.
output.to_csv('hypmo_output_analysis.csv')

In [None]:
# Emotion names listed in class-id order: a name's position is its integer id.
label2idx = {
  name: idx
  for idx, name in enumerate([
    'admiration', 'amusement', 'anger',
    'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire',
    'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy',
    'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise',
  ])
}
# Inverse lookup: integer id -> emotion name.
idx2label = {idx: name for name, idx in label2idx.items()}
# Translate the numeric class ids into emotion names, for the model's
# prediction ('predict') and for the reference label ('output').
for id_col, name_col in (('predict', 'predict_label'), ('output', 'true_label')):
  output[name_col] = output[id_col].replace(idx2label)

### Analyze Using the GoEmotions Taxonomy

In [None]:
from sklearn.metrics import f1_score, recall_score
# Fine-grained emotion names in class-id order. A concrete list (rather than
# a live dict view) is kept so later cells can reuse it as a column.
labels = list(label2idx)

f1_scores = []
recall_scores = []

# One-vs-rest evaluation: each emotion in turn is treated as the positive class.
for label in labels:
  true_labels = (output['true_label'] == label).astype(int)
  predicted_labels = (output['predict_label'] == label).astype(int)

  # zero_division=0 keeps the score at 0.0 for an emotion that has no true
  # samples or is never predicted, without an UndefinedMetricWarning.
  f1 = f1_score(true_labels, predicted_labels, zero_division=0)
  recall = recall_score(true_labels, predicted_labels, zero_division=0)

  f1_scores.append(f1)
  recall_scores.append(recall)

In [None]:
# Inspect every test example whose reference label is 'grief'.
output.loc[output['true_label'].eq('grief')]

Unnamed: 0,text,aug_text,predict,output,predict_label,true_label
419,[name] death is just so..... senseless. why? w...,why why why,6,16,confusion,grief
1676,rip the guy from psych,rip that guy out of his mind.,2,16,anger,grief


In [None]:
# One row per label with its F1 and recall, built from the lists filled above.
score_columns = dict(Label=labels, F1_Score=f1_scores, Recall=recall_scores)
result_df = pd.DataFrame(score_columns)

In [None]:
# Show the per-label F1 / recall table.
result_df

Unnamed: 0,Label,F1_Score,Recall
0,admiration,0.732782,0.764368
1,amusement,0.833787,0.822581
2,anger,0.514768,0.465649
3,annoyance,0.354312,0.391753
4,approval,0.470588,0.440678
5,caring,0.477778,0.5
6,confusion,0.392523,0.43299
7,curiosity,0.640212,0.6875
8,desire,0.465116,0.357143
9,disappointment,0.359551,0.363636


In [None]:
# Inspect every test example the model labelled as 'pride'.
output.loc[output['predict_label'].eq('pride')]

Unnamed: 0,text,aug_text,predict,output,predict_label,true_label
89,proud of you.,i'm proud of you.,21,0,pride,admiration
396,i'm flattered but i'm a good [religion] scarf,"i'm flattered, but i'm a good scarf.",21,0,pride,admiration
1011,"eh, says who? anyway, i like being by myself. ...","anyway, i like being alone. i'm alone now.",21,7,pride,curiosity
2563,"boy what an accomplishment, so proud!","boy, what an achievement, so proud!",21,21,pride,pride
2703,too damn often!! and i’m cute as hell! i’m mad!!,i'm so cute!,21,3,pride,annoyance
2884,i am proud to be racist no one in real life wi...,"i'm proud to be a racist, and no one in real l...",21,21,pride,pride
2906,i have more faith in [name] than anyone on the...,i believe in three strange worlds more than an...,21,20,pride,optimism


### Analyze Using Ekman's Grouping Method

In [None]:
# Ekman-style grouping: each basic emotion lists the fine-grained labels
# that are folded into it.
senti2label = dict(
  anger=["anger", "annoyance", "disapproval"],
  disgust=["disgust"],
  fear=["fear", "nervousness"],
  joy=["joy", "amusement", "approval", "excitement", "gratitude", "love", "optimism", "relief", "pride", "admiration", "desire", "caring"],
  sadness=["sadness", "disappointment", "embarrassment", "grief", "remorse"],
  surprise=["surprise", "realization", "confusion", "curiosity"],
)

# Reverse lookup: fine-grained label -> basic emotion.
label2senti = {
  fine: coarse
  for coarse, members in senti2label.items()
  for fine in members
}

# Collapse the fine-grained emotion names into their basic-emotion group,
# for the predicted and the reference label alike.
for fine_col, coarse_col in (('predict_label', 'predict_senti'), ('true_label', 'true_senti')):
  output[coarse_col] = output[fine_col].replace(label2senti)

In [None]:
labels = senti2label.keys()  # the basic-emotion group names

f1_scores, recall_scores = [], []

# One-vs-rest evaluation: each group in turn is treated as the positive class.
for senti in labels:
  is_true = output['true_senti'].eq(senti).astype(int)
  is_pred = output['predict_senti'].eq(senti).astype(int)

  f1_scores.append(f1_score(is_true, is_pred))
  recall_scores.append(recall_score(is_true, is_pred))

In [None]:
# One row per group with its F1 and recall; the bare name displays the table.
score_columns = dict(Label=labels, F1_Score=f1_scores, Recall=recall_scores)
result_df = pd.DataFrame(score_columns)
result_df

Unnamed: 0,Label,F1_Score,Recall
0,anger,0.654187,0.638462
1,disgust,0.536232,0.486842
2,fear,0.75,0.74026
3,joy,0.883575,0.875858
4,sadness,0.660342,0.671815
5,surprise,0.665971,0.710468


### Analyze Using Sentiment Taxonomy

In [None]:
# Sentiment polarity grouping: each polarity lists the fine-grained emotion
# labels assigned to it.
group2label = dict(
  positive=["amusement", "excitement", "joy", "love", "desire", "optimism", "caring", "pride", "admiration", "gratitude", "relief", "approval"],
  negative=["fear", "nervousness", "remorse", "embarrassment", "disappointment", "sadness", "grief", "disgust", "anger", "annoyance", "disapproval"],
  ambiguous=["realization", "surprise", "curiosity", "confusion"],
)

In [None]:
# Reverse lookup: fine-grained emotion label -> sentiment polarity.
label2group = {
  fine: polarity
  for polarity, members in group2label.items()
  for fine in members
}

In [None]:
# Collapse the fine-grained emotion names into their sentiment polarity,
# for the predicted and the reference label alike.
for fine_col, polarity_col in (('predict_label', 'predict_group'), ('true_label', 'true_group')):
  output[polarity_col] = output[fine_col].replace(label2group)

In [None]:
labels = group2label.keys()  # the sentiment polarity names

f1_scores, recall_scores = [], []

# One-vs-rest evaluation: each polarity in turn is treated as the positive class.
for polarity in labels:
  is_true = output['true_group'].eq(polarity).astype(int)
  is_pred = output['predict_group'].eq(polarity).astype(int)

  f1_scores.append(f1_score(is_true, is_pred))
  recall_scores.append(recall_score(is_true, is_pred))

In [None]:
# One row per polarity with its F1 and recall; the bare name displays the table.
score_columns = dict(Label=labels, F1_Score=f1_scores, Recall=recall_scores)
result_df = pd.DataFrame(score_columns)
result_df

Unnamed: 0,Label,F1_Score,Recall
0,positive,0.883575,0.875858
1,negative,0.796943,0.783262
2,ambiguous,0.665971,0.710468


### Analyze the correlation between training data size and F1-score

In [None]:
# Load the training split and add the emotion name for each numeric label.
train_data = (
    pd.read_csv('/content/HypEmo/data/go_emotion/train.csv')
    .assign(true_label=lambda frame: frame['label'].replace(idx2label))
)

# Number of training examples per emotion, as a two-column frame
# ('true_label', 'counts').
train_goe_label = (
    train_data
    .groupby('true_label')
    .size()
    .reset_index(name='counts')
)

In [None]:
# Inspect the training examples labelled 'admiration'.
train_data.loc[train_data['true_label'].eq('admiration')]

Unnamed: 0,text,aug_text,label,true_label
5,damn youtube and outrage drama is super lucrat...,and the angry play is a super-profit re-examin...,0,admiration
18,famous for his 3-4 defense,he's famous for his 3-4 defense.,0,admiration
28,twilight... still a better love story than the...,dawn... is still a better love story than the ...,0,admiration
30,what a wonderful world,what a wonderful world.,0,admiration
44,you just make her sound awesome.,you just make her sound awesome.,0,admiration
...,...,...,...,...
23447,kirkland liquor is amazing. i can get a cheap ...,"i could buy a cheap bottle of vodka or scotch,...",0,admiration
23468,wow nice way to live in an echo chamber.,wow. that's a good way to live in the echo room.,0,admiration
23470,"even when it's bad, it's still pretty good","even if it's bad, it's not bad.",0,admiration
23471,this is my favorite reddit conversation i’ve s...,this is my favorite redacted conversation.,0,admiration


In [None]:
# Recompute the per-emotion scores: `labels`, `f1_scores`, `recall_scores` and
# `result_df` were overwritten by the coarser-grained analyses in earlier cells.
labels = list(label2idx)  # concrete list of emotion names, in class-id order
f1_scores = []
recall_scores = []

# One-vs-rest evaluation: each emotion in turn is treated as the positive class.
for label in labels:
  true_labels = (output['true_label'] == label).astype(int)
  predicted_labels = (output['predict_label'] == label).astype(int)

  # zero_division=0 keeps the score at 0.0 for an emotion that has no true
  # samples or is never predicted, without an UndefinedMetricWarning.
  f1 = f1_score(true_labels, predicted_labels, zero_division=0)
  recall = recall_score(true_labels, predicted_labels, zero_division=0)

  f1_scores.append(f1)
  recall_scores.append(recall)
result_df = pd.DataFrame({'Label': labels, 'F1_Score': f1_scores, 'Recall': recall_scores})

In [None]:
# Pair each emotion's training-set size with its test F1 / recall. Both key
# columns ('true_label' and 'Label') are kept in the merged frame.
result = train_goe_label.merge(
    result_df,
    left_on='true_label',
    right_on='Label',
)

In [None]:
# Show training counts side by side with the per-emotion scores.
result

Unnamed: 0,true_label,counts,Label,F1_Score,Recall
0,admiration,2710,admiration,0.732782,0.764368
1,amusement,1652,amusement,0.833787,0.822581
2,anger,1025,anger,0.514768,0.465649
3,annoyance,1451,annoyance,0.354312,0.391753
4,approval,1873,approval,0.470588,0.440678
5,caring,649,caring,0.477778,0.5
6,confusion,858,confusion,0.392523,0.43299
7,curiosity,1389,curiosity,0.640212,0.6875
8,desire,389,desire,0.465116,0.357143
9,disappointment,709,disappointment,0.359551,0.363636
