# GoEmotions

In [1]:
%pip install datasets pandas numpy scikit-learn tqdm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Fine-grained GoEmotions label -> one of the 8 coarse emotion dimensions.
# Groups are listed in the order their keys should appear in the resulting dict
# ("joy" shows up twice for that reason). "neutral" maps to None, i.e. no dimension.
# NOTE(review): the GoEmotions taxonomy also appears to include "curiosity" and
# "embarrassment"; neither has an entry here, so a `.get()` lookup yields None for
# them — confirm that dropping those labels is intended.
_COARSE_TO_FINE = (
    ("joy", ("admiration", "amusement")),
    ("acceptance", ("approval", "caring")),
    ("anticipation", ("desire", "optimism", "excitement")),
    ("anger", ("anger", "annoyance", "disapproval")),
    ("disgust", ("disgust",)),
    ("fear", ("fear", "nervousness")),
    ("joy", ("joy", "love", "gratitude", "pride", "relief")),
    ("sadness", ("sadness", "grief", "remorse", "disappointment")),
    ("surprise", ("surprise", "realization", "confusion")),
    (None, ("neutral",)),
)

GOEMOTION_TO_8 = {
    fine: coarse
    for coarse, fine_labels in _COARSE_TO_FINE
    for fine in fine_labels
}


In [3]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Download (or read from the local cache) all splits of the dataset.
dataset = load_dataset("go_emotions")

# Evaluation is run against the validation split only.
data = dataset["validation"]

# Label index -> label name lookup, read from the train split's "labels" feature.
labels_feature = dataset["train"].features["labels"]
label_names = labels_feature.feature.names


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# The eight coarse emotion dimensions, in the order used for every per-dimension dict.
EMOTIONS_8 = [
    "acceptance", "anger", "anticipation", "disgust",
    "fear", "joy", "sadness", "surprise",
]

def labels_to_8dim(label_indices):
    """Convert label indices into a multi-hot dict over EMOTIONS_8.

    Each index is resolved to a label name through `label_names` and then
    translated with `GOEMOTION_TO_8`. Labels with no mapping, or with a falsy
    mapping such as None, leave the result untouched.

    Returns:
        dict keyed by every name in EMOTIONS_8, each value 0 or 1.
    """
    multi_hot = dict.fromkeys(EMOTIONS_8, 0)

    # Lazy generator: lookups happen one index at a time, in input order.
    coarse_labels = (GOEMOTION_TO_8.get(label_names[idx]) for idx in label_indices)
    for coarse in coarse_labels:
        if not coarse:
            continue
        multi_hot[coarse] = 1
    return multi_hot


In [5]:
import ast
import json
import os

from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file into the process environment, then read the key.
load_dotenv()
api_key = os.getenv("API_KEY")

# The OpenAI client is pointed at Groq's OpenAI-style endpoint; each request times out after 10 s.
client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=api_key,
    timeout=10,
)

def _parse_model_reply(raw):
    """Turn the model's raw text reply into a Python object.

    An optional Markdown code fence around the reply is removed first. The
    remaining text is parsed with `ast.literal_eval`; if that fails, `json.loads`
    is tried so that JSON-only tokens (null / true / false) are accepted too.

    Raises:
        ValueError: if `raw` is not a string (e.g. the reply had no content).
        Exception: the original `ast.literal_eval` error when both parsers fail.
    """
    if not isinstance(raw, str):
        raise ValueError("model reply has no text content")

    text = raw.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```python); if everything is on a
        # single line, just peel the leading backticks off.
        fence_line, newline, rest = text.partition("\n")
        text = rest if newline else fence_line.lstrip("`")
        text = text.strip()
        if text.endswith("```"):
            text = text[:-3].rstrip()

    try:
        return ast.literal_eval(text)
    except Exception as literal_error:
        try:
            return json.loads(text)
        except ValueError:
            # Report the literal_eval failure: Python-literal syntax is what the prompt asks for.
            raise literal_error from None


def emotional_embedder(user_query):
    """Ask the LLM to score `user_query` on the 8 emotion dimensions.

    Args:
        user_query: text sent as the user message.

    Returns:
        The parsed reply — the prompt requests a list of dicts with "analysis",
        "dim" and "score" keys — or None when the reply cannot be parsed. On a
        parse failure the error and the raw reply are printed.

    Errors raised by the API call itself are not caught here.
    """
    chat_completion = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a master of sentiment analysis. Carefully discern the subtle emotions "
                    "underlying each interviewer's question. Analyze questions across 8 dimensions: "
                    "acceptance, anger, anticipation, disgust, fear, joy, sadness, surprise. "
                    "Score each from 1-10. Your answer must be a valid python list so that it can "
                    "be parsed directly, with no extra content! Format: "
                    "[{\"analysis\": <REASON>, \"dim\": \"joy\", \"score\": <SCORE>}, ...]"
                )
            },
            {
                "role": "user",
                "content": user_query
            }
        ],
        temperature=0.1,
        max_tokens=1024,
        stream=False,
        stop=None,
    )

    raw = chat_completion.choices[0].message.content

    try:
        return _parse_model_reply(raw)
    except Exception as e:
        print("PARSE ERROR:", e)
        print("RAW OUTPUT:", raw)
        return None

In [6]:
def _as_number(value):
    """Return `value` as a number, or None when it cannot be read as one."""
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        try:
            return float(value.strip())
        except ValueError:
            return None
    return None


def emotional_embedder_scores(user_query):
    """Return a {dimension: score} dict with an entry for every name in EMOTIONS_8.

    Calls `emotional_embedder` and flattens its list of {"dim": ..., "score": ...}
    records. A dimension gets 0 when it is absent from the reply, when its record
    is malformed, or when the whole reply could not be parsed (in which case
    `emotional_embedder` returns None). Dimension names are matched after
    stripping whitespace and lower-casing; numeric strings are converted to float.
    """
    result = emotional_embedder(user_query)

    scores = {}
    # Anything other than a list/tuple (notably None on a parse failure) means "no records".
    if isinstance(result, (list, tuple)):
        for item in result:
            if not isinstance(item, dict):
                continue
            dim = item.get("dim")
            score = _as_number(item.get("score"))
            if not isinstance(dim, str) or score is None:
                continue
            scores[dim.strip().lower()] = score

    # Ensure all 8 dimensions exist
    return {e: scores.get(e, 0) for e in EMOTIONS_8}


In [7]:
def normalize_scores(score_dict, threshold=5):
    """Binarize scores: 1 where the score is at least `threshold`, otherwise 0.

    Args:
        score_dict: mapping of name -> comparable score.
        threshold: cut-off, inclusive. Defaults to 5.

    Returns:
        A new dict with the same keys and 0/1 values.
    """
    flags = {}
    for dimension, score in score_dict.items():
        if score >= threshold:
            flags[dimension] = 1
        else:
            flags[dimension] = 0
    return flags


In [8]:
from tqdm import tqdm
import random

N_SAMPLES = 100
SEED = 42

# Seed the global RNG so the same validation examples are drawn on every run.
random.seed(SEED)

# random.sample raises ValueError when asked for more items than exist,
# so never request more than the split contains.
sample_size = min(N_SAMPLES, len(data))
indices = random.sample(range(len(data)), sample_size)

results = []

for i in tqdm(indices):
    # Fetch the example once rather than once per field.
    row = data[i]
    text = row["text"]
    labels = row["labels"]

    gt = labels_to_8dim(labels)                   # multi-hot ground truth over the 8 dimensions
    pred_scores = emotional_embedder_scores(text)  # raw per-dimension scores from the LLM
    pred = normalize_scores(pred_scores)           # thresholded to 0/1

    results.append({
        "text": text,
        "ground_truth": gt,
        "prediction": pred
    })


100%|██████████| 100/100 [04:21<00:00,  2.62s/it]


In [9]:
from sklearn.metrics import precision_recall_fscore_support

# Per-emotion columns of 0/1 flags, one entry per evaluated sample (in `results` order).
y_true = {
    emotion: [row["ground_truth"][emotion] for row in results]
    for emotion in EMOTIONS_8
}
y_pred = {
    emotion: [row["prediction"][emotion] for row in results]
    for emotion in EMOTIONS_8
}

# Each emotion is scored as its own binary task; zero_division=0 reports 0
# instead of warning when a metric's denominator is empty.
report = {}
for emotion in EMOTIONS_8:
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true[emotion],
        y_pred[emotion],
        average="binary",
        zero_division=0,
    )
    report[emotion] = {
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1": round(f1, 3),
    }


In [10]:
# One row per emotion, one column per metric (precision / recall / f1).
df_report = pd.DataFrame(report).T

# Macro average = unweighted column mean over the emotion rows, rounded to
# 3 decimals so it matches the precision of the per-emotion values.
df_report.loc["macro_avg"] = df_report.mean().round(3)

print(df_report)


              precision    recall     f1
acceptance      0.15300  0.818000  0.257
anger           0.25000  0.800000  0.381
anticipation    0.16100  0.556000  0.250
disgust         0.10000  1.000000  0.182
fear            0.07700  0.500000  0.133
joy             0.52200  0.857000  0.649
sadness         0.24100  0.778000  0.368
surprise        0.19000  0.500000  0.276
macro_avg       0.21175  0.726125  0.312


# Test 1
```text
              precision  recall        f1
acceptance     0.145000   0.889  0.250000
anger          0.303000   0.556  0.392000
anticipation   0.179000   1.000  0.303000
disgust        0.036000   1.000  0.069000
fear           0.273000   1.000  0.429000
joy            0.417000   0.714  0.526000
sadness        0.370000   0.769  0.500000
surprise       0.000000   0.000  0.000000
macro_avg      0.215375   0.741  0.308625
```

In [None]:
# Test 2
#               precision  recall        f1
# acceptance        0.161   0.909  0.274000
# anger             0.320   0.800  0.457000
# anticipation      0.161   0.556  0.250000
# disgust           0.095   1.000  0.174000
# fear              0.077   0.500  0.133000
# joy               0.556   0.893  0.685000
# sadness           0.318   0.778  0.452000
# surprise          0.200   0.500  0.286000
# macro_avg         0.236   0.742  0.338875

In [None]:
# TEST 3
