# GoEmotions → Ekman Mapping

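Aggregate the 27 fine-grained GoEmotions emotion labels into the six Ekman categories (anger, disgust, fear, joy, sadness, surprise), keeping `neutral` as a separate seventh class. An Ekman column is set to 1 whenever at least one of its constituent GoEmotions labels is set for that row.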

In [1]:
from __future__ import annotations

from pathlib import Path
from typing import Dict, List

import pandas as pd

# All locations are resolved relative to the directory the kernel was started in.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "dataset" / "go_emotions"
OUTPUT_DIR = BASE_DIR / "dataset" / "go_emotions_ekman"
# parents=True: without it, mkdir raises FileNotFoundError when the intermediate
# "dataset" directory does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Sorted so the shards are listed and processed in a stable, name-based order.
input_files = sorted(DATA_DIR.glob("goemotions_*.csv"))
print(f"Found {len(input_files)} GoEmotions shards")
for path in input_files:
    print(f" - {path.name}")

Found 3 GoEmotions shards
 - goemotions_1.csv
 - goemotions_2.csv
 - goemotions_3.csv


In [2]:
# Ekman category -> fine-grained GoEmotions labels that are folded into it.
# The key order is significant: it determines the order of the Ekman columns
# in the converted frames.
ekman_mapping: Dict[str, List[str]] = dict(
    anger=["anger", "annoyance", "disapproval"],
    disgust=["disgust"],
    fear=["fear", "nervousness"],
    joy=[
        "admiration", "amusement", "approval", "caring", "desire",
        "excitement", "gratitude", "joy", "love", "optimism",
        "pride", "relief",
    ],
    sadness=["sadness", "disappointment", "embarrassment", "grief", "remorse"],
    surprise=["confusion", "curiosity", "realization", "surprise"],
)

# Every source column the conversion relies on: the union of all mapped labels
# plus "neutral" (which is carried over rather than mapped), sorted by name.
all_required_labels = sorted(set().union(*ekman_mapping.values(), ["neutral"]))
print(f"Expecting {len(all_required_labels)} base emotion columns")
print(all_required_labels)

Expecting 28 base emotion columns
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'neutral', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise']


In [3]:
def to_ekman(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the fine-grained emotion columns of ``df`` into Ekman columns.

    Each key of the module-level ``ekman_mapping`` becomes a 0/1 column that is
    1 when the row-wise sum of its source label columns is greater than zero.
    The ``neutral`` column is carried over, cast to ``int``.

    Args:
        df: Frame containing every column named in ``all_required_labels``;
            any other columns are treated as metadata and passed through.

    Returns:
        A new frame with the metadata columns (in their original order)
        followed by the Ekman columns in ``ekman_mapping`` key order and then
        ``neutral``. The input frame is not modified.

    Raises:
        ValueError: If any column in ``all_required_labels`` is absent.
    """
    absent = [name for name in all_required_labels if name not in df.columns]
    if absent:
        raise ValueError(f"Dataframe missing columns: {absent}")

    result = df.copy()
    for target, members in ekman_mapping.items():
        # The right-hand side is evaluated before the assignment, so a target
        # that shares its name with one of its own members (e.g. "anger")
        # still aggregates the original fine-grained values.
        any_member_set = result[members].sum(axis=1) > 0
        result[target] = any_member_set.astype(int)
    result["neutral"] = result["neutral"].astype(int)

    # Metadata = every input column that is not one of the source labels.
    passthrough = []
    for column in df.columns:
        if column in all_required_labels:
            continue
        passthrough.append(column)

    selected = passthrough + [*ekman_mapping, "neutral"]
    return result[selected]


In [4]:
# Convert every shard, write one Ekman CSV per shard, then write the
# concatenation of all shards as a single combined CSV.

# pd.concat raises an opaque "No objects to concatenate" on an empty list, so
# fail up front with a message that names the real problem.
if not input_files:
    raise ValueError(
        f"No GoEmotions shards matching 'goemotions_*.csv' found in {DATA_DIR}"
    )

converted_frames = []
for path in input_files:
    print(f"Processing {path.name}")
    shard_df = pd.read_csv(path)
    ekman_df = to_ekman(shard_df)
    # goemotions_<n>.csv -> ekman_<n>.csv inside OUTPUT_DIR
    output_path = OUTPUT_DIR / path.name.replace("goemotions_", "ekman_")
    ekman_df.to_csv(output_path, index=False)
    print(f"  -> saved {len(ekman_df):,} rows to {output_path.relative_to(BASE_DIR)}")
    converted_frames.append(ekman_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each shard's own row numbers.
combined_df = pd.concat(converted_frames, ignore_index=True)
combined_path = OUTPUT_DIR / "goemotions_ekman_full.csv"
combined_df.to_csv(combined_path, index=False)
print(f"Combined dataset saved to {combined_path.relative_to(BASE_DIR)} with {len(combined_df):,} rows")

Processing goemotions_1.csv
  -> saved 70,000 rows to dataset/go_emotions_ekman/ekman_1.csv
Processing goemotions_2.csv
  -> saved 70,000 rows to dataset/go_emotions_ekman/ekman_2.csv
Processing goemotions_3.csv
  -> saved 71,225 rows to dataset/go_emotions_ekman/ekman_3.csv
Combined dataset saved to dataset/go_emotions_ekman/goemotions_ekman_full.csv with 211,225 rows


In [5]:
# Preview the first five rows of the combined Ekman frame.
combined_df.head(n=5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,anger,disgust,fear,joy,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,0,0,1,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,0,0,0,0,0,1
