# GoEmotions — Dataset Exploration

This notebook loads the GoEmotions dataset and explores the label distribution, co-occurrence patterns, and example comments.

In [None]:
# Install dependencies if needed: uncomment the line below and run this cell once.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install datasets pandas matplotlib seaborn

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datasets import load_dataset

# Load GoEmotions using the "simplified" config: it provides train/validation/test
# splits and a multi-label `labels` column of integer class ids, both of which the
# cells below rely on (ds["validation"], ds["test"], row["labels"]).
# The "raw" config only exposes a single train split with one column per emotion,
# so those lookups would raise KeyError.
ds = load_dataset("google-research-datasets/go_emotions", "simplified")
print(ds)

In [None]:
# Convert each split to a pandas DataFrame and save it as a CSV file.
df_train = ds["train"].to_pandas()
df_valid = ds["validation"].to_pandas()
df_test  = ds["test"].to_pandas()

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails with an OSError.
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): `labels` is a list-valued column; CSV stores it as its string
# representation, so readers must parse it back — confirm this is acceptable.
df_train.to_csv(artifacts_dir / "goemotions_train.csv", index=False)
df_valid.to_csv(artifacts_dir / "goemotions_validation.csv", index=False)
df_test.to_csv(artifacts_dir / "goemotions_test.csv", index=False)

print(f"Train: {len(df_train)} | Validation: {len(df_valid)} | Test: {len(df_test)}")
# Last expression of the cell: rendered as a preview table of the train split.
df_train.head()

In [None]:
# The 28 original GoEmotions categories (27 emotions followed by neutral).
# Position in this list is what integer label ids are looked up against,
# so the order must not be changed.
ORIGINAL_LABELS = (
    "admiration amusement anger annoyance approval caring confusion "
    "curiosity desire disappointment disapproval disgust embarrassment "
    "excitement fear gratitude grief joy love nervousness optimism "
    "pride realization relief remorse sadness surprise neutral"
).split()

# Coarse 7-class taxonomy: each key lists the original labels folded into it.
LABEL_MAP_7 = {
    "anger": "anger annoyance disapproval".split(),
    "disgust": "disgust".split(),
    "fear": "fear nervousness".split(),
    "joy": (
        "joy amusement approval excitement gratitude "
        "love optimism pride relief admiration"
    ).split(),
    "neutral": "neutral realization".split(),
    "sadness": "sadness disappointment grief remorse embarrassment".split(),
    "surprise": "surprise confusion curiosity desire caring".split(),
}

print("Mapping defined.")

In [None]:
# Label frequency distribution (original 28)
# Start every label at zero so categories that never occur still get a bar.
label_counts = {label: 0 for label in ORIGINAL_LABELS}
# Read only the `labels` column rather than materialising every full row;
# each entry is the list of integer label ids attached to one comment.
for label_ids in ds["train"]["labels"]:
    for label in label_ids:
        label_counts[ORIGINAL_LABELS[label]] += 1

counts_series = pd.Series(label_counts).sort_values(ascending=False)

# Labels occurring more often than this are drawn in the accent colour.
HIGHLIGHT_THRESHOLD = 5000

plt.figure(figsize=(14, 5))
colors = ['#9B59B6' if c > HIGHLIGHT_THRESHOLD else '#444' for c in counts_series.values]
plt.bar(counts_series.index, counts_series.values, color=colors)
plt.xticks(rotation=45, ha='right', fontsize=9)
# Axis labels so the figure is readable on its own (e.g. in the saved PNG).
plt.xlabel("Emotion label")
plt.ylabel("Occurrences in train split")
plt.title("GoEmotions — Label Frequency (Train Split)", fontsize=13)
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder exists.
Path("artifacts").mkdir(parents=True, exist_ok=True)
plt.savefig("artifacts/label_distribution.png", dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Show up to three sample comments for each class of the 7-way taxonomy.
# A comment qualifies when at least one of its labels belongs to the class.
for emotion_7, sub_labels in LABEL_MAP_7.items():
    print(f"\n=== {emotion_7.upper()} ===")
    wanted = set(sub_labels)
    shown = 0
    for row in ds["train"]:
        # Translate the row's integer label ids into their label names.
        names = {ORIGINAL_LABELS[idx] for idx in row["labels"]}
        if wanted.isdisjoint(names):
            continue
        # Truncate long comments to the first 120 characters.
        print(f"  → {row['text'][:120]}")
        shown += 1
        if shown == 3:
            break