In [2]:
import pandas as pd
import json
from sklearn.utils import resample
import numpy as np

In [3]:
# Load the goemotions_1.csv file into a pandas DataFrame
goemotions_1_df = pd.read_csv("data/goemotions1.csv")
goemotions_2_df = pd.read_csv("data/goemotions2.csv")
goemotions_3_df = pd.read_csv("data/goemotions3.csv")


# Combine the datasets into a single DataFrame
# Load the emotions.txt file into a pandas DataFrame
emotions_df = pd.read_csv("data/emotions.txt", header=None, names=["emotion"])

# Combine the datasets into a single DataFrame
combined_df = pd.concat([goemotions_1_df, goemotions_2_df, goemotions_3_df], ignore_index=True)

In [4]:
# Print a nicely formatted list of all features in combined_df
print("Features in combined_df:")
for feature in combined_df.columns:
    print(f"- {feature}")

Features in combined_df:
- text
- id
- author
- subreddit
- link_id
- parent_id
- created_utc
- rater_id
- example_very_unclear
- admiration
- amusement
- anger
- annoyance
- approval
- caring
- confusion
- curiosity
- desire
- disappointment
- disapproval
- disgust
- embarrassment
- excitement
- fear
- gratitude
- grief
- joy
- love
- nervousness
- optimism
- pride
- realization
- relief
- remorse
- sadness
- surprise
- neutral


In [5]:
# Remove the specified features from combined_df
features_to_remove = ['author', 'id', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear']
combined_df = combined_df.drop(columns=features_to_remove)

#remove duplicates
#full obeservation duplicates  
combined_df = combined_df.drop_duplicates()
#text duplicates
combined_df = combined_df.drop_duplicates(subset='text')

print("Features in combined_df:")
for feature in combined_df.columns:
    print(f"- {feature}")


Features in combined_df:
- text
- admiration
- amusement
- anger
- annoyance
- approval
- caring
- confusion
- curiosity
- desire
- disappointment
- disapproval
- disgust
- embarrassment
- excitement
- fear
- gratitude
- grief
- joy
- love
- nervousness
- optimism
- pride
- realization
- relief
- remorse
- sadness
- surprise
- neutral


In [6]:
import pandas as pd
import json
from sklearn.utils import resample
import numpy as np
import pandas as pd
import json

def map_to_ekman(df: pd.DataFrame,
                 emotions_file: str,
                 mapping_file: str,
                 id_cols: list = None) -> pd.DataFrame:

    # 1) Load the GoEmotions label names
    with open(emotions_file, 'r') as f:
        go_labels = f.read().splitlines()
    
    # 2) Identify which columns to preserve
    if id_cols is None:
        id_cols = [df.columns[0], df.columns[1]]
    
    # 3) Extract & rename the 27 numeric columns
    numeric = df.drop(columns=id_cols).copy()
    if len(numeric.columns) != len(go_labels):
        raise ValueError(f"Expected {len(go_labels)} numeric cols, got {len(numeric.columns)}")
    numeric.columns = go_labels
    
    # 4) Load Ekman mapping and invert it so go_label → ekman_label
    with open(mapping_file, 'r') as f:
        ekman_to_go = json.load(f)
    go_to_ekman = {go: ek for ek, gos in ekman_to_go.items() for go in gos}
    
    # 5) Rename each GoEmotions column to its Ekman bucket
    numeric = numeric.rename(columns=go_to_ekman)
    
    # 6) Sum together any duplicate-ekman columns
    numeric = numeric.groupby(numeric.columns, axis=1).sum()
    
    # 7) Binarize: any positive → 1
    numeric = (numeric > 0).astype(int)
    
    # 😎 Re-assemble
    return pd.concat([df[id_cols].reset_index(drop=True),
                      numeric.reset_index(drop=True)],
                     axis=1)

In [7]:
combined_path = combined_df.to_csv('data/combined.csv', index=False)


In [8]:
emotions_txt = 'data/emotions.txt'
mapping_json = 'data/ekman_mapping.json'
combined_pd = 'data/combined.csv'

combined_ekman = map_to_ekman(
    combined_df,
    emotions_file=emotions_txt,
    mapping_file=mapping_json,
    id_cols=['text']       # if your df only has a "text" col before the 27 one-hots
)

# Now combined_ekman has one column "text" plus 6 columns: anger, disgust, fear, joy, sadness, surprise
combined_ekman.head()

Unnamed: 0,text,anger,disgust,fear,joy,neutral,sadness,surprise
0,That game hurt.,0,0,0,0,0,1,0
1,>sexuality shouldn’t be a grouping category I...,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",0,0,0,0,1,0,0
3,Man I love reddit.,0,0,0,1,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",0,0,0,0,1,0,0


In [9]:
from sklearn.model_selection import train_test_split

# Split the data into train, test, and validation sets
train, test = train_test_split(combined_ekman, test_size=0.2, random_state=42)
train, validation = train_test_split(train, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Print the sizes of each set
print(f"Train set size: {len(train)}")
print(f"Test set size: {len(test)}")
print(f"Validation set size: {len(validation)}")

Train set size: 34638
Test set size: 11547
Validation set size: 11547


In [10]:
# Count the number of rows for each unique emotion based on the emotion columns
emotion_columns = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# Sum the values for each emotion across all rows
emotion_counts = combined_ekman[emotion_columns].sum().sort_values(ascending=False)
print(emotion_counts)

joy         23098
neutral     15488
surprise     8057
anger        8027
sadness      4986
disgust      1349
fear         1210
dtype: int64


In [11]:
import sklearn
import imblearn

print("scikit-learn:", sklearn.__version__)
print("imbalanced-learn:", imblearn.__version__)

scikit-learn: 1.6.1
imbalanced-learn: 0.13.0


In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import json

# --- Step 1: Separate text and emotion columns ---
text_column = combined_ekman['text'].fillna("Missing text")  # Handle missing text early
emotions_data = combined_ekman.drop(columns=['text'])

# --- Step 2: Convert one-hot to single label ---
labels = emotions_data.idxmax(axis=1)

# --- Step 3: TF-IDF vectorization ---
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(text_column)

# --- Step 4: Define hybrid resampling strategy ---
smote = SMOTE(sampling_strategy={
    'disgust': 5000,
    'fear': 5000,
    'sadness': 5000
}, random_state=42)

undersample = RandomUnderSampler(sampling_strategy={
    'joy': 12000,
    'neutral': 12000
}, random_state=42)

resample_pipeline = Pipeline([
    ('smote', smote),
    ('under', undersample)
])

# --- Step 5: Apply resampling ---
X_resampled, y_resampled = resample_pipeline.fit_resample(X_tfidf, labels)

# --- Step 6: Map back to texts ---
text_resampled = text_column.iloc[y_resampled.index].reset_index(drop=True)

# --- Step 7: Convert labels back to one-hot ---
y_onehot = pd.get_dummies(y_resampled)

# Ensure all original emotion columns are present
for col in emotions_data.columns:
    if col not in y_onehot.columns:
        y_onehot[col] = 0

# Match column order
y_onehot = y_onehot[emotions_data.columns].reset_index(drop=True)

# --- Step 8: Build final DataFrame with all info ---
balanced_df = pd.concat([text_resampled, y_onehot], axis=1)

# Add single-label column (for model training if needed)
balanced_df['label'] = y_resampled.reset_index(drop=True)

# Add optional JSON-style 'emotions' column
balanced_df['emotions'] = balanced_df.apply(
    lambda row: json.dumps({'emotions': [col for col in emotions_data.columns if row[col] == 1]}),
    axis=1
)

# --- Step 9: Shuffle and return ---
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# --- Step 10: View result ---
balanced_df.head()





Unnamed: 0,text,anger,disgust,fear,joy,neutral,sadness,surprise,label,emotions
0,I don’t think I’m high enough to get this,0,0,1,0,0,0,0,fear,"{""emotions"": [""fear""]}"
1,Love you too,0,0,0,0,0,1,0,sadness,"{""emotions"": [""sadness""]}"
2,You are definently not in the wrong because yo...,0,0,0,1,0,0,0,joy,"{""emotions"": [""joy""]}"
3,Or... You just don't want to break your aging ...,0,0,0,0,1,0,0,neutral,"{""emotions"": [""neutral""]}"
4,If [NAME] comes out of his cave and sees his s...,0,0,0,1,0,0,0,joy,"{""emotions"": [""joy""]}"
