In [1]:
# ============================================
# 📘 Emotion Dataset Conversion Notebook
# Converts train/test/val .txt files → single emotion_dataset.csv
# ============================================

import pandas as pd
import os

# 💡 Directory that holds train.txt, test.txt and val.txt; the combined
# emotion_dataset.csv is written here as well.
# Change this if your data folder path differs.
DATA_DIR = "../data"

# -----------------------------
# Step 1: Load text files
# -----------------------------
def load_emotion_file(file_path):
    """Read a text file of 'text;emotion' lines into a DataFrame.

    Each line is split on its LAST semicolon, so semicolons that appear
    inside the text itself stay part of the text instead of raising a
    ValueError. Lines that contain no semicolon at all (including blank
    lines) are skipped.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A DataFrame with the columns "text" and "emotion", one row per
        parsed line, in file order. The frame is empty (but still has both
        columns) when no line could be parsed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    rows = []
    # Decode explicitly as UTF-8: the default encoding of open() depends on
    # the platform locale, while the combined CSV is always written as UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # rpartition yields an empty separator when ';' is absent,
            # which doubles as the "skip this line" check.
            text, sep, emotion = line.strip().rpartition(';')
            if sep:
                rows.append([text, emotion])
    return pd.DataFrame(rows, columns=["text", "emotion"])

# -----------------------------
# Step 2: Load all datasets
# -----------------------------
# Locations of the three dataset splits inside DATA_DIR.
train_path = os.path.join(DATA_DIR, "train.txt")
test_path = os.path.join(DATA_DIR, "test.txt")
val_path = os.path.join(DATA_DIR, "val.txt")

print("📂 Loading dataset files from:", DATA_DIR)

# Each split becomes its own two-column ("text", "emotion") DataFrame.
train_df = load_emotion_file(train_path)
test_df = load_emotion_file(test_path)
val_df = load_emotion_file(val_path)

# -----------------------------
# Step 3: Combine them
# -----------------------------
# ignore_index=True renumbers the rows 0..n-1 so the per-split indices do
# not repeat in the combined frame.
df = pd.concat([train_df, test_df, val_df], ignore_index=True)

# -----------------------------
# Step 4: Inspect
# -----------------------------
print("✅ Combined dataset shape:", df.shape)
print("🔹 Sample rows:")
print(df.head())

print("\n🔸 Emotion distribution:")
print(df["emotion"].value_counts())

# -----------------------------
# Step 5: Save as CSV
# -----------------------------
# index=False keeps the synthetic row numbers out of the CSV.
output_path = os.path.join(DATA_DIR, "emotion_dataset.csv")
df.to_csv(output_path, index=False, encoding='utf-8')
print("\n💾 Saved combined dataset to:", output_path)


📂 Loading dataset files from: ../data
✅ Combined dataset shape: (20000, 2)
🔹 Sample rows:
                                                text  emotion
0                            i didnt feel humiliated  sadness
1  i can go from feeling so hopeless to so damned...  sadness
2   im grabbing a minute to post i feel greedy wrong    anger
3  i am ever feeling nostalgic about the fireplac...     love
4                               i am feeling grouchy    anger

🔸 Emotion distribution:
emotion
joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
Name: count, dtype: int64

💾 Saved combined dataset to: ../data/emotion_dataset.csv
