# 01_prepare_dataset.ipynb

Prepare the Kaggle **Emotions** dataset for ingestion by the probabilistic model.

This notebook:
1. standardizes schema and labels
2. applies minimal text hygiene
3. removes duplicates
4. performs light EDA
5. persists a single cleaned CSV with artifacts for reproducibility.



## Set Up Environment and Paths; Validate Raw Dataset Existence

In [1]:
# Standard libs
import sys, os, platform, json, re, unicodedata
from pathlib import Path
from datetime import datetime, timezone

# Third-party
import pandas as pd
import numpy as np

# Record the runtime environment so a run can be compared against later ones.
print("Python:", sys.version)
print("OS:", platform.platform())
print("NumPy:", np.__version__)
print("pandas:", pd.__version__)

# Reproducibility
SEED = 42

# Paths (relative to the notebook's working directory)
DATA_ROOT = Path("../data")
DATA_RAW   = DATA_ROOT / "data_raw"
DATA_CLEAN = DATA_ROOT / "data_clean"
ARTIFACTS  = Path("../artifacts") / "outputs"

# Create input/output directories if needed
for directory in (DATA_RAW, DATA_CLEAN, ARTIFACTS):
    directory.mkdir(parents=True, exist_ok=True)

print("Data root:", DATA_ROOT.resolve())
print("Artifacts:", ARTIFACTS.resolve())

# Point RAW_CSV to the Kaggle file under data_raw/.
# glob() yields entries in filesystem order, which is not guaranteed to be
# stable, so sort the candidates: with several CSVs present the same file
# (alphabetically first) is chosen on every run and on every machine.
raw_csv_candidates = sorted(DATA_RAW.glob("*.csv"))
RAW_CSV = raw_csv_candidates[0] if raw_csv_candidates else None
print(f"Detected RAW_CSV: {RAW_CSV}")
if RAW_CSV is None:
    raise FileNotFoundError("Place the Kaggle CSV in data_raw/ before running (e.g., data_raw/emotions.csv)")
if len(raw_csv_candidates) > 1:
    # Make the ambiguity visible instead of silently picking one file.
    print(f"Warning: {len(raw_csv_candidates)} CSV files found in {DATA_RAW}; using {RAW_CSV.name}")

# The six emotion classes in canonical order; a label's position in this
# list is its integer id.
LABELS = ["sadness", "joy", "love", "anger", "fear", "surprise"]
LABEL_TO_ID = dict(zip(LABELS, range(len(LABELS))))
ID_TO_LABEL = dict(enumerate(LABELS))

# Show more of each text when DataFrames are rendered.
pd.set_option("display.max_colwidth", 120)
print("Setup complete.")


Python: 3.13.9 (tags/v3.13.9:8183fa5, Oct 14 2025, 14:09:13) [MSC v.1944 64 bit (AMD64)]
OS: Windows-11-10.0.26200-SP0
NumPy: 2.2.6
Data root: C:\Code\d804-advanced-ai-for-computer-scientists\D804_PA_Model_TextEmotionNB\data
Artifacts: C:\Code\d804-advanced-ai-for-computer-scientists\D804_PA_Model_TextEmotionNB\artifacts\outputs
Detected RAW_CSV: ..\data\data_raw\emotions.csv
Setup complete.


## Load Raw Data and Schema Checks

In [2]:
df_raw = pd.read_csv(RAW_CSV)
print("Raw shape:", df_raw.shape)
print("Raw columns:", list(df_raw.columns))

# Heuristics to find text & label columns.
# Column labels are compared as stripped, lower-cased strings so padded
# headers (" Text") or non-string labels do not break the lookup.
TEXT_COL_CANDIDATES  = {"text","content","sentence","tweet","message","comment"}
LABEL_COL_CANDIDATES = {"label","emotion","target","class"}
possible_text_cols  = [c for c in df_raw.columns if str(c).strip().lower() in TEXT_COL_CANDIDATES]
possible_label_cols = [c for c in df_raw.columns if str(c).strip().lower() in LABEL_COL_CANDIDATES]

if not possible_text_cols or not possible_label_cols:
    raise ValueError(f"Could not infer text/label columns; got text={possible_text_cols}, label={possible_label_cols}")

# When several columns match, the first one in file order wins.
TEXT_COL  = possible_text_cols[0]
LABEL_COL = possible_label_cols[0]
print(f"Using TEXT_COL='{TEXT_COL}', LABEL_COL='{LABEL_COL}'")

df = df_raw[[TEXT_COL, LABEL_COL]].copy()
df.columns = ["text","label"]

# Basic dtype normalization.
# astype(str) on its own turns a missing value into the literal string "nan",
# which would then pass for a real sentence. Keep missing values missing so
# they can be dropped as nulls/empties instead of being kept as text.
text_present = df["text"].notna()
df["text"] = df["text"].astype(str).where(text_present)
if not text_present.all():
    print(f"Missing text values: {int((~text_present).sum())}")
print(df.dtypes)
df.head(5)


Raw shape: (416809, 2)
Raw columns: ['text', 'label']
Using TEXT_COL='text', LABEL_COL='label'
text     object
label     int64
dtype: object


Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax and unwind and frankly needed it after those last few weeks around the ...,0
2,i gave up my internship with the dmrg and am feeling distraught,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughly weary of my job after having taken the university entrance exam i su...,4


## Label Normalization

Normalize labels to the fixed six-class taxonomy and create stable integer ids.


In [3]:
def normalize_label(val):
    """
    Normalize a raw label to one of the six canonical emotion names:
    sadness, joy, love, anger, fear, or surprise.

    Parameters
    ----------
    val : scalar
        Raw label. Accepted forms are a whole-number class id present in
        ID_TO_LABEL (as int, float such as 3.0, or numeric string such as
        "3"), a canonical emotion name, or one of the known aliases
        (case-insensitive, surrounding whitespace ignored).

    Returns
    -------
    str or float
        The canonical name, or np.nan when the value is missing or cannot
        be mapped.
    """
    if pd.isna(val):
        return np.nan

    # Numeric labels -> map by id.
    # bool is a subclass of int in Python; True/False are not class ids.
    if not isinstance(val, (bool, np.bool_)):
        try:
            number = float(val)
        except (TypeError, ValueError, OverflowError):
            number = None
        # Only whole numbers are valid ids: int(2.7) would silently truncate
        # to 2 and assign a wrong class, so fractional values are rejected.
        if number is not None and number.is_integer():
            name = ID_TO_LABEL.get(int(number))
            if name is not None:
                return name

    # String-based labels -> clean and map
    sval = str(val).strip().lower()
    aliases = {
        "happiness": "joy",
        "scared": "fear",
        "angry": "anger",
        "surprised": "surprise",
        "sad": "sadness"
    }
    sval = aliases.get(sval, sval)
    return sval if sval in LABELS else np.nan

# Map every raw label onto its canonical name (NaN when it cannot be mapped).
df["label_name"] = df["label"].apply(normalize_label)

# Rows whose label could not be mapped are shown, then excluded.
unmapped = df["label_name"].isna()
bad = df[unmapped]
if len(bad) > 0:
    print("Warning: Found unmapped labels. Displaying first 10:")
    display(bad.head(10))
    df = df[df["label_name"].notna()]

# Stable integer ID for each label
df["label_id"] = df["label_name"].map(LABEL_TO_ID)
ids_in_range = df["label_id"].between(0, 5)
assert ids_in_range.all(), "Unexpected label IDs after normalization"

print("Label normalization complete.")
preview_cols = ["text", "label", "label_name", "label_id"]
df[preview_cols].head(5)


Label normalization complete.


Unnamed: 0,text,label,label_name,label_id
0,i just feel really helpless and heavy hearted,4,fear,4
1,ive enjoyed being able to slouch about relax and unwind and frankly needed it after those last few weeks around the ...,0,sadness,0
2,i gave up my internship with the dmrg and am feeling distraught,4,fear,4
3,i dont know i feel so lost,0,sadness,0
4,i am a kindergarten teacher and i am thoroughly weary of my job after having taken the university entrance exam i su...,4,fear,4


## Perform Minimal Text Hygiene

Apply minimal, deterministic cleaning that does not depend on any modeling choices, then drop empty texts and exact duplicates within the same label.


In [4]:
# Cleaning switches; each one enables the matching step in clean_text().
LOWERCASE = True
REMOVE_URLS = True
REMOVE_EMAILS = True
REMOVE_CONTROL_CHARS = True
NORMALIZE_SPACES = True

# URLs are stripped before lowercasing, so the pattern must be
# case-insensitive to catch "HTTPS://..." and "WWW....".
URL_RE   = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")


def _scrub_control_chars(text: str) -> str:
    """Drop Unicode category-C characters, keeping newlines and word boundaries."""
    kept = []
    for ch in text:
        if ch == "\n" or unicodedata.category(ch)[0] != "C":
            kept.append(ch)
        elif ch.isspace():
            # Tabs, carriage returns and similar controls separate words;
            # deleting them outright would glue the neighbours together.
            kept.append(" ")
        # Any other control/format/unassigned character is removed.
    return "".join(kept)


def clean_text(s: str) -> str:
    """
    Apply the enabled hygiene steps to one text value.

    Steps, in order and each gated by its module-level switch: control
    character removal, URL removal, e-mail removal, lowercasing, and
    whitespace collapsing. Leading/trailing whitespace is always stripped
    first.

    Parameters
    ----------
    s : str
        Raw text. Any non-string value (e.g. None or NaN) is treated as
        empty.

    Returns
    -------
    str
        The cleaned text; "" for non-string input.
    """
    if not isinstance(s, str):
        return ""
    t = s.strip()

    if REMOVE_CONTROL_CHARS:
        t = _scrub_control_chars(t)

    # URLs and e-mails are replaced by a space so adjacent words stay apart.
    if REMOVE_URLS:
        t = URL_RE.sub(" ", t)
    if REMOVE_EMAILS:
        t = EMAIL_RE.sub(" ", t)

    if LOWERCASE:
        t = t.lower()

    if NORMALIZE_SPACES:
        # Collapses every whitespace run (including newlines) to one space.
        t = re.sub(r"\s+", " ", t).strip()

    return t

before_rows = df.shape[0]
df["text"] = df["text"].apply(clean_text)

# Keep only rows whose cleaned text is present and non-empty.
has_text = df["text"].notna() & (df["text"].str.len() > 0)
df = df[has_text]

# Exact duplicates are removed per label: the key is (text, label_name)
# when label_name is available, otherwise the text alone. The first
# occurrence is the one that is kept.
dedupe_keys = ["text","label_name"] if "label_name" in df.columns else ["text"]
df = df.drop_duplicates(subset=dedupe_keys, keep="first")

after_rows = df.shape[0]
print(f"Rows before cleaning: {before_rows} | after: {after_rows}")
df.head(5)


Rows before cleaning: 416809 | after: 416123


Unnamed: 0,text,label,label_name,label_id
0,i just feel really helpless and heavy hearted,4,fear,4
1,ive enjoyed being able to slouch about relax and unwind and frankly needed it after those last few weeks around the ...,0,sadness,0
2,i gave up my internship with the dmrg and am feeling distraught,4,fear,4
3,i dont know i feel so lost,0,sadness,0
4,i am a kindergarten teacher and i am thoroughly weary of my job after having taken the university entrance exam i su...,4,fear,4


## Perform Light EDA (Counts & Lengths)


In [5]:
# Ensure the use of canonical order everywhere
assert set(df["label_name"].unique()).issubset(set(LABELS)), "Found unexpected labels outside the canonical set."

# Per-class counts (ordered by LABELS)
counts = df["label_name"].value_counts().reindex(LABELS, fill_value=0)
display(counts)

# Length features: characters and whitespace-delimited tokens per text
df["char_len"] = df["text"].str.len()
df["token_len"] = df["text"].str.split().str.len()

# Statistics produced by describe() that are persisted in the EDA summary
EDA_STATS = ["count","mean","std","min","25%","50%","75%","max"]


def _length_stats_by_label(frame, column):
    """Return describe() statistics of `column` per label, rows aligned to LABELS."""
    return frame.groupby("label_name")[column].describe().reindex(LABELS)


def _stats_to_jsonable(stats):
    """
    Convert a per-label statistics frame into nested dicts of plain floats.

    NaN entries become None. They occur for a label with no rows (reindex
    fills the whole row with NaN) and for the std of a single-row class;
    written as-is they would produce bare NaN tokens, which are not valid JSON.
    """
    summary = {}
    for lbl in LABELS:
        row = stats.loc[lbl]
        summary[lbl] = {
            stat: (None if pd.isna(row[stat]) else float(row[stat]))
            for stat in EDA_STATS
        }
    return summary


char_stats = _length_stats_by_label(df, "char_len")
token_stats = _length_stats_by_label(df, "token_len")

display(char_stats)
display(token_stats)

# Persist EDA summary with deterministic class order
eda = {
    "labels_order": LABELS,
    "total_rows": int(len(df)),
    "per_class_counts": {lbl: int(counts[lbl]) for lbl in LABELS},
    "char_len": _stats_to_jsonable(char_stats),
    "token_len": _stats_to_jsonable(token_stats),
}

# allow_nan=False makes the dump fail loudly rather than emit invalid JSON.
(ARTIFACTS / "eda.json").write_text(json.dumps(eda, indent=2, allow_nan=False), encoding="utf-8")
print("Saved", ARTIFACTS / "eda.json")


label_name
sadness     120989
joy         140779
love         34497
anger        57235
fear         47664
surprise     14959
Name: count, dtype: int64

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
sadness,120989.0,93.191455,55.842913,2.0,50.0,81.0,123.0,525.0
joy,140779.0,98.851341,55.68733,4.0,56.0,88.0,130.0,470.0
love,34497.0,104.729223,56.693039,10.0,61.0,94.0,137.0,300.0
anger,57235.0,96.093754,57.056206,2.0,52.0,84.0,127.0,830.0
fear,47664.0,96.728097,56.265081,2.0,53.0,85.0,128.0,411.0
surprise,14959.0,99.745905,55.930045,12.0,57.0,89.0,130.0,299.0


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
sadness,120989.0,18.500607,11.00394,1.0,10.0,16.0,25.0,101.0
joy,140779.0,19.557342,10.900616,1.0,11.0,18.0,26.0,94.0
love,34497.0,20.830739,11.146422,3.0,12.0,19.0,27.0,71.0
anger,57235.0,19.047716,11.269031,1.0,10.0,17.0,25.0,178.0
fear,47664.0,18.968614,11.081392,1.0,10.0,17.0,25.0,79.0
surprise,14959.0,19.755866,11.021853,3.0,11.0,18.0,26.0,66.0


Saved ..\artifacts\outputs\eda.json


## Persist Cleaned Dataset & Artifacts


In [6]:
# Reset index to get a stable id column.
# Any "id" column left by a previous run of this cell is dropped first:
# reset_index(names="id") raises if the column already exists, which would
# make the cell fail when re-executed.
df = (
    df.drop(columns=["id"], errors="ignore")
      .reset_index(drop=True)
      .reset_index(names="id")
)
out_cols = ["id", "text", "label_name", "label_id"]

# Save clean dataset
out_csv = DATA_CLEAN / "clean.csv"
df[out_cols].to_csv(out_csv, index=False, encoding="utf-8")
print(f"Wrote {out_csv} with {len(df)} rows.")

# Save label map.
# JSON object keys are always strings, so the integer ids of ID_TO_LABEL
# end up as "0".."5" in the file; the conversion is spelled out here so
# readers of label_map.json know to cast the keys back to int.
label_map = {
    "label_to_id": LABEL_TO_ID,
    "id_to_label": {str(label_id): name for label_id, name in ID_TO_LABEL.items()}
}
(ARTIFACTS / "label_map.json").write_text(
    json.dumps(label_map, indent=2),
    encoding="utf-8"
)
print("Saved", ARTIFACTS / "label_map.json")

# Save manifest (provenance, cleaning config, etc.)
manifest = {
    "source_csv": str(RAW_CSV),
    "created_utc": datetime.now(timezone.utc).isoformat(),
    "rows_clean": int(len(df)),
    "notes": (
        "No train/val/test splits or vectorizer fitting performed in this notebook. "
        "Splits and feature extraction occur in the model notebook to prevent leakage."
    ),
    "config": {
        "LOWERCASE": LOWERCASE,
        "REMOVE_URLS": REMOVE_URLS,
        "REMOVE_EMAILS": REMOVE_EMAILS,
        "REMOVE_CONTROL_CHARS": REMOVE_CONTROL_CHARS,
        "NORMALIZE_SPACES": NORMALIZE_SPACES
    }
}
(ARTIFACTS / "manifest.json").write_text(
    json.dumps(manifest, indent=2),
    encoding="utf-8"
)
print("Saved", ARTIFACTS / "manifest.json")


Wrote ..\data\data_clean\clean.csv with 416123 rows.
Saved ..\artifacts\outputs\label_map.json
Saved ..\artifacts\outputs\manifest.json
