# Multi-Label Emotion Dataset Cleaning & Filtering
This notebook will:
1. Load the raw CSV  
2. Coerce emotion columns to numeric  
3. Count and preview rows with no emotions  
4. Filter them out  
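5. Confirm the result  
6. Clean the text (strip HTML, normalize Unicode, mask URLs/emails/phone numbers)  
7. Sanity-check the cleaned text for leftover artifacts  
8. Convert the labels to emotion names, remove stopwords, and export the final CSV  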


In [1]:
!pip install beautifulsoup4




In [3]:
from bs4 import BeautifulSoup


## 1. Dataset Filtering

In [6]:
import pandas as pd

# Names of the emotion label columns used throughout the notebook
emotion_cols = [
    'anger',
    'fear',
    'joy',
    'sadness',
    'surprise',
]


1. Load the raw data

In [9]:

# Read "track-a.csv" (path relative to the working directory) and report its row count.
df_raw = pd.read_csv(filepath_or_buffer="track-a.csv")
print(f"✅ Loaded {len(df_raw)} rows")


✅ Loaded 2768 rows


2. Coerce emotion columns to integers

In [12]:
# Convert each emotion column to integers, one column at a time:
# unparseable entries become NaN, NaN becomes 0, and the result is cast to int.
for col in emotion_cols:
    df_raw[col] = (
        pd.to_numeric(df_raw[col], errors='coerce')
        .fillna(0)
        .astype(int)
    )


3. Count rows with no emotions BEFORE filtering

In [15]:
# Tally the rows whose emotion columns add up to zero, before any filtering.
zero_before = df_raw[emotion_cols].sum(axis=1).eq(0).sum()
print(f"Rows with no emotions (before): {zero_before}")


Rows with no emotions (before): 239


### Preview some zero-label rows

In [18]:
# Show the first few rows without any emotion, but only when such rows exist.
if zero_before > 0:
    display(df_raw.loc[df_raw[emotion_cols].sum(axis=1).eq(0)].head())

Unnamed: 0,id,text,anger,fear,joy,sadness,surprise
3,eng_train_track_a_00004,"After all, I had vegetables coming out my ears...",0,0,0,0,0
13,eng_train_track_a_00014,"Now my parents live in the foothills, and the ...",0,0,0,0,0
50,eng_train_track_a_00051,painted on my face.,0,0,0,0,0
53,eng_train_track_a_00054,She left the room and went downstairs.,0,0,0,0,0
60,eng_train_track_a_00061,' I chanted in my head.,0,0,0,0,0


4. Filter out zero-emotion rows

In [21]:
# Keep only rows whose emotion columns sum to more than zero,
# then renumber the index from 0 so it is contiguous again.
df = (
    df_raw
    .loc[df_raw[emotion_cols].sum(axis=1).gt(0)]
    .reset_index(drop=True)
)

#### 5. Confirm results

In [35]:
# Re-count zero-emotion rows on the filtered frame and compare sizes before/after.
zero_after = df[emotion_cols].sum(axis=1).eq(0).sum()
print(f"Rows with no emotions (after): {zero_after}")
print(f"Dataset size: before={len(df_raw)}, after={len(df)}")

Rows with no emotions (after): 0
Dataset size: before=2768, after=2529



## 2. Advanced Text Cleaning
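We’ll strip HTML tags, normalize Unicode, and replace URLs, emails, phone numbers, and currency symbols with placeholder tokens, while leaving case, punctuation, digits, and emojis untouched.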


#### 1. Define a clean-text + BeautifulSoup cleaner

In [33]:
from cleantext import clean


In [39]:
# Cleaner that chains BeautifulSoup (HTML stripping) with clean-text (normalisation)
def clean_with_library(text: str) -> str:
    """Strip HTML from ``text`` and normalise it with clean-text.

    The text is first passed through BeautifulSoup's ``html.parser`` and reduced
    to ``get_text()``, then handed to ``clean`` with the options listed below:
    URLs, e-mail addresses, phone numbers and currency symbols are replaced by
    the ``<URL>``, ``<EMAIL>``, ``<PHONE>`` and ``<CUR>`` tokens, while
    lower-casing, ASCII transliteration, punctuation removal and number/digit
    replacement are all switched off.

    Args:
        text: The raw text to clean. ``None`` and float NaN (pandas' marker for
            a missing cell) are treated as missing; any other non-string value
            is converted with ``str()`` before cleaning.

    Returns:
        The cleaned text, or ``""`` when ``text`` is missing.
    """
    import warnings

    from bs4 import MarkupResemblesLocatorWarning

    # Missing cells would otherwise crash inside BeautifulSoup; `text != text`
    # is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # 1) remove HTML tags/entities.
    # Plain sentences that resemble a file name or URL make BeautifulSoup emit
    # MarkupResemblesLocatorWarning; the input is still parsed as markup, so the
    # warning is only noise here and is silenced for this call alone.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)
        text = BeautifulSoup(text, "html.parser").get_text()

    # 2) call clean-text for Unicode/URLs/emails/etc.
    return clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=False,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en"
    )

# Run the cleaner over every raw text and store the result in a new `clean_text` column.
df['clean_text'] = df['text'].map(clean_with_library)


  text = BeautifulSoup(text, "html.parser").get_text()


#### 2. Preview Raw vs. Cleaned Text

In [41]:
# Put the original and the cleaned text side by side for the first five rows.
display(
    df.loc[:, ['text', 'clean_text']]
    .head(5)
    .rename(columns={'text':'Raw','clean_text':'Cleaned'})
)


Unnamed: 0,Raw,Cleaned
0,"Colorado, middle of nowhere.","Colorado, middle of nowhere."
1,This involved swimming a pretty large lake tha...,This involved swimming a pretty large lake tha...
2,It was one of my most shameful experiences.,It was one of my most shameful experiences.
3,Then the screaming started.,Then the screaming started.
4,"They don't fear death, and it seems they belie...","They don't fear death, and it seems they belie..."



## 3. Sanity-Check Cleaned Text
Scan for leftover artifacts—HTML entities, URLs, control chars, etc.


In [43]:
# Regex pattern -> human-readable description of the artifact it detects.
# The placeholder pattern includes <PHONE> and <CUR>, the tokens the cleaner is
# configured to insert for phone numbers and currency symbols, next to the
# <URL>/<EMAIL> tokens; <USER> is kept from the original pattern.
checks = {
    r"&[A-Za-z0-9#]+;":                     "HTML entities",
    r"https?://\S+":                        "URLs",
    r"<USER>|<EMAIL>|<URL>|<PHONE>|<CUR>":  "Placeholders",
    r"[\x00-\x1F\x7F]":                     "Control chars"
}

# For each artifact: count the cleaned rows that match and print up to three examples.
for pattern, desc in checks.items():
    mask = df['clean_text'].str.contains(pattern, regex=True, na=False)
    cnt  = mask.sum()
    print(f"{desc:20s}: {cnt:3d} rows")
    if cnt > 0:
        print("  Examples:", df.loc[mask,'clean_text'].head(3).tolist())
    print()


HTML entities       :   0 rows

URLs                :   0 rows

Placeholders        :   0 rows

Control chars       :   0 rows



In [45]:
import pandas as pd
import re

# Artifact description -> regex pattern to look for in the cleaned text.
# The placeholder pattern lists every token the cleaner is configured to insert,
# including <CUR> for currency symbols.
checks = {
    "HTML entities":         r"&[A-Za-z0-9#]+;",
    "URLs":                  r"https?://\S+",
    "Placeholders":          r"<URL>|<EMAIL>|<PHONE>|<CUR>",
    "Control characters":    r"[\x00-\x1F\x7F]"
}

# Build one summary record per artifact: its name, the number of matching rows,
# and up to three matching texts as Example_1..Example_3 (present only on a match).
rows = []
for desc, pattern in checks.items():
    mask = df['clean_text'].str.contains(pattern, regex=True, na=False)
    count = mask.sum()
    examples = df.loc[mask, 'clean_text'].head(3).tolist()
    row = {
        "Artifact": desc,
        "Count": count,
        **{f"Example_{i+1}": ex for i, ex in enumerate(examples)}
    }
    rows.append(row)

# Assemble the records into a single frame and display it
summary_df = pd.DataFrame(rows)
display(summary_df)


Unnamed: 0,Artifact,Count
0,HTML entities,0
1,URLs,0
2,Placeholders,0
3,Control characters,0


In [None]:
# Peek at the first rows of the filtered frame, which now also carries `clean_text`.
df.head()

In [47]:
import pandas as pd

# Emotion label columns, in the order used for the label vectors below
EMOTION_COLS = ['anger', 'fear', 'joy', 'sadness', 'surprise']

# Pair each cleaned text from the filtered `df` with its row of emotion flags,
# stored as a plain Python list per row.
df2 = pd.DataFrame(
    data={
        'text': df['clean_text'],
        'labels': df[EMOTION_COLS].to_numpy().tolist(),
    }
)

# Preview
display(df2.head())


Unnamed: 0,text,labels
0,"Colorado, middle of nowhere.","[0, 1, 0, 0, 1]"
1,This involved swimming a pretty large lake tha...,"[0, 1, 0, 0, 0]"
2,It was one of my most shameful experiences.,"[0, 1, 0, 1, 0]"
3,Then the screaming started.,"[0, 1, 0, 1, 1]"
4,"They don't fear death, and it seems they belie...","[0, 1, 0, 0, 1]"


In [49]:
# Replace each label vector with the names of the emotions whose flag equals 1.
# The Series reuses df's index, so the assignment is aligned by index.
df2['labels'] = pd.Series(
    [
        [name for name, flag in zip(EMOTION_COLS, flags) if flag == 1]
        for flags in df[EMOTION_COLS].itertuples(index=False, name=None)
    ],
    index=df.index,
)
display(df2.head())

Unnamed: 0,text,labels
0,"Colorado, middle of nowhere.","[fear, surprise]"
1,This involved swimming a pretty large lake tha...,[fear]
2,It was one of my most shameful experiences.,"[fear, sadness]"
3,Then the screaming started.,"[fear, sadness, surprise]"
4,"They don't fear death, and it seems they belie...","[fear, surprise]"


In [51]:
!pip install neattext
import neattext.functions as nfx




In [52]:
# Remove the user handles, starting from the already-cleaned text in df2['text'];
# reading raw df['text'] here would bypass the HTML/URL/e-mail cleaning done earlier.
df2['Clean_Text'] = df2['text'].apply(nfx.remove_userhandles)

In [56]:
# Remove the stopwords from the existing Clean_Text column, so the user-handle
# removal from the previous step is kept instead of being overwritten from df['text'].
df2['Clean_Text'] = df2['Clean_Text'].apply(nfx.remove_stopwords)

In [58]:
# Inspect the result: cleaned `text`, emotion-name `labels`, and the `Clean_Text` column.
df2

Unnamed: 0,text,labels,Clean_Text
0,"Colorado, middle of nowhere.","[fear, surprise]","Colorado, middle nowhere."
1,This involved swimming a pretty large lake tha...,[fear],involved swimming pretty large lake head.
2,It was one of my most shameful experiences.,"[fear, sadness]",shameful experiences.
3,Then the screaming started.,"[fear, sadness, surprise]",screaming started.
4,"They don't fear death, and it seems they belie...","[fear, surprise]","fear death, believe reincarnation."
...,...,...,...
2524,"like, brain freeze on my leg!!","[fear, sadness, surprise]","like, brain freeze leg!!"
2525,She cants her hip against my waist into my sid...,"[joy, surprise]","cants hip waist side, knocking stumble tugging..."
2526,It just kind of gradually vanished over a coup...,[surprise],kind gradually vanished couple hours.
2527,I didn't look out of my hands.,[fear],look hands.


In [None]:
# Write the final frame to "cleaned_emotion_dataset.csv" without the index column.
# NOTE(review): `labels` holds Python lists, so each cell is presumably written as
# the list's string form — confirm how downstream readers parse it.
df2.to_csv(path_or_buf="cleaned_emotion_dataset.csv", index=False)
