<a href="https://colab.research.google.com/github/UpLiftL1f3/Emotion_Sentiment_ML/blob/main/Emotion_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1 — Load & sanity-check your CSVs (already in the env)

In [14]:
# Step 1: Load both CSVs safely (no extra installs needed)

import os, re, unicodedata
import pandas as pd
import numpy as np

# Input locations -- edit these if the CSVs live somewhere else
SENTIMENT_CSV = "combined_sentiment_data.csv"
EMOTIONS_CSV  = "combined_emotion.csv"

# Fail fast, before any parsing, when either input file is absent
for path in (SENTIMENT_CSV, EMOTIONS_CSV):
    assert os.path.exists(path), f"Missing file: {path}. Upload or set the correct path."

# Keyword arguments shared by both reads: every column comes back as a string
# and the parser is kept permissive so odd rows do not abort the load
read_opts = {
    "dtype": str,
    "na_values": ["", " ", "NA", "NaN", "nan", None],
    "keep_default_na": True,
    "on_bad_lines": "skip",        # malformed rows are skipped, not raised
    "encoding_errors": "replace",
}

# Sentiment is read first, then emotions (same order as the tuple below)
df_sent_raw, df_emot_raw = (
    pd.read_csv(csv_path, **read_opts) for csv_path in (SENTIMENT_CSV, EMOTIONS_CSV)
)

# Helper: guess likely text/label columns; override if needed
def guess_cols(df):
    """Guess which columns of ``df`` hold the text and the label.

    Column names are compared case-insensitively, ignoring surrounding
    whitespace, against two fixed sets of well-known names. Non-string
    column labels are tolerated (they are compared via ``str()``).

    Fallbacks when no known name matches:
      * text  -> the first column
      * label -> the last column that is not the chosen text column
                 (or the last column when the frame has a single column)

    Parameters
    ----------
    df : object exposing ``.columns`` (e.g. a pandas DataFrame)

    Returns
    -------
    tuple
        ``(text_col, label_col)`` using the original column labels.

    Raises
    ------
    IndexError
        If ``df`` has no columns.
    """
    candidates_text  = {"text","tweet","content","sentence","comment","body","review"}
    candidates_label = {"label","sentiment","target","polarity","emotion"}

    columns = list(df.columns)

    def _norm(col):
        # str() guards against non-string labels such as integer headers
        return str(col).strip().lower()

    text_col = next((c for c in columns if _norm(c) in candidates_text), columns[0])

    # Prefer a fallback that differs from the text column, so a frame like
    # ["score", "text"] does not yield the same column for both roles.
    fallback_label = next((c for c in reversed(columns) if c != text_col), columns[-1])
    label_col = next((c for c in columns if _norm(c) in candidates_label), fallback_label)
    return text_col, label_col

# Resolve the text/label column pair for each raw frame
sent_text_col, sent_label_col = guess_cols(df_sent_raw)
emot_text_col, emot_label_col = guess_cols(df_emot_raw)


def _label_counts(frame, label_col, top=20):
    """Return the `top` most frequent labels after trimming and lowercasing."""
    labels = frame[label_col].astype(str).str.strip().str.lower()
    return labels.value_counts().head(top)


# Quick peek: sentiment data
print("Sentiment shape:", df_sent_raw.shape)
print("Sentiment columns:", list(df_sent_raw.columns))
print(f"Guessed sentiment text/label: {sent_text_col} / {sent_label_col}")
sent_preview = df_sent_raw[[sent_text_col, sent_label_col]]
display(sent_preview.head())
print("\nSentiment label sample counts:")
display(_label_counts(df_sent_raw, sent_label_col))

# Quick peek: emotion data
print("\nEmotion shape:  ", df_emot_raw.shape)
print("Emotion columns:  ", list(df_emot_raw.columns))
print(f"Guessed emotion   text/label: {emot_text_col} / {emot_label_col}")
emot_preview = df_emot_raw[[emot_text_col, emot_label_col]]
display(emot_preview.head())
print("\nEmotion label sample counts:")
display(_label_counts(df_emot_raw, emot_label_col))


Sentiment shape: (3309, 2)
Sentiment columns: ['sentence', 'sentiment']
Guessed sentiment text/label: sentence / sentiment


Unnamed: 0,sentence,sentiment
0,So there is no way for me to plug it in here i...,negative
1,"Good case, Excellent value.",positive
2,Great for the jawbone.,positive
3,Tied to charger for conversations lasting more...,negative
4,The mic is great.,positive



Sentiment label sample counts:


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,1679
negative,1630



Emotion shape:   (422746, 2)
Emotion columns:   ['sentence', 'emotion']
Guessed emotion   text/label: sentence / emotion


Unnamed: 0,sentence,emotion
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sad
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sad
4,i am a kindergarten teacher and i am thoroughl...,fear



Emotion label sample counts:


Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
joy,143067
sad,121187
anger,59317
fear,49649
love,34554
suprise,14972


# Step 2 — Clean missing/corrupt data & normalize labels

In [15]:
# Step 2: Define cleaning and build tidy frames

import re

# "http://" or "https://" followed by non-whitespace, or "www." followed by non-whitespace
URL_RE       = re.compile(r"https?://\S+|www\.\S+")
# "@" followed by word characters (this also matches the "@domain" part of an e-mail address)
MENTION_RE   = re.compile(r"@\w+")
# "#" followed by word characters; group 1 captures the word without the "#"
HASHTAG_RE   = re.compile(r"#(\w+)")
# Any run of one or more whitespace characters
MULTISPACE_RE= re.compile(r"\s+")

def normalize_text(s: str) -> str:
    """Return a canonical, lowercased form of ``s``.

    Steps, in order: NFKC Unicode normalization, removal of zero-width
    spaces (U+200B), lowercasing, and finally trimming of surrounding
    whitespace.

    Trimming is done last on purpose: a zero-width space is not whitespace
    for ``str.strip()``, so stripping first would leave behind any spaces
    that sat next to a leading/trailing U+200B.

    Parameters
    ----------
    s : str
        Input text. Any non-string value (``None``, NaN, numbers, ...) is
        treated as missing.

    Returns
    -------
    str
        The normalized text, or ``""`` for non-string input.
    """
    if not isinstance(s, str):
        return ""
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u200b", "")      # zero-width space
    s = s.lower()
    return s.strip()

def clean_text_basic(s: str) -> str:
    """Normalize ``s`` and strip social-media noise from it.

    After ``normalize_text``, the substitutions below are applied in order:
    URLs and @mentions become a single space, hashtags keep their word but
    lose the '#', and every whitespace run collapses to one space. The result
    is trimmed before being returned.
    """
    cleaned = normalize_text(s)
    substitutions = (
        (URL_RE, " "),            # remove URLs
        (MENTION_RE, " "),        # remove @mentions
        (HASHTAG_RE, r"\1"),      # keep hashtag word, drop '#'
        (MULTISPACE_RE, " "),     # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()

def tidy_df(df, text_col, label_col, task_name):
    """Build a cleaned ``text`` / ``label_raw`` / ``task`` frame from ``df``.

    Processing order:
      1. Select the two columns and rename them to ``text`` / ``label_raw``.
      2. Drop rows where either value is missing. This must happen BEFORE
         the ``astype(str)`` casts below: casting first turns NaN into the
         literal string "nan", which ``dropna`` can no longer detect and
         which would otherwise survive as a bogus "nan" label.
      3. Clean the text with ``clean_text_basic`` and trim the label.
      4. Drop rows whose label is empty after trimming or whose text has
         one token or fewer.
      5. Drop exact (text, label_raw) duplicates, tag rows with
         ``task_name`` and reset the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame; it is not modified.
    text_col, label_col : column labels in ``df``
    task_name : str
        Value stored in the ``task`` column and used in the summary line.

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``label_raw``, ``task`` with a fresh 0..n-1 index.

    Side effects
    ------------
    Prints how many rows were kept out of the original row count.
    """
    out = df[[text_col, label_col]].copy()
    out.columns = ["text", "label_raw"]
    before = len(out)

    # Drop missing values while they are still real NaN/None
    out = out.dropna(subset=["text", "label_raw"])

    # Basic cleanup (assign returns a new frame, so no chained-assignment issues)
    out = out.assign(
        text=out["text"].astype(str).map(clean_text_basic),
        label_raw=out["label_raw"].astype(str).str.strip(),
    )

    # Remove blank labels and extremely short text (<=1 token)
    token_counts = out["text"].str.split().str.len()
    keep = (out["label_raw"] != "") & (token_counts > 1)

    out = (
        out[keep]
        .drop_duplicates(subset=["text", "label_raw"])  # exact duplicates (text+label)
        .assign(task=task_name)
        .reset_index(drop=True)
    )
    print(f"[{task_name}] kept {len(out):,} rows (from {before:,})")
    return out

# Build one tidy frame per task from the raw frames and their guessed columns
df_sent_tidy = tidy_df(
    df_sent_raw,
    text_col=sent_text_col,
    label_col=sent_label_col,
    task_name="sentiment",
)
df_emot_tidy = tidy_df(
    df_emot_raw,
    text_col=emot_text_col,
    label_col=emot_label_col,
    task_name="emotion",
)

# Normalize sentiment labels to 3-way when possible
def map_sentiment(lbl: str) -> str:
    """Map a raw sentiment label onto positive / negative / neutral.

    The label is trimmed and lowercased, then looked up in a table of known
    spellings. Labels that match no known spelling are returned in their
    trimmed, lowercased form so they can be inspected afterwards.
    """
    aliases = {
        "positive": {"pos","positive","+1","1","favorable","positif","posi"},
        "negative": {"neg","negative","-1","unfavorable","negatif"},
        "neutral":  {"neu","neutral","0","neutre"},
    }
    key = lbl.strip().lower()
    for canonical, spellings in aliases.items():
        if key in spellings:
            return canonical
    return key  # unknown label: passed through unchanged

# Add the canonical sentiment label next to the raw one
df_sent_tidy["label"] = df_sent_tidy["label_raw"].map(map_sentiment)

# If the mapped labels consist of exactly two classes and one of them is
# "neutral", the neutral rows are removed.
# NOTE(review): after that removal only ONE class is left, so the result is
# not a binary problem. If the source used 0/1 labels, map_sentiment turns
# "0" into "neutral" and this branch would discard that whole class --
# confirm the intended behavior before relying on it.
uniq = set(df_sent_tidy["label"].unique())
if len(uniq) == 2 and "neutral" in uniq:
    df_sent_tidy = df_sent_tidy[df_sent_tidy["label"] != "neutral"].reset_index(drop=True)

# Emotion: labels are only trimmed and lowercased; no taxonomy mapping is applied
df_emot_tidy["label"] = df_emot_tidy["label_raw"].astype(str).str.strip().str.lower()

# Show the resulting class distribution of each task
print("\nSentiment classes & counts:")
display(df_sent_tidy["label"].value_counts())

print("\nEmotion classes & counts (top 15):")
display(df_emot_tidy["label"].value_counts().head(15))

# Persist only the text and canonical label columns (no index column) for reuse
df_sent_tidy[["text","label"]].to_csv("cleaned_sentiment.csv", index=False)
df_emot_tidy[["text","label"]].to_csv("cleaned_emotions.csv", index=False)
print("\nSaved cleaned_sentiment.csv and cleaned_emotions.csv")


[sentiment] kept 3,273 rows (from 3,309)
[emotion] kept 416,098 rows (from 422,746)

Sentiment classes & counts:


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
positive,1659
negative,1614



Emotion classes & counts (top 15):


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
joy,140776
sad,120981
anger,57226
fear,47659
love,34497
suprise,14959



Saved cleaned_sentiment.csv and cleaned_emotions.csv


# Step 3 — Pre-tokenization prep (no modeling yet)

In [16]:
# Step 3: Make stratified train/val splits we’ll reuse later

from sklearn.model_selection import train_test_split

RNG = 42


def _reload_cleaned(csv_path):
    """Read a cleaned CSV back as strings and defensively re-filter it.

    Rows with a missing text or label are dropped, as are rows whose trimmed
    text is at most one character long. The index is left as-is.
    """
    frame = pd.read_csv(csv_path, dtype=str).dropna(subset=["text", "label"])
    long_enough = frame["text"].str.strip().str.len() > 1
    return frame[long_enough]


# Go through the files on disk so this step depends only on the cleaned outputs
df_sent = _reload_cleaned("cleaned_sentiment.csv")
df_emot = _reload_cleaned("cleaned_emotions.csv")

# Stratified 80/20 splits with a fixed seed
split_opts = {"test_size": 0.2, "random_state": RNG}
sent_train, sent_val = train_test_split(df_sent, stratify=df_sent["label"], **split_opts)
emot_train, emot_val = train_test_split(df_emot, stratify=df_emot["label"], **split_opts)

print("Sentiment split:", len(sent_train), "train /", len(sent_val), "val")
print("Emotion split:  ", len(emot_train), "train /", len(emot_val), "val")

# (Optional) sanity: show class balance after split
def show_balance(name, train_df, val_df):
    """Display the per-class share of the ``label`` column for both splits.

    For the train split and then the val split, a heading is printed and the
    normalized label counts, rounded to 3 decimals, are displayed.
    """
    def _class_share(frame):
        return frame["label"].value_counts(normalize=True).round(3)

    print(f"\n{name} — Train balance:")
    display(_class_share(train_df))
    print(f"{name} — Val balance:")
    display(_class_share(val_df))

# Report class balance for each task, sentiment first
for _task, _train_df, _val_df in (
    ("Sentiment", sent_train, sent_val),
    ("Emotion",   emot_train, emot_val),
):
    show_balance(_task, _train_df, _val_df)


Sentiment split: 2618 train / 655 val
Emotion split:   332878 train / 83220 val

Sentiment — Train balance:


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
positive,0.507
negative,0.493


Sentiment — Val balance:


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
positive,0.507
negative,0.493



Emotion — Train balance:


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
joy,0.338
sad,0.291
anger,0.138
fear,0.115
love,0.083
suprise,0.036


Emotion — Val balance:


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
joy,0.338
sad,0.291
anger,0.138
fear,0.115
love,0.083
suprise,0.036
