<a href="https://colab.research.google.com/github/UpLiftL1f3/Emotion_Sentiment_ML/blob/main/Emotion_Sentiment_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# upgrade pip tooling first
# %pip install -q --upgrade pip setuptools wheel

# remove any conflicting wheels that may be half-installed
# %pip uninstall -y -q numpy scipy scikit-learn array-api-compat

# install a compatible trio for Python 3.12
# %pip install -q --no-cache-dir --force-reinstall \
#  "numpy==2.1.2" "scipy==1.14.1" "scikit-learn==1.5.2"


In [None]:
# import os, time
# print("Restarting runtime to finalize installs…")
# time.sleep(1)
# os._exit(0)


# Imports + versions

In [1]:
import os, re, unicodedata, inspect
import numpy as np
import pandas as pd

import numpy as _np, scipy as _scipy, sklearn as _sk
print("NumPy:", _np.__version__)
print("SciPy:", _scipy.__version__)
print("sklearn:", _sk.__version__)


NumPy: 2.0.2
SciPy: 1.16.3
sklearn: 1.6.1


# Load both CSVs (raw)

In [3]:
SENTIMENT_CSV = "combined_sentiment_data.csv"
EMOTIONS_CSV  = "combined_emotion.csv"


read_opts = dict(
    dtype=str,
    na_values=["", " ", "NA", "NaN", "nan", None],
    keep_default_na=True,
    on_bad_lines="skip",
    encoding_errors="replace"
)

df_sent_raw = pd.read_csv(SENTIMENT_CSV, **read_opts)
df_emot_raw = pd.read_csv(EMOTIONS_CSV, **read_opts)

def guess_cols(df):
    candidates_text  = {"text","tweet","content","sentence","comment","body","review"}
    candidates_label = {"label","sentiment","target","polarity","emotion"}
    text_col  = next((c for c in df.columns if c.lower() in candidates_text),  df.columns[0])
    label_col = next((c for c in df.columns if c.lower() in candidates_label), df.columns[-1])
    return text_col, label_col

sent_text_col, sent_label_col = guess_cols(df_sent_raw)
emot_text_col, emot_label_col = guess_cols(df_emot_raw)

print("Sentiment shape:", df_sent_raw.shape, "| guessed:", sent_text_col, "/", sent_label_col)
print("Emotion   shape:", df_emot_raw.shape, "| guessed:", emot_text_col, "/", emot_label_col)


Sentiment shape: (3309, 2) | guessed: sentence / sentiment
Emotion   shape: (131367, 2) | guessed: sentence / emotion


# Clean + encode (for both datasets)

In [4]:
from sklearn.preprocessing import LabelEncoder

def clean_and_encode(df_raw, text_col, label_col):
    df = df_raw.rename(columns={text_col: "text", label_col: "label"}).copy()
    df = df.dropna(subset=["text","label"])
    df["text"] = df["text"].astype(str).str.strip()
    df = df[df["text"].str.len() > 0]

    # dedupe to avoid leakage later
    df["_norm"] = (
        df["text"].str.lower()
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    df = df.drop_duplicates(subset=["_norm","label"]).drop(columns=["_norm"]).reset_index(drop=True)

    # label encode
    le = LabelEncoder()
    df["label_encoded"] = le.fit_transform(df["label"].astype(str))
    id2label = {i: lab for i, lab in enumerate(le.classes_)}
    label2id = {lab: i for i, lab in id2label.items()}
    return df, id2label, label2id, len(id2label)

df_sent, id2label_sent, label2id_sent, num_labels_sent = clean_and_encode(df_sent_raw, sent_text_col, sent_label_col)
df_emot, id2label_emot, label2id_emot, num_labels_emot = clean_and_encode(df_emot_raw, emot_text_col, emot_label_col)

print(f"[Sentiment] rows={len(df_sent)} classes={num_labels_sent}")
print(f"[Emotion]   rows={len(df_emot)} classes={num_labels_emot}")


[Sentiment] rows=3286 classes=2
[Emotion]   rows=131283 classes=6


# Build a combined multitask dataframe

In [5]:
import pandas as pd
import numpy as np

IGNORE_INDEX = -100  # tells CE loss to ignore missing labels

# Rename encoded columns to task-specific names
sent_df = df_sent[["text", "label_encoded"]].rename(columns={"label_encoded": "label_sent"})
emot_df = df_emot[["text", "label_encoded"]].rename(columns={"label_encoded": "label_emot"})

# Tag rows and add masked columns
sent_df["label_emot"] = IGNORE_INDEX
emot_df["label_sent"] = IGNORE_INDEX

# Union of both datasets (rows from each carry one valid label and one masked)
mtl_df = pd.concat([sent_df, emot_df], ignore_index=True)

# (Optional) Shuffle for mixing tasks
mtl_df = mtl_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Quick peek
print(mtl_df.head(3))
print("Counts:",
      "\n rows:", len(mtl_df),
      "\n sentiment-labeled:", (mtl_df["label_sent"] != IGNORE_INDEX).sum(),
      "\n emotion-labeled:  ", (mtl_df["label_emot"] != IGNORE_INDEX).sum())


                                                text  label_sent  label_emot
0  i am actually feeling pretty good hence the ea...        -100           2
1  i see the pictures he posts on facebook from h...        -100           0
2  i forgive myself for accepting and allowing my...        -100           4
Counts: 
 rows: 134569 
 sentiment-labeled: 3286 
 emotion-labeled:   131283


# Train/Val/Test split (stratify by “has which label”)

In [6]:
from sklearn.model_selection import train_test_split

RNG = 42
has_sent = (mtl_df["label_sent"] != IGNORE_INDEX).astype(int)
has_emot = (mtl_df["label_emot"] != IGNORE_INDEX).astype(int)

# We stratify by which task label exists to keep task mix similar across splits
train_df, hold_df = train_test_split(
    mtl_df, test_size=0.20, random_state=RNG,
    stratify=has_sent + 2*has_emot  # four combos (0/1 per task)
)
val_df, test_df = train_test_split(
    hold_df, test_size=0.50, random_state=RNG,
    stratify=(hold_df["label_sent"] != IGNORE_INDEX).astype(int) + 2*(hold_df["label_emot"] != IGNORE_INDEX).astype(int)
)

print("Sizes → train/val/test:", len(train_df), len(val_df), len(test_df))
print("Train task mix:",
      (train_df["label_sent"] != IGNORE_INDEX).sum(),
      (train_df["label_emot"] != IGNORE_INDEX).sum())


Sizes → train/val/test: 107655 13457 13457
Train task mix: 2629 105026


# Tokenizer + HF Datasets (labels per task)

In [7]:
from datasets import Dataset, Value
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tok(batch):
    enc = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    enc["labels_sent"] = batch["label_sent"]
    enc["labels_emot"] = batch["label_emot"]
    return enc

def to_ds(pdf):
    pdf = pdf.copy()
    pdf["label_sent"] = pdf["label_sent"].astype("int64")
    pdf["label_emot"] = pdf["label_emot"].astype("int64")
    ds = Dataset.from_pandas(pdf[["text","label_sent","label_emot"]], preserve_index=False)
    ds = ds.map(tok, batched=True, remove_columns=["text"])
    ds = ds.cast_column("labels_sent", Value("int64"))
    ds = ds.cast_column("labels_emot", Value("int64"))
    ds.set_format(type="torch",
                  columns=["input_ids","attention_mask","labels_sent","labels_emot"])
    return ds

train_ds = to_ds(train_df)
val_ds   = to_ds(val_df)
test_ds  = to_ds(test_df)

print("len(train_ds):", len(train_ds))
print("feature keys:", train_ds.features.keys())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/107655 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/107655 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/107655 [00:00<?, ? examples/s]

Map:   0%|          | 0/13457 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13457 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13457 [00:00<?, ? examples/s]

Map:   0%|          | 0/13457 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13457 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/13457 [00:00<?, ? examples/s]

len(train_ds): 107655
feature keys: dict_keys(['label_sent', 'label_emot', 'input_ids', 'attention_mask', 'labels_sent', 'labels_emot'])


# Multi-head model (shared DistilBERT + 2 classifiers)

In [8]:
import torch
import torch.nn as nn
from transformers import AutoModel, PreTrainedModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput

num_labels_s = num_labels_sent
num_labels_e = num_labels_emot
IGNORE_INDEX = -100

class MultiHeadDistilBert(nn.Module):
    def __init__(self, base_name, num_labels_sent, num_labels_emot):
        super().__init__()
        self.config = AutoConfig.from_pretrained(base_name)
        self.encoder = AutoModel.from_pretrained(base_name)
        hidden = self.config.dim  # DistilBERT hidden size
        self.dropout = nn.Dropout(0.2)
        self.classifier_sent = nn.Linear(hidden, num_labels_sent)
        self.classifier_emot = nn.Linear(hidden, num_labels_emot)
        self.loss_sent = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
        self.loss_emot = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)

    def forward(self, input_ids=None, attention_mask=None,
                labels_sent=None, labels_emot=None):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # DistilBERT has no pooled output; use CLS token [0]
        cls = out.last_hidden_state[:, 0]  # [batch, hidden]
        cls = self.dropout(cls)

        logits_sent = self.classifier_sent(cls)   # [batch, C_s]
        logits_emot = self.classifier_emot(cls)   # [batch, C_e]

        loss = None
        losses = []
        if labels_sent is not None:
            losses.append(self.loss_sent(logits_sent, labels_sent))
        if labels_emot is not None:
            losses.append(self.loss_emot(logits_emot, labels_emot))
        if losses:
            # average when both present; single when only one present
            loss = sum(losses) / len(losses)

        # Trainer expects 'loss' and 'logits'. We'll also return task logits separately.
        logits_cat = torch.cat([logits_sent, logits_emot], dim=1)
        return {"loss": loss,
                "logits": logits_cat,
                "logits_sent": logits_sent,
                "logits_emot": logits_emot}

model = MultiHeadDistilBert(model_name, num_labels_s, num_labels_e)
sum_params = sum(p.numel() for p in model.parameters())
print("Params (M):", round(sum_params/1e6, 3))


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Params (M): 66.369


# Trainer (we’ll compute metrics ourselves later)

In [9]:
import inspect, transformers
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

print("Transformers:", transformers.__version__)
EVAL_ARG_NAME = "eval_strategy" if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters else "evaluation_strategy"

training_kwargs = dict(
    output_dir="mtl_out",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    seed=42,
    fp16=True,                 # use mixed precision on GPU
    dataloader_num_workers=2,  # a bit more I/O parallelism

)
training_kwargs[EVAL_ARG_NAME] = "epoch"
training_args = TrainingArguments(**training_kwargs)

# No compute_metrics here—Trainer can’t natively handle two heads in its default pipeline.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

n_total = sum(p.numel() for p in model.parameters())
n_train = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Params total={n_total:,} | trainable={n_train:,}")
trainer.train()


Transformers: 4.57.1
Params total=66,369,032 | trainable=66,369,032


Epoch,Training Loss,Validation Loss
1,0.441,
2,0.1536,
3,0.0606,


TrainOutput(global_step=40371, training_loss=0.21840886320152073, metrics={'train_runtime': 1888.0695, 'train_samples_per_second': 228.074, 'train_steps_per_second': 28.51, 'total_flos': 0.0, 'train_loss': 0.21840886320152073, 'epoch': 3.0})

# Evaluate per task (VAL + TEST) with our own loop

In [10]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def eval_task(ds, task="sent"):
    loader = torch.utils.data.DataLoader(ds, batch_size=64)
    model.eval()
    preds, refs = [], []
    key = "labels_sent" if task == "sent" else "labels_emot"
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(model.classifier_sent.weight.device) if hasattr(v, "to") else v for k,v in batch.items()}
            out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            logits = out["logits_sent"] if task == "sent" else out["logits_emot"]
            y = batch[key]
            # keep only rows where label != IGNORE_INDEX
            mask = (y != IGNORE_INDEX)
            if mask.sum() == 0:
                continue
            logits = logits[mask]
            y = y[mask]
            preds.extend(logits.argmax(dim=1).cpu().numpy().tolist())
            refs.extend(y.cpu().numpy().tolist())
    if len(refs) == 0:
        return {}
    acc = accuracy_score(refs, preds)
    avg = "binary" if (task=="sent" and num_labels_s==2) else "macro"
    pr = precision_score(refs, preds, average=avg, zero_division=0)
    rc = recall_score(refs, preds, average=avg, zero_division=0)
    f1 = f1_score(refs, preds, average=avg, zero_division=0)
    f1_micro = f1_score(refs, preds, average="micro", zero_division=0)
    return {"accuracy":acc, "precision":pr, "recall":rc, "f1":f1, "f1_micro":f1_micro}

print("VAL  — sentiment:", eval_task(val_ds, "sent"))
print("VAL  — emotion  :", eval_task(val_ds, "emot"))
print("TEST — sentiment:", eval_task(test_ds, "sent"))
print("TEST — emotion  :", eval_task(test_ds, "emot"))


VAL  — sentiment: {'accuracy': 0.9054878048780488, 'precision': 0.9383561643835616, 'recall': 0.8616352201257862, 'f1': 0.898360655737705, 'f1_micro': 0.9054878048780488}
VAL  — emotion  : {'accuracy': 0.9392185238784371, 'precision': 0.8965149212096718, 'recall': 0.93881903009961, 'f1': 0.9124850324499761, 'f1_micro': 0.9392185238784371}
TEST — sentiment: {'accuracy': 0.9057750759878419, 'precision': 0.930635838150289, 'recall': 0.8944444444444445, 'f1': 0.9121813031161473, 'f1_micro': 0.9057750759878419}
TEST — emotion  : {'accuracy': 0.9400517976843388, 'precision': 0.8994921401097105, 'recall': 0.9384869278681515, 'f1': 0.9132911533449581, 'f1_micro': 0.9400517976843388}


# Inference helper (single text → both predictions)

In [11]:
import torch
import numpy as np

def predict_both(texts, topk_emotion=3):
    if isinstance(texts, str):
        texts = [texts]
    enc = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
    enc = {k: v.to(next(model.parameters()).device) for k,v in enc.items()}
    model.eval()
    with torch.no_grad():
        out = model(**enc)
        ls, le = out["logits_sent"], out["logits_emot"]
        ps = torch.softmax(ls, dim=-1).cpu().numpy()
        pe = torch.softmax(le, dim=-1).cpu().numpy()

    # decode using task label maps
    id2s = id2label_sent
    id2e = id2label_emot

    results = []
    for i in range(len(texts)):
        # sentiment top-1
        s_idx = int(ps[i].argmax())
        s = {"label": id2s[s_idx], "score": float(ps[i][s_idx])}
        # emotion top-k
        top_idx = np.argsort(-pe[i])[:topk_emotion]
        e = [{"label": id2e[int(j)], "score": float(pe[i][j])} for j in top_idx]
        results.append({"text": texts[i], "sentiment": s, "emotion_topk": e})
    return results

# Example:
for r in predict_both(["I absolutely loved the movie!", "This is frustrating and makes me sad."]):
    print(r)


{'text': 'I absolutely loved the movie!', 'sentiment': {'label': 'positive', 'score': 0.999929666519165}, 'emotion_topk': [{'label': 'joy', 'score': 0.9603076577186584}, {'label': 'love', 'score': 0.022962333634495735}, {'label': 'suprise', 'score': 0.014383501373231411}]}
{'text': 'This is frustrating and makes me sad.', 'sentiment': {'label': 'negative', 'score': 0.9997703433036804}, 'emotion_topk': [{'label': 'sad', 'score': 0.9927501678466797}, {'label': 'anger', 'score': 0.006781178992241621}, {'label': 'fear', 'score': 0.00022206295398063958}]}
