<a href="https://colab.research.google.com/github/ShahyanAhmedKiani/Ai-Projects/blob/main/Multi_Emotion_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

-------------------------------------------
# ✅ Multi-Label Emotion Classification (BERT fine-tuned on GoEmotions)
-------------------------------------------

In [1]:
!pip install transformers datasets scikit-learn torch --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m113.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m877.2 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

-------------------------------------------
# ✅ Import Libraries
-------------------------------------------

In [11]:
import torch
from datasets import load_dataset
from transformers import BertTokenizerFast
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import os

-------------------------------------------
# ✅ Load Dataset
-------------------------------------------

In [12]:
# Pull GoEmotions through `load_dataset`; the returned object is indexed by
# split name ("train", "validation") in the cells that follow.
GO_EMOTIONS_ID = "go_emotions"
dataset = load_dataset(GO_EMOTIONS_ID)

In [13]:
# Tokenizer for the same "bert-base-uncased" checkpoint the model cell loads.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Emotion names are read from the `labels` feature of the train split; their
# count fixes the width of the classifier output.
label_feature = dataset["train"].features["labels"].feature
label_names = label_feature.names
num_labels = len(label_names)

-------------------------------------------
# ✅ Preprocess
-------------------------------------------

In [14]:
# ✅ Preprocessing: tokenize the text and build a float multi-hot label vector
def preprocess(example):
    """Turn one raw example into model-ready features.

    Args:
        example: Mapping with a "text" entry (passed to the tokenizer) and a
            "labels" entry (an iterable of integer label indices).

    Returns:
        The tokenizer's encoding for the text, padded/truncated to 128 tokens,
        with an added "labels" entry: a list of `num_labels` floats that is
        1.0 at every index listed in `example["labels"]` and 0.0 elsewhere.
    """
    # Floats (not ints) are used for the targets.
    multi_hot = [0.0] * num_labels
    for label_idx in example["labels"]:
        multi_hot[label_idx] = 1.0

    features = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    features["labels"] = multi_hot
    return features

In [15]:
# Run `preprocess` over every split one example at a time, then expose only
# the columns the model consumes, formatted as torch tensors.
MODEL_COLUMNS = ["input_ids", "attention_mask", "labels"]
encoded_dataset = dataset.map(preprocess, batched=False)
encoded_dataset.set_format(type="torch", columns=MODEL_COLUMNS)

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

-------------------------------------------
# ✅ Model and Training Setup
-------------------------------------------


In [16]:


# Model: pretrained "bert-base-uncased" with a newly initialised classification
# head of `num_labels` outputs, configured for multi-label targets.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

# Training configuration.
# `report_to="none"` replaces the deprecated WANDB_DISABLED environment
# variable: it switches off wandb (and every other external reporting
# integration). Use report_to="tensorboard" instead if event files in
# `logging_dir` are wanted.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=False,
    report_to="none",
)

# Small subsets keep the run short: 5,000 training examples drawn after a
# seeded shuffle, and the first 1,000 validation examples.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"].shuffle(seed=42).select(range(5000)),
    eval_dataset=encoded_dataset["validation"].select(range(1000)),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
# ✅ Custom loss: force float32 labels before computing BCE-with-logits
def compute_loss(model, inputs, return_outputs=False, num_items_in_batch=None):
    """Compute a binary cross-entropy-with-logits loss for one batch.

    Args:
        model: Callable invoked as ``model(**inputs)``; its result must expose
            a ``.logits`` tensor.
        inputs: Dict of keyword arguments for the model. Its "labels" entry is
            replaced in place by a float32 copy before the forward pass.
        return_outputs: When true, return ``(loss, outputs)`` instead of the
            loss alone.
        num_items_in_batch: Accepted for signature compatibility; not used.

    Returns:
        The loss tensor, or a ``(loss, outputs)`` tuple when
        ``return_outputs`` is true.
    """
    targets = inputs["labels"].type(torch.float32)
    inputs["labels"] = targets

    model_outputs = model(**inputs)
    criterion = torch.nn.BCEWithLogitsLoss()
    batch_loss = criterion(model_outputs.logits, targets)

    if return_outputs:
        return (batch_loss, model_outputs)
    return batch_loss

# Shadow the trainer's own `compute_loss` on this instance with the custom
# function, then fine-tune; the training summary is shown as the cell output.
trainer.compute_loss = compute_loss
train_output = trainer.train()
train_output


Epoch,Training Loss,Validation Loss
1,0.1539,0.159386
2,0.1488,0.152346


TrainOutput(global_step=626, training_loss=0.19196335100137388, metrics={'train_runtime': 234.6104, 'train_samples_per_second': 42.624, 'train_steps_per_second': 2.668, 'total_flos': 657931192320000.0, 'train_loss': 0.19196335100137388, 'epoch': 2.0})

-------------------------------------------
# ✅ Metrics
-------------------------------------------

In [18]:

from sklearn.metrics import f1_score, hamming_loss

def compute_metrics(pred, threshold=0.3):
    """Compute multi-label evaluation metrics from raw logits.

    Args:
        pred: A ``(logits, labels)`` pair; it is unpacked positionally and
            both parts are converted with ``torch.tensor``.
        threshold: Probability cut-off; a label counts as predicted when its
            sigmoid probability is strictly greater than this value. The
            default of 0.3 is the value that used to be hard-coded. Note that
            `predict_emotions` in this notebook thresholds at 0.2 by default,
            so pass the same value to both when the reported metrics should
            describe the predictions actually made.

    Returns:
        Dict with "micro_f1" (micro-averaged F1) and "hamming_loss".
    """
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > threshold).int()
    labels = torch.tensor(labels)
    return {
        # zero_division=0 keeps the score at 0.0 without emitting a warning
        # when no label clears the threshold.
        "micro_f1": f1_score(labels, preds, average='micro', zero_division=0),
        "hamming_loss": hamming_loss(labels, preds)
    }

# Attach the metric function to the existing trainer, then score the
# validation subset; the metrics dict is shown as the cell output.
trainer.compute_metrics = compute_metrics
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.15234579145908356,
 'eval_micro_f1': 0.3237113402061856,
 'eval_hamming_loss': 0.046857142857142854,
 'eval_runtime': 6.396,
 'eval_samples_per_second': 156.348,
 'eval_steps_per_second': 9.85,
 'epoch': 2.0}

-------------------------------------------
# ✅ Predict emotions from custom text
-------------------------------------------

In [19]:

def predict_emotions(text, threshold=0.2):
    """Return the emotion names predicted for a single piece of text.

    Args:
        text: The input string to classify.
        threshold: Probability cut-off; a label is returned when its sigmoid
            probability is strictly greater than this value. The default of
            0.2 is the value that used to be hard-coded.

    Returns:
        List of names from `label_names`, in label-index order, for every
        label whose probability exceeds `threshold`.

    Raises:
        ValueError: If `text` tokenizes to more than one sequence (e.g. a list
            of several strings). Previously that case silently returned [].
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # <-- Ensure eval mode

    # Same 128-token cap as `preprocess`, so inference inputs match the
    # lengths used during fine-tuning.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding=True,
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits

    if logits.shape[0] != 1:
        raise ValueError(
            f"predict_emotions expects a single text, got a batch of {logits.shape[0]}"
        )

    # Index row 0 explicitly instead of squeeze(), which only happens to work
    # for a batch of exactly one sequence.
    selected = (torch.sigmoid(logits)[0] > threshold).tolist()
    return [name for name, is_on in zip(label_names, selected) if is_on]

# ✅ Example usage: classify one sentence with the fine-tuned model
test_text = "I'm feeling so excited and thankful for everything today!"
predicted_emotions = predict_emotions(test_text)
print("Predicted Emotions:", predicted_emotions)

Predicted Emotions: ['gratitude']


In [20]:
from transformers import pipeline

# Baseline for comparison: a separately trained, off-the-shelf emotion
# classifier run on the same sentence. top_k=None makes the pipeline return a
# score for every label rather than only the best one.
BASELINE_MODEL_ID = "j-hartmann/emotion-english-distilroberta-base"
classifier = pipeline("text-classification", model=BASELINE_MODEL_ID, top_k=None)
classifier("I'm feeling so excited and thankful for everything today!")


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


[[{'label': 'joy', 'score': 0.9814070463180542},
  {'label': 'surprise', 'score': 0.010979062877595425},
  {'label': 'neutral', 'score': 0.0034232111647725105},
  {'label': 'anger', 'score': 0.002112366259098053},
  {'label': 'fear', 'score': 0.0007776639540679753},
  {'label': 'sadness', 'score': 0.00076244433876127},
  {'label': 'disgust', 'score': 0.0005381593364290893}]]

In [22]:
# from transformers import BertTokenizerFast

# # Save model and tokenizer
# model_path = "./goemotions_bert"
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)

# # Zip it for download
# import shutil
# shutil.make_archive("goemotions_bert", 'zip', model_path)

# # Download
# from google.colab import files
# files.download("goemotions_bert.zip")
