In [3]:
!pip install -q transformers datasets torch evaluate scikit-learn


In [4]:
import torch
import numpy as np

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import hamming_loss, f1_score


In [5]:
# Load the GoEmotions multi‑label emotion dataset
#    “simplified” gives you the 27‑class version that’s easiest to start with
dataset = load_dataset("go_emotions", "simplified")

# Peek at train/validation splits and a sample
print(dataset)
print(dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})
{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}


In [6]:
# Tokenize the texts with a pretrained tokenizer
from transformers import AutoTokenizer

MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# simple preprocessing fn to apply to every split
def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

# apply to train/validation/test
encoded_ds = dataset.map(tokenize_batch, batched=True)

# drop the raw text column (we’ll use input_ids, attention_mask)
encoded_ds = encoded_ds.remove_columns("text")


Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

In [7]:
# ——— Load the HF emotion dataset into raw_datasets ———
from datasets import load_dataset

raw_datasets = load_dataset("emotion")


In [22]:
# ——— 0) Figure out how many emotion labels we have ———
label_list = raw_datasets["train"].features["label"].names
num_labels = len(label_list)
print(f"{num_labels} emotion classes:", label_list)


6 emotion classes: ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "bert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# ——— 2) Tokenize all splits ———
def tokenize_batch(batch):
    return tokenizer(
        batch["text"],            # field in raw_datasets
        padding="max_length",     # pad all to same length
        truncation=True,          # cut off long texts
        max_length=128            # you can lower this if you need speed
    )

# apply to train/validation/test
encoded_ds = raw_datasets.map(tokenize_batch, batched=True)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [12]:
print(encoded_ds["train"].column_names)


['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [1]:
# ─── Quick Smoke‑Test: DistilBERT on 2k samples ───
!pip install -q transformers datasets torch sklearn

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, hamming_loss

# 1) Load tiny emotion dataset
raw = load_dataset("emotion")

# 2) Figure out labels
label_list = raw["train"].features["label"].names
num_labels = len(label_list)

# 3) Tokenize
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
def tok(b): return tokenizer(b["text"], padding="max_length", truncation=True, max_length=128)
enc = raw.map(tok, batched=True).rename_column("label","labels").remove_columns("text")
enc.set_format("torch", ["input_ids","attention_mask","labels"])

# 4) Small subsets
train_small = enc["train"].shuffle(seed=42).select(range(2000))
val_small   = enc["validation"].shuffle(seed=42).select(range(500))

# 5) Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels
)

# 6) Training args (no W&B, 1 epoch)
args = TrainingArguments(
    output_dir="quick-run",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    logging_steps=50,
    save_steps=100,
    report_to="none"
)

# 7) Metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=-1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1_macro": f1_score(p.label_ids, preds, average="macro"),
        "hamming":   hamming_loss(p.label_ids, preds)
    }

# 8) Trainer & train
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_small,
    eval_dataset=val_small,
    compute_metrics=compute_metrics
)
trainer.train()
trainer.evaluate()


  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,1.4259


{'eval_loss': 1.1871280670166016,
 'eval_accuracy': 0.56,
 'eval_f1_macro': 0.23150183150183148,
 'eval_hamming': 0.44,
 'eval_runtime': 87.7707,
 'eval_samples_per_second': 5.697,
 'eval_steps_per_second': 0.091,
 'epoch': 1.0}

In [13]:
test_metrics = trainer.evaluate(encoded_ds["test"])
print(test_metrics)


{'eval_loss': 1.1147234439849854, 'eval_accuracy': 0.596, 'eval_f1_macro': 0.2450532701687055, 'eval_hamming': 0.404, 'eval_runtime': 354.2298, 'eval_samples_per_second': 5.646, 'eval_steps_per_second': 0.09, 'epoch': 1.0}


In [14]:
model.save_pretrained("my_emotion_model")
tokenizer.save_pretrained("my_emotion_model")


('my_emotion_model/tokenizer_config.json',
 'my_emotion_model/special_tokens_map.json',
 'my_emotion_model/vocab.txt',
 'my_emotion_model/added_tokens.json',
 'my_emotion_model/tokenizer.json')

In [15]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

tok = AutoTokenizer.from_pretrained("my_emotion_model")
mdl = AutoModelForSequenceClassification.from_pretrained("my_emotion_model")

def predict_emotions(text):
    inputs = tok(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        logits = mdl(**inputs).logits
    probs = torch.sigmoid(logits).squeeze().tolist()  # multi‑label
    return {label_list[i]: float(probs[i]) for i in range(len(label_list))}

print(predict_emotions("I’m so excited for today!"))


{'sadness': 0.5237559676170349, 'joy': 0.4476841986179352, 'love': 0.4250728189945221, 'anger': 0.4714941382408142, 'fear': 0.5202828049659729, 'surprise': 0.46444186568260193}
