In [1]:
from datasets import load_dataset

# Load the "go_emotions" dataset; the result is indexed by split name below
# and again later when the label names are looked up.
dataset = load_dataset("go_emotions")
# Training split: the input to tokenization and fine-tuning further down.
train_data = dataset["train"]


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 43410/43410 [00:00<00:00, 602330.05 examples/s]
Generating validation split: 100%|██████████| 5426/5426 [00:00<00:00, 1330505.32 examples/s]
Generating test split: 100%|██████████| 5427/5427 [00:00<00:00, 1075833.62 examples/s]


In [2]:
from transformers import BertTokenizer

# Tokenizer for the "bert-base-uncased" checkpoint; shared by `preprocess`
# (training data) and `predict_emotions` (inference).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def preprocess(batch, num_labels=28):
    """Tokenize one example and attach a multi-hot label vector.

    Despite the parameter name, this is written for ``map(..., batched=False)``:
    ``batch["text"]`` is handed to the tokenizer as-is and ``batch["labels"]``
    is iterated as a collection of class indices for that single example.

    Args:
        batch: Mapping with a ``"text"`` entry and a ``"labels"`` entry
            (iterable of integer class indices, each ``< num_labels``).
        num_labels: Length of the multi-hot vector. Defaults to 28, the value
            also passed as ``num_labels`` when the model is created.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an extra
        ``"labels"`` entry: a list of ``num_labels`` floats, 1.0 at every
        index listed in ``batch["labels"]`` and 0.0 elsewhere.
    """
    encoding = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)
    # Floats, not ints: the model is built with
    # problem_type="multi_label_classification", whose BCE-with-logits loss
    # requires floating-point targets; integer targets fail at loss time.
    multi_hot = [0.0] * num_labels
    for idx in batch["labels"]:
        multi_hot[idx] = 1.0
    encoding["labels"] = multi_hot
    return encoding

# Apply `preprocess` example-by-example and drop the raw "text"/"labels"
# columns; the function's own "labels" output takes the place of the latter.
encoded_dataset = train_data.map(
    preprocess,
    batched=False,
    remove_columns=["text", "labels"],
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 43410/43410 [00:12<00:00, 3401.60 examples/s]


In [1]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch

# BERT encoder with a 28-way multi-label classification head
# (28 matches the length of the label vector built in `preprocess`).
# NOTE(review): the recorded run of this cell failed with
# "OSError: bert-base-uncased does not appear to have a file named
# pytorch_model.bin, ..." -- possibly an incomplete local cache or a local
# directory named "bert-base-uncased" shadowing the hub id; confirm.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=28,
)

# Three epochs at batch size 16, evaluating and checkpointing once per epoch.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed
# version before changing it.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

from sklearn.metrics import f1_score, hamming_loss

def compute_metrics(eval_pred):
    """Turn raw logits into thresholded predictions and score them.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Logits are passed through a
            sigmoid and counted as positive when the result exceeds 0.5.
            Labels are presumably multi-hot arrays shaped like the logits
            (TODO confirm against the Trainer's output).

    Returns:
        Dict with keys ``"f1_micro"``, ``"f1_macro"`` and ``"hamming_loss"``.
    """
    raw_scores, gold = eval_pred
    probabilities = torch.sigmoid(torch.tensor(raw_scores))
    binary_preds = probabilities.numpy() > 0.5

    metrics = {}
    for averaging in ("micro", "macro"):
        metrics[f"f1_{averaging}"] = f1_score(gold, binary_preds, average=averaging)
    metrics["hamming_loss"] = hamming_loss(gold, binary_preds)
    return metrics

# Evaluate on the dataset's own validation split instead of a slice of the
# training set: scoring on rows the model is being trained on leaks data and
# inflates the reported F1 / Hamming loss. The split goes through the same
# `preprocess` step (and column removal) as the training data so both share
# one schema.
eval_dataset = dataset["validation"].map(
    preprocess, remove_columns=["text", "labels"], batched=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune; evaluation and checkpointing cadence comes from `training_args`.
trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


OSError: bert-base-uncased does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.

In [None]:
def predict_emotions(text, threshold=0.5):
    """Return the indices of the emotion labels predicted for ``text``.

    Args:
        text: Input passed to the tokenizer. Only the first row of the model
            output is used, so this is meant to be a single string.
        threshold: A label is reported when its sigmoid probability is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        List of integer label indices, in ascending order.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # The model may sit on an accelerator after training; feeding it CPU
    # tensors would raise a device-mismatch error.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference must run in eval mode (dropout off) and without autograd
    # bookkeeping; the previous mode is restored so training can resume.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    probs = torch.sigmoid(logits)[0].cpu().numpy()
    return [i for i, p in enumerate(probs) if p > threshold]

# Human-readable names for the label indices, read from the dataset schema.
label_map = dataset["train"].features["labels"].feature.names

# Smoke test: classify one sentence and print the predicted emotion names.
example = "I'm so proud and joyful today!"
predicted_labels = [
    label_map[label_id]
    for label_id in predict_emotions(example)
]
print(predicted_labels)
