In [1]:
from datasets import load_dataset

dataset = load_dataset("go_emotions")

In [2]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})


In [3]:
dataset = dataset.remove_columns("id")

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 5427
    })
})


In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

In [25]:
def assemble_function(example):
    inputs = tokenizer(example["text"], max_length=512 , truncation=True, padding="max_length", return_tensors="pt")
    return {"input_ids" : inputs["input_ids"]}

In [26]:
dataset = dataset.map(assemble_function, batched=True)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [27]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 5427
    })
})


In [16]:
def single_label_fn(example):
    return {"labels" : example["labels"][0]}

In [19]:
dataset = dataset.map(single_label_fn)

In [28]:
dataset["train"]["labels"]

[27,
 27,
 2,
 14,
 3,
 26,
 15,
 8,
 0,
 27,
 6,
 1,
 27,
 5,
 3,
 3,
 15,
 2,
 27,
 6,
 6,
 12,
 27,
 27,
 27,
 2,
 27,
 16,
 15,
 27,
 2,
 6,
 27,
 2,
 6,
 17,
 27,
 0,
 25,
 27,
 0,
 15,
 16,
 27,
 7,
 10,
 20,
 27,
 27,
 27,
 27,
 27,
 4,
 27,
 13,
 10,
 27,
 27,
 27,
 15,
 0,
 12,
 27,
 13,
 27,
 0,
 27,
 1,
 27,
 0,
 27,
 0,
 3,
 27,
 27,
 27,
 0,
 0,
 27,
 1,
 13,
 4,
 25,
 4,
 27,
 25,
 0,
 9,
 4,
 27,
 4,
 27,
 24,
 18,
 4,
 27,
 7,
 27,
 7,
 27,
 0,
 3,
 10,
 27,
 27,
 5,
 27,
 6,
 27,
 15,
 27,
 0,
 22,
 27,
 17,
 27,
 2,
 2,
 27,
 27,
 9,
 4,
 27,
 3,
 2,
 26,
 7,
 2,
 27,
 0,
 15,
 27,
 11,
 27,
 27,
 9,
 7,
 22,
 27,
 2,
 3,
 13,
 27,
 27,
 13,
 9,
 15,
 23,
 1,
 0,
 1,
 15,
 7,
 10,
 27,
 27,
 0,
 14,
 18,
 27,
 4,
 27,
 3,
 27,
 5,
 27,
 0,
 27,
 14,
 6,
 18,
 27,
 20,
 27,
 27,
 0,
 27,
 0,
 17,
 27,
 26,
 7,
 15,
 0,
 0,
 10,
 27,
 17,
 27,
 7,
 4,
 2,
 3,
 4,
 0,
 27,
 27,
 11,
 27,
 18,
 27,
 7,
 7,
 9,
 17,
 10,
 27,
 15,
 0,
 27,
 4,
 18,
 27,
 6,
 27,
 1,
 27,
 

In [24]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids'],
        num_rows: 5427
    })
})


In [9]:
import evaluate

accuracy = evaluate.load("accuracy")

In [10]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred,
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predicions=predictions, references=labels)

In [11]:
print(len(dataset["train"][0]["labels"]))

1


In [15]:
dataset["train"]["labels"][0]

[27]

In [21]:
id2label = {0 : "admiration", 1 : "amusement", 2 : "anger", 3 : "annoyance", 4 : "approval", 5 : "caring", 6 : "confusion", 7 : "curiosity", 8 : "desire",
    9 : "disappointment", 10 : "disapproval", 11 : "disgust", 12 : "embarrassment", 13 : "excitement", 14 : "fear", 15 : "gratitude", 16 : "grief",
    17 : "joy", 18 : "love", 19 : "nervousness", 20 : "optimism", 21 : "pride", 22 : "realization", 23 : "relief", 24 : "remorse", 25 : "sadness", 26 : "surprise",
    27 :"neutral"}

label2id = {label : id for id, label in id2label.items()}

In [22]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("albert-base-v2", num_labels=len(id2label), id2label=id2label, label2id=label2id)




Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir = "output",
    learning_rate=1e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs=1,
    weight_decay=0.01, # lambda in regularizationn
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model = model,
    args=training_args,
    train_dataset = dataset["train"],
    eval_dataset = dataset["validation"],
    processing_class = tokenizer,
    compute_metrics = compute_metrics,
)

trainer.train()

In [30]:
text = "i am sad am i not"

In [None]:
from transformers import pipeline

classifier = pipeline("test_classification", model="output")
classifier(text)