In [20]:
#loading the dataset
from datasets import load_dataset
dataset = load_dataset('dair-ai/emotion')

In [21]:
#inspecting dataset
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})
{'text': 'i didnt feel humiliated', 'label': 0}


In [22]:
#preprocessing and tokenization
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [24]:
#tokenization
def tokenize_function(example):
    return tokenizer(example['text'], padding = 'max_length', truncation = True, max_length = 128)

tokenized_dataset = dataset.map(tokenize_function, batched = True)

In [25]:
#loading the model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=6  # Match the number of emotion classes
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
dataset["train"].features["label"].names

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

In [14]:
!pip install -U transformers



In [27]:
#training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./emotion-distilbert",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
)

In [28]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [29]:
#the trainer
from transformers import Trainer, TrainingArguments, TrainerCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [31]:
#start finetuing/training a pretrained model
trainer.train()

Step,Training Loss
500,0.1685
1000,0.221
1500,0.1262
2000,0.1155
2500,0.0805
3000,0.0683


TrainOutput(global_step=3000, training_loss=0.1299974733988444, metrics={'train_runtime': 12857.5349, 'train_samples_per_second': 3.733, 'train_steps_per_second': 0.233, 'total_flos': 1589722177536000.0, 'train_loss': 0.1299974733988444, 'epoch': 3.0})

In [13]:
dataset["train"].features["label"]


ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)

In [19]:
import torch
print(torch.__version__)
# should be 2.6.0 or higher


2.5.1


In [32]:
#Saving model
model.save_pretrained("./emotion-distilbert-model")

In [33]:
#Saving tokenizer
tokenizer.save_pretrained("./emotion-distilbert-model")

('./emotion-distilbert-model\\tokenizer_config.json',
 './emotion-distilbert-model\\special_tokens_map.json',
 './emotion-distilbert-model\\vocab.txt',
 './emotion-distilbert-model\\added_tokens.json',
 './emotion-distilbert-model\\tokenizer.json')