In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset, Dataset, DatasetDict
import torch


In [3]:
# Load the emotion dataset
dataset = load_dataset("emotion")

# Convert dataset splits to pandas DataFrame
train_data = dataset["train"].to_pandas()
validation_data = dataset["validation"].to_pandas() if "validation" in dataset else None
test_data = dataset["test"].to_pandas() if "test" in dataset else None


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
def get_small_subset(data):
    subset_data = data.groupby("label").head(50)
    subset_data = subset_data.reset_index(drop=True)
    return subset_data

small_subset_train_data = get_small_subset(train_data)
small_subset_validation_data = get_small_subset(validation_data) if validation_data is not None else None
small_subset_test_data = get_small_subset(test_data) if test_data is not None else None

# Convert the subset DataFrames back to DatasetDict object
small_subset_train_dataset = Dataset.from_pandas(small_subset_train_data)
small_subset_validation_dataset = Dataset.from_pandas(small_subset_validation_data) if small_subset_validation_data is not None else None
small_subset_test_dataset = Dataset.from_pandas(small_subset_test_data) if small_subset_test_data is not None else None


In [5]:

small_subset_dataset_dict = DatasetDict({
    "train": small_subset_train_dataset,
    "validation": small_subset_validation_dataset,
    "test": small_subset_test_dataset
})

# Preprocess function for tokenization
def preprocess_data(examples):
    """
    Tokenizes the text data.

    Args:
        examples (dict): A dictionary containing text data.

    Returns:
        dict: A dictionary containing tokenized text.
    """
    return tokenizer(examples['text'], padding=True, truncation=True)

In [6]:
# Load the pre-trained model checkpoint
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and encode the dataset
encoded_data = small_subset_dataset_dict.map(preprocess_data, batched=True, batch_size=5568)
encoded_data = encoded_data.remove_columns(['text'])

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [11]:
# Split the dataset into train, validation, and test sets
train_set = encoded_data["train"]
validation_set = encoded_data["validation"] 
test_set = encoded_data["test"] 

# Get the number of labels from the dataset
num_labels = len(set(train_set["label"]))

# Define label-to-ID and ID-to-label dictionaries
label_to_id = {"joy": 0, "sadness": 1, "anger": 2, "fear": 3, "surprise": 4, "disgust": 5}  
id_to_label = {0: "joy", 1: "sadness", 2: "anger", 3: "fear", 4: "surprise", 5: "disgust"}  

In [12]:
# Load the model for sequence classification with the appropriate number of labels
model = (
    AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, id2label=label_to_id, label2id=id_to_label
    )
    .to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Define the evaluation metric
def compute_evaluation_metrics(predictions):
    labels = predictions.label_ids
    preds = predictions.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Define training arguments
batch_size = 32
learning_rate = 3e-5
num_epochs = 5

In [14]:
training_args = TrainingArguments(
    output_dir="fine-tuned-model",
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_steps=len(train_set) // batch_size,
)

# Create a Trainer instance for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_set,
    eval_dataset=validation_set,
    compute_metrics=compute_evaluation_metrics,
)


In [15]:
# Train the model
trainer.train()

test_results = trainer.evaluate(eval_dataset=test_set)

test_results


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.8156,1.785618,0.203333,0.13125
2,1.7791,1.763431,0.233333,0.189638
3,1.7239,1.730223,0.256667,0.223351
4,1.6842,1.701584,0.246667,0.216023
5,1.6228,1.687967,0.3,0.275915


{'eval_loss': 1.694712519645691,
 'eval_accuracy': 0.31666666666666665,
 'eval_f1': 0.2920252900261943,
 'eval_runtime': 15.5813,
 'eval_samples_per_second': 19.254,
 'eval_steps_per_second': 0.642,
 'epoch': 5.0}

In [16]:
# Example sentences
sentences = ["I feel so happy today!", "This news is really sad."]

# Tokenize the sentences
tokenized_sentences = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Move the input tensors to the appropriate device
input_ids = tokenized_sentences["input_ids"].to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
attention_mask = tokenized_sentences["attention_mask"].to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Make predictions
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Get predicted labels
predicted_labels = torch.argmax(outputs.logits, dim=1)

# Map predicted labels to emotions
predicted_emotions = [id_to_label[label.item()] for label in predicted_labels]

# Print the predicted emotions for each sentence
for sentence, emotion in zip(sentences, predicted_emotions):
    print(f"Sentence: {sentence} \t Predicted Emotion: {emotion}")


Sentence: I feel so happy today! 	 Predicted Emotion: sadness
Sentence: This news is really sad. 	 Predicted Emotion: joy
