Sentiment analysis

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

from datasets import load_dataset
import numpy as np
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorWithPadding, set_seed, EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
set_seed(42)

In [2]:
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

print(f"Number of training instances: {len(dataset['train'])}")
print(f"Example of training instance: {dataset['train'][0]}")

Number of training instances: 31232
Example of training instance: {'id': 9536, 'text': 'Cooking microwave pizzas, yummy', 'label': 2, 'sentiment': 'positive'}


In [3]:
model_name = "distilbert-base-uncased"
num_labels = 3

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def filter_valid_text(example):
    return example['text'] is not None

dataset['test'] = dataset['test'].filter(filter_valid_text)


In [5]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Current device:", torch.cuda.current_device() if torch.cuda.is_available() else "CPU")
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 2070


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [7]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

training_args = TrainingArguments(
    output_dir="./bert-finetuned",
    evaluation_strategy="epoch", 
    save_strategy="epoch", 
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train the model (validation accuracy will be printed automatically)
trainer.train()

# Evaluate on test set after training and print results
print("Evaluating on test set:")
test_results = trainer.evaluate(tokenized_datasets["test"])
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6977,0.586345,0.752738,0.753085,0.756602,0.752738
2,0.5071,0.571584,0.763881,0.764666,0.76608,0.763881
3,0.3972,0.628548,0.758501,0.75781,0.757525,0.758501
4,0.2971,0.748657,0.749087,0.74999,0.751618,0.749087


Evaluating on test set:


eval_loss: 0.5771
eval_accuracy: 0.7681
eval_f1: 0.7685
eval_precision: 0.7691
eval_recall: 0.7681
eval_runtime: 20.1846
eval_samples_per_second: 257.8700
eval_steps_per_second: 8.0750
epoch: 4.0000
