### Imports

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true" # to disable usage of API token

from datasets import load_dataset
import numpy as np
import torch
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorWithPadding, set_seed, EarlyStoppingCallback,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
set_seed(42) #for reproducibility

  from .autonotebook import tqdm as notebook_tqdm
W0529 17:03:59.871000 25248 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


### Load dataset and print its structure

In [2]:
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

print(f"Number of training instances: {len(dataset['train'])}")
print(f"Example of training instance: {dataset['train'][0]}")

Number of training instances: 31232
Example of training instance: {'id': 9536, 'text': 'Cooking microwave pizzas, yummy', 'label': 2, 'sentiment': 'positive'}


### Define model and tokonizer

In [3]:
model_name = "distilbert-base-uncased"
num_labels = 3

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Filter data to not allow None type texts

In [4]:
def filter_valid_text(example):
    return example['text'] is not None

dataset['test'] = dataset['test'].filter(filter_valid_text)


Filter: 100%|██████████| 5206/5206 [00:00<00:00, 81502.99 examples/s]


### Make sure you use CUDA if avaliable

In [5]:
print("CUDA available:", torch.cuda.is_available())
print("Current device:", torch.cuda.current_device() if torch.cuda.is_available() else "CPU")
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 2070


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Training process and evaluation (IMPROTANT: THE EVALUATION IS NOT ON THE FINAL TEST DATASET)

In [6]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

training_args = TrainingArguments(
    output_dir="./bert-finetuned",
    evaluation_strategy="epoch", 
    save_strategy="epoch", 
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.1,
    save_total_limit=2,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

print("Evaluating on test set:")
test_results = trainer.evaluate(tokenized_datasets["test"])
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

Map: 100%|██████████| 31232/31232 [00:02<00:00, 14558.50 examples/s]
Map: 100%|██████████| 5205/5205 [00:00<00:00, 16713.56 examples/s]
Map: 100%|██████████| 5205/5205 [00:00<00:00, 11208.72 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  5%|▌         | 500/9760 [02:00<37:22,  4.13it/s]

{'loss': 0.6964, 'grad_norm': 4.78527307510376, 'learning_rate': 1.8975409836065574e-05, 'epoch': 0.51}


                                                  
 10%|█         | 976/9760 [04:12<36:46,  3.98it/s]

{'eval_loss': 0.5888040065765381, 'eval_accuracy': 0.7513928914505283, 'eval_f1': 0.7524737834123425, 'eval_precision': 0.759790256753315, 'eval_recall': 0.7513928914505283, 'eval_runtime': 14.7666, 'eval_samples_per_second': 352.484, 'eval_steps_per_second': 11.038, 'epoch': 1.0}


 10%|█         | 1000/9760 [04:19<38:16,  3.81it/s]  

{'loss': 0.6016, 'grad_norm': 5.0308837890625, 'learning_rate': 1.795081967213115e-05, 'epoch': 1.02}


 15%|█▌        | 1500/9760 [06:33<37:03,  3.71it/s]

{'loss': 0.5081, 'grad_norm': 3.0630950927734375, 'learning_rate': 1.6926229508196722e-05, 'epoch': 1.54}


                                                   
 20%|██        | 1952/9760 [08:49<34:45,  3.74it/s]

{'eval_loss': 0.5711972117424011, 'eval_accuracy': 0.7617675312199808, 'eval_f1': 0.7624628660887482, 'eval_precision': 0.7637008806225252, 'eval_recall': 0.7617675312199808, 'eval_runtime': 14.5838, 'eval_samples_per_second': 356.902, 'eval_steps_per_second': 11.177, 'epoch': 2.0}


 20%|██        | 2000/9760 [09:04<34:29,  3.75it/s]   

{'loss': 0.4879, 'grad_norm': 5.2484846115112305, 'learning_rate': 1.5901639344262295e-05, 'epoch': 2.05}


 26%|██▌       | 2500/9760 [11:18<32:21,  3.74it/s]

{'loss': 0.397, 'grad_norm': 7.1436262130737305, 'learning_rate': 1.4877049180327869e-05, 'epoch': 2.56}


                                                   
 30%|███       | 2928/9760 [13:26<30:22,  3.75it/s]

{'eval_loss': 0.6225201487541199, 'eval_accuracy': 0.7552353506243996, 'eval_f1': 0.7547927585710251, 'eval_precision': 0.7544927610323205, 'eval_recall': 0.7552353506243996, 'eval_runtime': 14.6865, 'eval_samples_per_second': 354.407, 'eval_steps_per_second': 11.099, 'epoch': 3.0}


 31%|███       | 3000/9760 [13:48<30:11,  3.73it/s]   

{'loss': 0.3904, 'grad_norm': 10.01522159576416, 'learning_rate': 1.3852459016393445e-05, 'epoch': 3.07}


 36%|███▌      | 3500/9760 [16:02<27:47,  3.75it/s]

{'loss': 0.2975, 'grad_norm': 9.432467460632324, 'learning_rate': 1.2827868852459017e-05, 'epoch': 3.59}


                                                   
 40%|████      | 3904/9760 [18:05<25:54,  3.77it/s]

{'eval_loss': 0.7500736117362976, 'eval_accuracy': 0.753314121037464, 'eval_f1': 0.7529401159490584, 'eval_precision': 0.7528709956108933, 'eval_recall': 0.753314121037464, 'eval_runtime': 14.7315, 'eval_samples_per_second': 353.325, 'eval_steps_per_second': 11.065, 'epoch': 4.0}


 40%|████      | 3904/9760 [18:07<27:11,  3.59it/s]


{'train_runtime': 1087.4862, 'train_samples_per_second': 287.194, 'train_steps_per_second': 8.975, 'train_loss': 0.4631987419284758, 'epoch': 4.0}
Evaluating on test set:


100%|██████████| 163/163 [00:14<00:00, 11.00it/s]

eval_loss: 0.5788
eval_accuracy: 0.7675
eval_f1: 0.7676
eval_precision: 0.7678
eval_recall: 0.7675
eval_runtime: 15.0157
eval_samples_per_second: 346.6360
eval_steps_per_second: 10.8550
epoch: 4.0000





### Load the model from the checkpoint if needed and evaluate it on FINAL DATASET

In [7]:
modelpath = "bert-finetuned/checkpoint-3904" #change if needed
model = AutoModelForSequenceClassification.from_pretrained(
    modelpath,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

df = pd.read_csv("../test-datasets/sentiment-topic-test.tsv", sep="\t")
sentences = df["sentence"].tolist()
true_labels = df["sentiment"].tolist()

model.eval()
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

label_map = {0: "negative", 1: "neutral", 2: "positive"}
inverse_label_map = {v: k for k, v in label_map.items()}
predicted_labels = [label_map[p.item()] for p in predictions]
true_label_ids = [inverse_label_map[label] for label in true_labels]

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predicted_labels, labels=["negative", "neutral", "positive"]))

for index in range(len(sentences)):
    print(f"Sentence: {sentences[index]}, true label: {true_labels[index]}, predicted labels: {predicted_labels[index]}")



Classification Report:
              precision    recall  f1-score   support

    negative       0.80      0.67      0.73         6
     neutral       0.67      0.67      0.67         6
    positive       0.57      0.67      0.62         6

    accuracy                           0.67        18
   macro avg       0.68      0.67      0.67        18
weighted avg       0.68      0.67      0.67        18


Confusion Matrix:
[[4 1 1]
 [0 4 2]
 [1 1 4]]
Sentence: The stadium was alive with the roar of the crowd after that incredible win., true label: positive, predicted labels: positive
Sentence: That last-minute goal had me jumping out of my seat—what an unbelievable finish!, true label: positive, predicted labels: positive
Sentence: I couldn’t put the book down; it swept me into a whole new world., true label: positive, predicted labels: negative
Sentence: The story had its moments, though some parts felt like they dragged on a bit., true label: neutral, predicted labels: neutral
Sentence: 