Sentiment analysis

In [1]:
import os
# Switch off the Weights & Biases logging integration through its environment
# variable. It is set here, ahead of the transformers import below.
# NOTE(review): transformers reports this variable as deprecated and recommends
# the `report_to` setting instead (see the warning printed by the training cell).
os.environ["WANDB_DISABLED"] = "true"

from datasets import load_dataset
import numpy as np
import torch
import pandas as pd
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorWithPadding, set_seed, EarlyStoppingCallback,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
# Fix the random seed (42) so that repeated runs of the notebook are comparable.
set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm
W0526 14:59:05.074000 3332 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
# Fetch the multiclass sentiment dataset by its Hugging Face Hub identifier.
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

# Sanity check: how large is the training split and what does one record look like?
train_split = dataset['train']
print(f"Number of training instances: {len(train_split)}")
print(f"Example of training instance: {train_split[0]}")

Number of training instances: 31232
Example of training instance: {'id': 9536, 'text': 'Cooking microwave pizzas, yummy', 'label': 2, 'sentiment': 'positive'}


In [3]:
# Base checkpoint to fine-tune.
model_name = "distilbert-base-uncased"

# Label scheme of the dataset: integer class id -> sentiment name
# (the sample training record printed above carries label 2 / 'positive').
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: class_id for class_id, name in id2label.items()}
num_labels = len(id2label)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Passing the label mapping stores it in the model config, so every checkpoint
# written during training is self-describing (config.id2label) instead of
# exposing generic LABEL_0 / LABEL_1 / LABEL_2 names.
# The classification head on top of the encoder is newly initialised and only
# becomes useful after fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
def filter_valid_text(example):
    """Tell whether a record has a usable 'text' field.

    Args:
        example: Mapping with a 'text' key (one dataset record).

    Returns:
        False when the 'text' value is None, True for any other value.
    """
    text = example['text']
    return text is not None

# Remove records without text from every split, not only 'test': a None text in
# 'train' or 'validation' would make the tokenizer fail in exactly the same way.
# DatasetDict.filter applies the predicate to each split and returns a new
# DatasetDict, so re-running this cell is harmless.
dataset = dataset.filter(filter_valid_text)


NameError: name 'dataset' is not defined

In [None]:
# Ask CUDA for its status once and reuse the answer for the report and the device choice.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
print("Current device:", torch.cuda.current_device() if has_cuda else "CPU")
print("Device name:", torch.cuda.get_device_name(0) if has_cuda else "CPU")

# Prefer the GPU when there is one; fall back to the CPU otherwise.
device = torch.device("cuda") if has_cuda else torch.device("cpu")
# Kept as the last expression of the cell so the notebook echoes the moved model.
model.to(device)

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 2070


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
def preprocess_function(examples):
    """Tokenize a batch of records for the model.

    Args:
        examples: Batch handed over by ``dataset.map(..., batched=True)``; its
            ``'text'`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, with every sequence truncated to at
        most 128 tokens and left unpadded.
    """
    # Truncate only. Padding is left to the DataCollatorWithPadding given to the
    # Trainer, which pads each batch to its own longest sequence; padding every
    # example to 128 tokens here would make that collator a no-op and spend
    # compute on pad tokens.
    return tokenizer(examples['text'], truncation=True, max_length=128)

# Tokenize every split in batches, then expose the target column as "labels".
tokenized_datasets = (
    dataset
    .map(preprocess_function, batched=True)
    .rename_column("label", "labels")
)
# Serve only the model inputs and the targets, formatted as torch tensors.
tokenized_datasets.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Collator that pads the examples of a batch using the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Turn raw model outputs into classification scores.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with 'accuracy', 'f1', 'precision' and 'recall'; the last three
        are computed with ``average='weighted'``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    # Tuple layout: (precision, recall, f-score, support).
    weighted = precision_recall_fscore_support(gold, predicted, average='weighted')
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': weighted[2],
        'precision': weighted[0],
        'recall': weighted[1],
    }

# Training configuration: evaluate and checkpoint once per epoch so that the best
# epoch can be restored when training ends.
training_args = TrainingArguments(
    output_dir="./bert-finetuned",
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match the evaluation schedule for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,  # upper bound; early stopping usually ends training sooner
    weight_decay=0.1,
    save_total_limit=2,  # keep at most two checkpoints on disk
    # No metric_for_best_model is given, so the best checkpoint is chosen by
    # validation loss (the library default).
    load_best_model_at_end=True,
    # Disable all logging integrations explicitly. This is the supported
    # replacement for the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Early stopping with a patience of 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Wire model, data, batching and metrics together; validation drives the
# per-epoch evaluation, the test split is held back for the final check.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    callbacks=[early_stopping],
)

# Fine-tune the model with the configuration above.
trainer.train()

# Score the held-out test split and list every reported metric with 4 decimals.
print("Evaluating on test set:")
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
for metric_name in test_results:
    print(f"{metric_name}: {test_results[metric_name]:.4f}")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6977,0.586045,0.754467,0.754863,0.758025,0.754467
2,0.5076,0.571542,0.76292,0.763687,0.765031,0.76292
3,0.3979,0.6274,0.759462,0.758869,0.758618,0.759462
4,0.2984,0.745823,0.748895,0.749793,0.751434,0.748895


Evaluating on test set:


eval_loss: 0.5770
eval_accuracy: 0.7664
eval_f1: 0.7668
eval_precision: 0.7674
eval_recall: 0.7664
eval_runtime: 20.1520
eval_samples_per_second: 258.2860
eval_steps_per_second: 8.0890
epoch: 4.0000


In [None]:
# Checkpoint to evaluate. The training log shows the lowest validation loss after
# epoch 2, i.e. step 1952 (976 steps per epoch); checkpoint-3904 is the last epoch,
# which has the highest validation loss of the run. Change if the model is retrained.
modelpath = "bert-finetuned/checkpoint-1952"
model = AutoModelForSequenceClassification.from_pretrained(
    modelpath,
    # NOTE(review): training above does not request reduced precision; confirm
    # that loading the weights as bfloat16 for inference is intended.
    torch_dtype=torch.bfloat16,
)
# The tokenizer is taken from the base checkpoint name defined earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tab-separated test file; the 'sentence' and 'sentiment' columns are read.
df = pd.read_csv("../test-datasets/sentiment-topic-test.tsv", sep="\t")
sentences = df["sentence"].tolist()
true_labels = df["sentiment"].tolist()

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
# All sentences are encoded and classified as one batch.
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Class id <-> sentiment name, matching the label scheme of the training data.
label_map = {0: "negative", 1: "neutral", 2: "positive"}
inverse_label_map = {v: k for k, v in label_map.items()}
predicted_labels = [label_map[p.item()] for p in predictions]
# Raises KeyError when the file contains a sentiment name outside label_map.
true_label_ids = [inverse_label_map[label] for label in true_labels]

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))

print("\nConfusion Matrix:")
# Rows and columns follow the order given in `labels`.
print(confusion_matrix(true_labels, predicted_labels, labels=["negative", "neutral", "positive"]))

# Per-sentence listing of gold and predicted sentiment.
for sentence, true_label, predicted_label in zip(sentences, true_labels, predicted_labels):
    print(f"Sentence: {sentence}, true label: {true_label}, predicted labels: {predicted_label}")

NameError: name 'AutoModelForSequenceClassification' is not defined