In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file
# found, one per line, in os.walk traversal order.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q transformers datasets scikit-learn

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import Counter

from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup
)

from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    ConfusionMatrixDisplay
)

In [None]:
from datasets import load_dataset

# Load the dataset by name and keep handles to its "train" and "test" splits.
ds = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")
train_data, test_data = (ds[split_name] for split_name in ("train", "test"))

# Show the first training record to see which fields are available.
print(train_data[0])


In [None]:
# Lookup table {label id -> sentiment name}, later used to decode predictions.
# The two columns are paired position by position; if a label id occurs more
# than once, the name from the last occurrence is the one that is kept.
label_to_text = dict(zip(train_data["label"], train_data["sentiment"]))

print(label_to_text)

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

# Tally the training examples per label id and draw the tallies as bars.
labels = train_data["label"]
counts = Counter(labels)

plt.bar(counts.keys(), counts.values())

# Title and axis captions, applied in the same order as before.
for annotate, caption in (
    (plt.title, "Class Distribution — Training Set"),
    (plt.xlabel, "Class"),
    (plt.ylabel, "Count"),
):
    annotate(caption)

plt.show()

In [None]:
from transformers import BertTokenizer

# Tokenizer for the "bert-base-uncased" checkpoint. It is kept as a
# module-level name because NewsDataset and predict_text both read it.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of raw texts. Entries that are not ``str``
            (for example ``None``) are encoded as the empty string.
        labels: Indexable collection of integer class ids aligned with
            ``texts`` (``labels[i]`` belongs to ``texts[i]``).
        tokenizer: Callable used to encode a text. When ``None`` (the default)
            the module-level ``tokenizer`` is looked up each time an item is
            accessed, which is what this class did before the parameter
            existed.
        max_length: Every item is truncated / padded to exactly this many
            tokens. Defaults to 256.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=256):
        # A silent misalignment would pair texts with the wrong targets (or
        # fail much later with an IndexError inside a DataLoader worker).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the encoded tensors for item ``idx`` plus a ``"labels"`` tensor."""
        raw_text = self.texts[idx]
        text = raw_text if isinstance(raw_text, str) else ""

        # Fall back to the notebook-level tokenizer when none was injected.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer

        encoding = encode(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading axis of size 1 for a single
        # text; drop it so items can be stacked into a batch.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        # CrossEntropyLoss expects int64 class indices, so pin the dtype
        # instead of relying on whatever integer type the label carries.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

In [None]:
def _build_dataset(split):
    """Wrap one split's "text" and "label" columns (as lists) in a NewsDataset."""
    return NewsDataset(list(split["text"]), list(split["label"]))


train_dataset = _build_dataset(train_data)
test_dataset = _build_dataset(test_data)

In [None]:
from torch.utils.data import DataLoader

# Both loaders use the same batch size; only the training loader shuffles.
BATCH_SIZE = 16

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# "balanced" class weights over the sorted unique label ids of the training
# split: entry i of the result is the weight of the i-th smallest label id.
labels = list(train_data["label"])
present_classes = np.unique(labels)
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=present_classes,
    y=labels,
)

# float32 tensor created directly on the selected device.
class_weights = torch.tensor(balanced_weights, dtype=torch.float, device=device)

print("Class weights:", class_weights)

In [None]:
import torch.nn as nn

# Cross-entropy loss with one weight per class taken from class_weights;
# the training loop applies it to the model's logits and the batch labels.
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

In [None]:
from transformers import BertForSequenceClassification

# Number of output classes = number of distinct label ids in the training split.
distinct_label_ids = set(train_data["label"])
num_labels = len(distinct_label_ids)

# Load "bert-base-uncased" configured for that many output classes.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

In [None]:
import torch
from torch.optim import AdamW

# Use CUDA when available, otherwise the CPU, and move the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

model = model.to(device)

# AdamW over every model parameter.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01,
)

In [None]:
from transformers import get_linear_schedule_with_warmup

# The training loop steps the scheduler once per batch, so the schedule
# length is (batches per epoch) x (number of epochs).
epochs = 3
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * epochs

# Linear schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [None]:
# Fine-tuning loop. Per batch: clear gradients, forward pass, weighted loss,
# backward pass, one optimizer step, then one scheduler step. The mean batch
# loss is printed at the end of every epoch.
model.train()

for epoch in range(epochs):
    batch_losses = []

    for batch in train_loader:
        optimizer.zero_grad()

        targets = batch["labels"].to(device)
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        loss = loss_fn(logits, targets)
        loss.backward()

        optimizer.step()
        scheduler.step()

        # .item() detaches the scalar so no graph is kept alive across batches.
        batch_losses.append(loss.item())

    avg_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")

In [None]:
# Run the model over the test loader in evaluation mode, without gradient
# tracking, and collect the argmax prediction and the reference label of
# every example into two flat lists used by the metric cells.
model.eval()

preds, true = [], []

with torch.no_grad():
    for batch in test_loader:
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        preds.extend(logits.argmax(dim=1).cpu().numpy())
        # The labels are only read back as NumPy values, so they are taken
        # from the batch as-is instead of being moved to the device first.
        true.extend(batch["labels"].cpu().numpy())

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Accuracy plus precision / recall / F1 averaged with average="weighted"
# (each class's score weighted by its number of true instances).
# NOTE: for single-label predictions, weighted recall is mathematically the
# same number as accuracy, so those two printed lines will always agree.
acc = accuracy_score(true, preds)
precision, recall, f1, _ = precision_recall_fscore_support(true, preds, average="weighted")

for metric_name, metric_value in (
    ("Accuracy:", acc),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_name, metric_value)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Pin the row/column order to the sorted label ids and show the sentiment
# names from label_to_text instead of bare integers, so the figure can be
# read without cross-referencing another cell.
class_ids = sorted(label_to_text)
cm_display = ConfusionMatrixDisplay.from_predictions(
    true,
    preds,
    labels=class_ids,
    display_labels=[label_to_text[class_id] for class_id in class_ids],
)
cm_display.ax_.set_title("Confusion Matrix — Test Set")
plt.show()

In [None]:
def predict_text(text: str, max_length: int = 256) -> tuple:
    """Classify a single text with the fine-tuned model.

    Reads the module-level ``model``, ``tokenizer``, ``device`` and
    ``label_to_text``.

    Args:
        text: The raw text to classify.
        max_length: Maximum number of tokens kept after truncation. Defaults
            to 256, the length NewsDataset truncates training and test
            examples to. Without it, ``truncation=True`` alone would cut at
            the tokenizer's model maximum, so long inputs would be seen at a
            length the model was never fine-tuned on.

    Returns:
        ``(label_name, confidence)``: the sentiment name looked up in
        ``label_to_text`` for the highest-probability class, and that class's
        softmax probability as a Python float.
    """
    # Switch to evaluation mode in case the model was last left in train mode.
    model.eval()

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # Softmax over the class axis turns logits into probabilities.
        probs = F.softmax(outputs.logits, dim=1)

    confidence, pred = torch.max(probs, dim=1)

    label_name = label_to_text[pred.item()]

    return label_name, float(confidence.item())

In [None]:
# A few hand-written sentences to sanity-check the fine-tuned classifier.
examples = [
    "I absolutely loved this product!",
    "This was the worst experience ever.",
    "It was okay, nothing special.",
    "Amazing performance and great value.",
]

# map() is lazy, so each sentence is classified right before it is printed.
for text, (label, conf) in zip(examples, map(predict_text, examples)):
    print(f"Text: {text}")
    print(f"Predicted Label: {label}, Confidence: {conf:.4f}\n")