In [None]:
# Install required libraries
!pip install transformers datasets torch accelerate evaluate

In [None]:
# Load the bank customer-review dataset from the Hugging Face Hub
from datasets import load_dataset

ds = load_dataset("UniqueData/customers-reviews-on-banks")
ds  # last expression of the cell: display the loaded object

In [None]:
# Create sentiment labels from star ratings.
# Start from a pandas DataFrame built from the "train" split.
df = ds["train"].to_pandas()

def map_sentiment(rating):
    """Map a 1-5 star rating to a sentiment name.

    Args:
        rating: Star rating. 1-2 -> "Negative", 3 -> "Neutral",
            4-5 -> "Positive".

    Returns:
        One of "Negative", "Neutral" or "Positive".

    Raises:
        ValueError: If ``rating`` is missing (None/NaN) or not one of 1-5.
            A catch-all ``else`` would silently label such rows "Positive".
    """
    if rating in (1, 2):
        return "Negative"
    if rating == 3:
        return "Neutral"
    if rating in (4, 5):
        return "Positive"
    # Fail loudly instead of inventing a label for bad data
    raise ValueError(f"Unexpected star rating: {rating!r}")
# Derive the sentiment name of every review from its star rating
df["sentiment"] = df["star"].map(map_sentiment)

df.head()

In [None]:
# Encode each sentiment name as an integer class id (its position in the tuple)
label_map = {name: idx for idx, name in enumerate(("Negative", "Neutral", "Positive"))}
df["label"] = df["sentiment"].map(label_map)

df.loc[:, ["star", "sentiment", "label"]].head()

In [None]:
# Check mapping: percentage share of each sentiment name and of each numeric label.
# A notebook cell only renders its final expression, so the first table is shown
# explicitly with display(); otherwise it is computed and thrown away.
display(df["sentiment"].value_counts(normalize=True) * 100)
df["label"].value_counts(normalize=True) * 100

In [None]:
# Balanced per-class weights to offset label imbalance
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.asarray((0, 1, 2))
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=df["label"])

class_weights

In [None]:
# Convert the DataFrame (which by now carries the added "sentiment" and
# "label" columns) back into a Hugging Face Dataset
from datasets import Dataset

hf_dataset = Dataset.from_pandas(df)

hf_dataset  # display the converted dataset

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_valid = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_ds, valid_ds = train_valid["train"], train_valid["test"]

(train_ds, valid_ds)

In [None]:
# Tokenize for BERT
from transformers import BertTokenizerFast

# Name of the pretrained checkpoint whose tokenizer is loaded
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Token length that tokenize() pads/truncates every review to
max_length = 128

def tokenize(batch):
    """Tokenize a batch of reviews and attach their integer labels.

    Args:
        batch: Mapping with a "text" sequence and a "label" sequence.

    Returns:
        The tokenizer output for the batch, with an added "labels" entry.
    """
    # Missing reviews become empty strings; everything else is coerced to str
    cleaned_texts = [str(text) if text is not None else "" for text in batch["text"]]
    int_labels = list(map(int, batch["label"]))

    # Pad/truncate every review to exactly `max_length` tokens
    encoded = tokenizer(
        cleaned_texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    encoded["labels"] = int_labels
    return encoded

# Tokenize both splits batch-wise
tokenized_train = train_ds.map(tokenize, batched=True)
tokenized_valid = valid_ds.map(tokenize, batched=True)

# Expose only the model inputs, as torch tensors
model_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (tokenized_train, tokenized_valid):
    tokenized_split.set_format(type="torch", columns=model_columns)

In [None]:
# Compute balanced class weights for the class-weighted loss.
# Only the training split is used, so the validation labels do not influence
# how the training loss is weighted.
from sklearn.utils.class_weight import compute_class_weight

classes = np.array([0, 1, 2])  # Negative, Neutral, Positive

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=np.array(list(train_ds["label"])),
)

print("Class weights:", dict(zip(classes, class_weights)))

In [None]:

import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support

num_labels = 3
model_name = "bert-base-uncased"

# Pretrained BERT encoder with a 3-way classification head
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Use the GPU when one is available and keep the class-weight tensor on the
# same device as the model
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
cw = torch.tensor(class_weights, device=device, dtype=torch.float32)

# ---- metrics: precision, recall, F1 (macro) ----
def compute_metrics(eval_pred):
    """Compute macro-averaged precision, recall and F1.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys "precision_macro", "recall_macro" and "f1_macro".
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)

    # The call returns (precision, recall, f1, support); zip() stops after
    # the three named metrics, so support is dropped
    scores = precision_recall_fscore_support(
        labels, predicted, average="macro", zero_division=0
    )
    metric_names = ("precision_macro", "recall_macro", "f1_macro")
    return dict(zip(metric_names, scores))

# ---- custom Trainer that applies class-weighted loss ----
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted per class.

    The per-class weights are read from the notebook-level tensor ``cw``.
    """

    def compute_loss(self, model, inputs, return_outputs=False,
                     num_items_in_batch=None, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping; must contain a "labels" entry.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because recent Trainer versions pass
                it as a keyword argument; not used here.
            **kwargs: Further keywords a Trainer may pass; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # Run the forward pass without labels so the model does not also
        # compute its own (unweighted) loss that would be thrown away
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Keep the weights on whatever device the logits live on
        loss_fct = torch.nn.CrossEntropyLoss(weight=cw.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# ---- training args ----
batch_size = 16

training_args = TrainingArguments(
    output_dir="./bert-banks-sentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch so the best checkpoint
    # (highest macro F1) can be restored when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Same as the later training cell: no external experiment trackers
    # and no upload to the Hub
    report_to="none",
    push_to_hub=False,
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support

num_labels = 3
model_name = 'bert-base-uncased'

# Pretrained BERT with a classification head sized for the three sentiments
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Model and class weights share one device: CUDA when present, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
cw = torch.tensor(class_weights, dtype=torch.float32, device=device)

# Metrics: precision, recall, F1 (macro)
def compute_metrics(eval_pred):
    """Score predictions with macro-averaged precision, recall and F1.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys "precision_macro", "recall_macro" and "f1_macro".
    """
    logits, labels = eval_pred

    # Predicted class = index of the largest logit along the last axis
    macro_scores = precision_recall_fscore_support(
        labels,
        logits.argmax(axis=-1),
        average="macro",
        zero_division=0,
    )

    return {
        "precision_macro": macro_scores[0],
        "recall_macro": macro_scores[1],
        "f1_macro": macro_scores[2],
    }

# Custom trainer that applies class-weighted loss
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted per class.

    The per-class weights are read from the notebook-level tensor ``cw``.
    """

    def compute_loss(self,
                     model,
                     inputs,
                     return_outputs=False,
                     num_items_in_batch=None,
                     **kwargs
    ):
        """Return the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Batch mapping; must contain a "labels" entry.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with the keyword
                the Trainer passes; not used here.
            **kwargs: Further keywords a Trainer may pass; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        # Run the forward pass without labels so the model does not also
        # compute its own (unweighted) loss that would be thrown away
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # Keep the weights on whatever device the logits live on
        loss_fct = torch.nn.CrossEntropyLoss(weight=cw.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# Training args
batch_size = 16

training_args = TrainingArguments(
    # Output and reporting
    output_dir="./bert-banks-sentiment",
    logging_steps=100,
    report_to="none",
    push_to_hub=False,
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint every epoch; reload the best macro-F1 checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

trainer = WeightedTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

In [None]:
# Fine-tune the model, then show the evaluation metrics
train_output = trainer.train()
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
from sklearn.metrics import classification_report, f1_score

# Predict on the validation split and take the highest-scoring class per row
preds = trainer.predict(tokenized_valid)
y_pred = preds.predictions.argmax(axis=-1)
y_true = preds.label_ids

# Macro F1 headline number
macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
print("Macro F1:", macro_f1)

# Full report. `labels` pins the rows to class ids 0/1/2 so the three names
# always line up with the right class, even if one class is absent from both
# the true and the predicted labels (which would otherwise raise an error).
print(
    classification_report(
        y_true,
        y_pred,
        labels=[0, 1, 2],
        target_names=["Negative", "Neutral", "Positive"],
        zero_division=0
    )
)

In [None]:
!pip install seaborn
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Define class labels
labels = ['Negative', 'Neutral', 'Positive']

sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    annot_kws={"size": 11, "weight": "bold"},
    linewidths=0.7,
    square=True,
    cbar_kws={"shrink": 0.8, "label": "Number of Reviews"}
)

# Define title, labels, ticks and cbar
plt.title("Confusion Matrix - BERT", fontsize=14, weight='bold', pad=35)
plt.xlabel("Predicted", fontsize=12, labelpad =12)
plt.ylabel("Actual", fontsize=12, labelpad=12, rotation=0, ha='right')
plt.xticks(fontsize=11)
plt.yticks(rotation=0, fontsize=11)


# Make colourbar label horizontal
cbar = plt.gcf().axes[-1]
cbar.set_ylabel("Number of Reviews", rotation=0, labelpad=60, fontsize=11)
cbar.yaxis.set_label_position('right')

plt.gca().set_facecolor("#F7F9FB")
plt.tight_layout()
plt.show()