# **Classification**

In this notebook, we fine-tune an [XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#xlm-roberta) model for news genre classification: opinion, reporting, or satire. Afterwards, the model is evaluated using macro-precision, macro-recall, and macro-F1, and the confusion matrix is plotted.

XLM-RoBERTa is a multilingual masked language model trained on 2.5TB of filtered CommonCrawl data across 100 languages. It does not need to be told which language a text is written in: all languages share a single SentencePiece vocabulary and the model infers the language from the input IDs, so it can be used for classification without any additional language-specific setup.

In [None]:
!pip install --upgrade transformers accelerate bitsandbytes

In [None]:
!wget https://raw.githubusercontent.com/VladWero08/mt-pattern-preserve/refs/heads/main/data/articles_en.csv -O articles_en.csv

In [None]:
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from collections import Counter
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification, BitsAndBytesConfig, TrainingArguments, Trainer
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

## **Model**

In [None]:
# Label vocabulary: every mapping below is derived from `class_names`, whose
# order defines the class ids (opinion=0, reporting=1, satire=2).
class_names = ["opinion", "reporting", "satire"]
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}
# One-hot target vector per genre, e.g. "reporting" -> [0, 1, 0].
label2onehot = {
    name: [int(position == idx) for position in range(len(class_names))]
    for name, idx in label2id.items()
}

# Tokenizer and model come from the same pretrained checkpoint.
model_id = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The sequence-classification model gets one output per genre. The
# "multi_label_classification" problem type matches the float one-hot targets
# produced via `label2onehot`.
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_id,
    problem_type="multi_label_classification",
    num_labels=len(id2label),
)

# Store the human-readable label names in the model config.
model.config.label2id = label2id
model.config.id2label = id2label


In [None]:
# Smoke test: push one sentence through the model before any fine-tuning.
# The model has not been trained on the genre task at this point, so the
# printed label only shows that the pipeline runs end to end.
text = "This is a sports article. Manchester United won 3-0 last night against Real Madrid."
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
  logits = model(**inputs).logits

# Single input sentence, so the argmax over all logits is the predicted class id.
prediction_id = int(torch.argmax(logits))
prediction_label = model.config.id2label[prediction_id]

print(f"{text} --- prediction --> {prediction_label}")

## **Dataset**

In [None]:
# Load the downloaded articles; later cells read its "full_articles" and "genre" columns.
articles = pd.read_csv(filepath_or_buffer="articles_en.csv")

In [None]:
# Inverse-frequency ("balanced") class weights: n_samples / (n_classes * n_c).
# The previous expression `len(articles) / 3 * count` evaluated to
# (n_samples / 3) * count, which gives the MOST frequent genre the largest weight.
#
# `value_counts()` sorts by frequency, so the counts are re-ordered to follow
# `class_names`; that way weight i belongs to class id i.
genre_counts = articles["genre"].value_counts().reindex(class_names, fill_value=0)
assert (genre_counts > 0).all(), (
    f"genres without any article: {genre_counts[genre_counts == 0].index.tolist()}"
)

class_weights = torch.tensor(
    (len(articles) / (len(class_names) * genre_counts)).to_numpy(),
    dtype=torch.float32,
)

In [None]:
class NewsGenreDataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes all articles once, at construction time.

  The module-level `tokenizer` is used for encoding. Each item is a dict
  holding one row of every tokenizer output plus the matching row of the
  label tensor under the key "labels".
  """

  def __init__(self, articles: list[str], labels: list[int]) -> None:
    """Tokenize `articles` and store `labels` as a float32 tensor.

    Args:
      articles: raw article texts, one string per sample.
      labels: label data for each article, in the same order as `articles`
        (this notebook passes one one-hot vector per article).
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    # A single batched call, so every sample is padded to a common length.
    self.encodings = tokenizer(articles, **tokenizer_options)
    self.labels = torch.tensor(labels, dtype=torch.float32)

  def __getitem__(self, idx: int) -> dict:
    """Return the encoded fields and the label stored at position `idx`."""
    return {
        **{field: rows[idx] for field, rows in self.encodings.items()},
        "labels": self.labels[idx],
    }

  def __len__(self) -> int:
    """Number of samples, i.e. the number of rows in the label tensor."""
    return len(self.labels)

In [None]:
# Build the full dataset: raw article texts plus one one-hot vector per genre.
dataset = NewsGenreDataset(
    articles["full_articles"].tolist(),
    list(map(label2onehot.__getitem__, articles["genre"])),
)

In [None]:
# Hold out 10% of the sample indices for validation (fixed seed for a
# reproducible split), then wrap both index lists as views over `dataset`.
train_idxs, validation_idxs = train_test_split(
    range(len(dataset)),
    test_size=0.1,
    random_state=42,
)
train_dataset, validation_dataset = (
    torch.utils.data.Subset(dataset, split_idxs)
    for split_idxs in (train_idxs, validation_idxs)
)

## **Training**

In [None]:
class ClassWeightedTrainer(Trainer):
    """`Trainer` whose training loss is a class-weighted cross-entropy.

    The per-class weights are read from the module-level `class_weights`
    tensor, which must hold one weight per class id.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; must contain "labels", given either as one
                class index per sample or as one one-hot row per sample.
            return_outputs: when True, return `(loss, outputs)` instead of
                only the loss.
            num_items_in_batch: accepted because recent `Trainer` versions
                pass it as a keyword argument; it is not used here.

        Returns:
            The scalar loss, or `(loss, outputs)` if `return_outputs` is True.
        """
        labels = inputs.get("labels")
        # Forward pass without the labels: the loss is computed below, so the
        # model's own built-in loss is not needed (and `inputs` is not mutated).
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)

        num_labels = self.model.config.num_labels
        logits = outputs.get("logits").view(-1, num_labels)

        # Flattening one-hot rows with `.view(-1)` would yield batch * num_labels
        # float targets; turn them into one class index per sample instead.
        if labels.dim() > 1:
            targets = labels.view(-1, num_labels).argmax(dim=-1)
        else:
            targets = labels.view(-1).long()

        # The weights are created on the CPU; move them next to the logits.
        weights = class_weights.to(device=logits.device, dtype=torch.float32)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.float(), targets)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Hyper-parameters for fine-tuning; everything not listed keeps the
# TrainingArguments default.
training_args = TrainingArguments(
    **{
        "learning_rate": 2e-5,
        "weight_decay": 1e-4,
        "num_train_epochs": 3,
        "per_device_train_batch_size": 10,
        "per_device_eval_batch_size": 10,
    }
)

In [None]:
# Trainer using the class-weighted loss on the full train/validation split.
# `processing_class` replaces the deprecated `tokenizer` argument of `Trainer`.
trainer = ClassWeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    processing_class=tokenizer,
)

In [None]:
# Fine-tune on the full training split and evaluate on the held-out validation
# split. This cell previously trained on only 5 samples and evaluated on 2
# training samples (a leftover smoke test), so the metrics reported in the
# evaluation section described an essentially untrained model.
#
# NOTE(review): this rebinds `trainer`, replacing the ClassWeightedTrainer
# created in the previous cell, so training here uses the loss returned by the
# model itself rather than the class-weighted one. Confirm which is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer` argument
)
trainer.train()

## **Evaluation**

In [None]:
# Run the model over the validation split; `.predictions` holds the logits.
val_predictions = trainer.predict(validation_dataset)
# Predicted class id = index of the largest logit in each row.
val_predictions_labels = torch.tensor(val_predictions.predictions).argmax(dim=1)
# True class id = position of the 1 in each one-hot label row of the validation samples.
val_true_labels = dataset[validation_dataset.indices]["labels"].argmax(dim=1)

In [None]:
# Macro-averaged metrics: each class contributes equally, whatever its size.
precision = precision_score(val_true_labels, val_predictions_labels, average="macro")
recall = recall_score(val_true_labels, val_predictions_labels, average="macro")
f1 = f1_score(val_true_labels, val_predictions_labels, average="macro")

# Pin the label set so the matrix always has one row/column per entry of
# `class_names`, in class-id order. Without `labels=`, a class that appears in
# neither the true nor the predicted labels is dropped and the matrix no longer
# lines up with the tick labels used when plotting it.
cm = confusion_matrix(
    val_true_labels,
    val_predictions_labels,
    labels=list(range(len(class_names))),
)


In [None]:
# Print the macro-averaged scores, four decimals each, under a short header.
report_lines = [
    "Metrics - average = macro",
    "-------------------------",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1: {f1:.4f}",
]
print("\n".join(report_lines))

In [None]:
# Confusion matrix heatmap: rows are true genres, columns are predicted genres.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set(xlabel="Predicted Label", ylabel="True Label", title="Confusion Matrix")
plt.show()