<a href="https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning BERT (and friends) for multi-label text classification

In this notebook, we are going to fine-tune BERT to predict one or more labels for a given piece of text. 

In [1]:
unique_labels = ["Cung cấp thông tin", "Tương tác", "Hỏi thông tin giao hàng", "Hỗ trợ, hướng dẫn", "Yêu cầu", "Phản hồi", "Sự vụ", "UNKNOWN"]


In [None]:
import json
from typing import Mapping, Tuple

import numpy as np
import torch
from transformers import AutoTokenizer

class Dataset:
    def __init__(self, json_file: str, tokenizer) -> None:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        self.data = data
        unique_labels = ["Cung cấp thông tin", "Tương tác", "Hỏi thông tin giao hàng", "Hỗ trợ, hướng dẫn", "Yêu cầu", "Phản hồi", "Sự vụ"]
        self.label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
        self.sep_token = tokenizer.sep_token

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, index: int) -> Tuple[str, int]:
        item = self.data[index]
        history = item["history"]
        current_message = item["current_message"]
        labels = item["label_intent"]

        if history:
            history_text = self.sep_token.join(history)
            context = f"<history>{history_text}</history><current>{current_message}</current>"
        else:
            context = f"<current>{current_message}</current>"

        label_vector = [0] * len(self.label_mapping)
        for label in labels:
            if label in self.label_mapping:
                label_vector[self.label_mapping[label]] = 1

        return context, label_vector

class LlmDataCollator:
    def __init__(self, tokenizer: AutoTokenizer, max_length: int) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch: list) -> Mapping[str, torch.Tensor]:
        contexts, labels = zip(*batch)

        contexts_tensor = self.tokenizer(
            contexts,
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )

        label_tensor = torch.tensor(np.array(labels), dtype=torch.float)

        return {
            "input_ids": contexts_tensor["input_ids"],
            "attention_mask": contexts_tensor["attention_mask"],
            "labels": label_tensor,
        }

In [None]:
train_data = "dataset/train.json"
val_data = "dataset/val.json"
test_data = "dataset/test.json"


In [None]:
def get_tokenizer(checkpoint: str) -> AutoTokenizer:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenizer.add_special_tokens(
        {'additional_special_tokens': ['<history>', '</history>', '<current>', '</current>']}
    )
    return tokenizer

In [None]:
model_path = "vinai/phobert-base-v2"

In [17]:
tokenizer = get_tokenizer(model_path)
train_set = Dataset(json_file=train_data, tokenizer=tokenizer)
valid_set = Dataset(json_file=val_data, tokenizer=tokenizer)
test_set = Dataset(json_file=test_data, tokenizer=tokenizer)

collator = LlmDataCollator(tokenizer=tokenizer, max_length=256)

In [7]:
id2label = {idx: label for idx, label in enumerate(unique_labels)}
label2id = {label: idx for idx, label in enumerate(unique_labels)}

In [None]:
from transformers import AutoModelForSequenceClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(unique_labels),
                                                           id2label=id2label,
                                                           label2id=label2id,
                                                           ignore_mismatched_sizes=True)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at clapAI/roberta-large-multilingual-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([7, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
batch_size = 8
metric_name = "f1"

In [11]:
import wandb

# Đăng nhập bằng cách nhập API key
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jovyan/.netrc


True

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned",
    run_name="finetune-sem-eval-english-v1",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)



We are also going to compute metrics while training. For this, we need to define a `compute_metrics` function, that returns a dictionary with the desired metric values.

In [12]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [20]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_set,
    eval_dataset=valid_set,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics
)
trainer.train()

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.373098,0.301754,0.586588,0.141176
2,No log,0.342894,0.471178,0.668363,0.3
3,0.363500,0.273281,0.683603,0.799781,0.458824
4,0.363500,0.277648,0.702461,0.817375,0.488235
5,0.363500,0.265737,0.717241,0.82082,0.523529


TrainOutput(global_step=860, training_loss=0.31067572305368824, metrics={'train_runtime': 427.5146, 'train_samples_per_second': 16.023, 'train_steps_per_second': 2.012, 'total_flos': 962133456601296.0, 'train_loss': 0.31067572305368824, 'epoch': 5.0})

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import HfApi
username = "Trongdz"
MODEL_NAME = "bert-multi-intent-classification"
api = HfApi(token="Your-token")
api.create_repo(
    repo_id = f"{username}/{MODEL_NAME}",
    repo_type="model"
)
api.upload_folder(
    repo_id = f"{username}/{MODEL_NAME}",
    folder_path = "bert-finetuned",
    repo_type="model"
)

In [None]:
import time
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import precision_score, recall_score
from torch.utils.data import DataLoader
import json

class Tester:
    def __init__(
        self,
        model: torch.nn.Module,
        test_loader: DataLoader,
        output_file: str,
    ) -> None:
        self.test_loader = test_loader
        self.output_file = output_file

        self.loss_fn = nn.BCEWithLogitsLoss()

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)

    def evaluate(self):
        self.model.eval()
        latencies = []
        all_labels = []
        all_preds = []
        total_loss = 0
        results = []

        start_time = time.time()    #throughput
        with torch.no_grad():
            for batch in self.test_loader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                batch_start_time = time.time()
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                )
                logits = outputs.logits
                batch_end_time = time.time()
                latency = batch_end_time - batch_start_time
                latencies.append(latency)

                loss = self.loss_fn(logits, labels)
                total_loss += loss.item()

                preds = (torch.sigmoid(logits) > 0.5).float().cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())

                for i in range(len(input_ids)):
                    true_label_names = self._map_labels(labels.cpu().numpy()[i], self.model.config.id2label)
                    predicted_label_names = self._map_labels(preds[i], self.model.config.id2label)
                    results.append({
                        "true_labels": true_label_names,
                        "predicted_labels": predicted_label_names,
                        "latency": float(latency),
                    })
        total_time = time.time() - start_time
        num_samples = len(results)


        with open(self.output_file, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)
        print(f"Results saved to {self.output_file}")

        self.score(all_labels, all_preds, results)
        self.calculate_latency(latencies)

        throughput = num_samples / total_time
        print(f"num samples: {num_samples}")
        print(f"Throughput: {throughput:.2f} samples/s")

    def _map_labels(self, label_indices: list, labels_mapping: dict) -> list:
        """
        Map label indices to their corresponding names.

        Parameters:
            label_indices: List of binary labels (0 or 1).
            labels_mapping: Dictionary mapping indices to label names.

        Returns:
            List of label names.
        """
        return [labels_mapping[idx] for idx, val in enumerate(label_indices) if val == 1.0]


    def score(self, label: list, predict: list, output: list) -> None:

        precision = precision_score(label, predict, average="weighted", zero_division=0)
        recall = recall_score(label, predict, average="weighted", zero_division=0)
        f1_score = 2 * (precision * recall) / (precision + recall)
        accuracy = self._accuracy(output)

        print(f"Accuracy: {accuracy * 100:.2f}")
        print(f"Precision: {precision * 100:.2f}")
        print(f"Recall: {recall * 100:.2f}")
        print(f"F1 score: {f1_score * 100:.2f}")

    def calculate_latency(self, latencies: list) -> None:
        p99_latency = np.percentile(latencies, 99)
        print(f"P99 Latency: {p99_latency * 1000:.2f} ms")


    def _accuracy(self, output_data: list) -> float:
        """
        Calculate accuracy for multi-label predictions where a sample is correct
        if at least one predicted label matches the true labels.

        Parameters:
            output_data (list): List of dictionaries containing `true_labels` and `predicted_labels`.

        Returns:
            float: Accuracy score.
        """

        correct = 0
        total = len(output_data)

        for sample in output_data:
            true_labels = set(sample["true_labels"])
            predicted_labels = set(sample["predicted_labels"])

            if true_labels & predicted_labels:  # Giao của true_labels và predicted_labels không rỗng
                correct += 1

        return correct / total if total > 0 else 0.0


In [None]:
MODEL = "bert-finetuned/checkpoint-860"
output_file = "output.json"
tuned_model = AutoModelForSequenceClassification.from_pretrained(MODEL)

test_loader = DataLoader(test_set, batch_size=8, shuffle=False, collate_fn=collator)

tester = Tester(model=tuned_model, test_loader=test_loader, output_file=output_file)

tester.evaluate()

Results saved to output.json
Accuracy: 77.33
Precision: 77.29
Recall: 64.89
F1 score: 70.55
P99 Latency: 12.13 ms
num samples: 172
Throughput: 284.93 samples/s
