In [None]:
import torch
from datasets import load_from_disk

In [None]:
from torch import cuda
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Prefer the GPU when CUDA reports one as available; otherwise fall back to the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Load the dataset previously saved to disk; later cells index it by split name.
ds = load_from_disk(dataset_path="bld/python/TrainTest/TrainTest_data/")

In [None]:
ds

In [None]:
ds["train_dataset"]

In [None]:
# Binarise the labels: every entry above 0.5 becomes 1, everything else 0.
# Assumes `ds` is the DatasetDict loaded earlier in the notebook.


def binarize_labels(example):
    """Return the example's ``label`` list thresholded to 0/1 integers.

    Args:
        example: A single dataset row whose ``"label"`` field is an iterable
            of numeric scores.

    Returns:
        A dict with one key, ``"label"``, holding 1 for every score strictly
        greater than 0.5 and 0 otherwise.
    """
    return {"label": [1 if score > 0.5 else 0 for score in example["label"]]}


# Apply the same mapping to each split instead of repeating the call three times.
for split_name in ("train_dataset", "val_dataset", "test_dataset"):
    ds[split_name] = ds[split_name].map(binarize_labels)

## Main Part

Fine-tuning
There are two ways we can implement multi-label classification:

- Creating a custom BERT model that overrides the forward method
- Creating a custom Trainer that overrides the compute_loss method

In [None]:
# Checkpoint name; reused further down when the classification model is loaded.
model_ckpt = "distilbert-base-uncased"
# Tokenizer loaded from the same checkpoint; the tokenization cell relies on it.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# alternative tokenizer


def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        The tokenizer output for the batch, with padding and truncation
        enabled.
    """
    return tokenizer(batch["text"], padding=True, truncation=True)


# batch_size=None passes each split to `tokenize` as one single batch.
ds_encoded = ds.map(tokenize, batched=True, batch_size=None)
# One format call only: the earlier column-restricted set_format was
# immediately overridden by this bare call, so it never had any effect.
# Formatting all columns also keeps the original `text` column visible.
ds_encoded.set_format("torch")
ds_encoded

In [None]:
# def tokenize_and_encode(examples):

Why is the text data gone? It should stay.

In [None]:
ds

In [None]:
# error could be here
# ds_enc

In [None]:
ds_encoded

In [None]:
ds_encoded["train_dataset"][0]

In [None]:
# logit model
class MultilabelTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits, for multi-label classification."""

    def compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
        num_items_in_batch=None,
    ):
        """Compute the multi-label loss for one batch.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch mapping produced by the data collator. The targets
                are removed from it before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword can call the override; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``
            is true.

        Raises:
            KeyError: If the batch contains neither ``"labels"`` nor
                ``"label"``.
        """
        # The data collator renames the dataset's "label" column to "labels",
        # so popping "label" raised KeyError. Fall back to "label" only for
        # collators that leave the column name untouched.
        label_key = "labels" if "labels" in inputs else "label"
        labels = inputs.pop(label_key)

        outputs = model(**inputs)
        logits = outputs.logits

        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # BCEWithLogitsLoss needs float targets with the same shape as the logits.
        loss = loss_fct(
            logits.view(-1, num_labels),
            labels.float().view(-1, num_labels),
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
# Probabilistic Trainer

# class ProbabilisticTrainer(Trainer):
#    def compute_loss(self, model, inputs, return_outputs=False):

# Ensure labels are tensors (if not already)
#        if not isinstance(labels, torch.Tensor):

# Calculate Mean Squared Error (MSE) loss

In [None]:
# Load the checkpoint with a 3-output classification head, then move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=3,
)
model = model.to(device)

In [None]:
# not important first
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):
    """Return the fraction of entries where the thresholded prediction matches the target.

    Args:
        y_pred: NumPy array of predictions.
        y_true: NumPy array of targets; converted to booleans for comparison.
        thresh: A prediction counts as positive when strictly above this value.
        sigmoid: When true, apply a sigmoid to ``y_pred`` before thresholding.

    Returns:
        The mean element-wise agreement as a Python float.
    """
    scores = torch.from_numpy(y_pred)
    targets = torch.from_numpy(y_true)
    if sigmoid:
        scores = torch.sigmoid(scores)
    matches = (scores > thresh) == targets.bool()
    return matches.float().mean().item()

In [None]:
# not important first
def compute_metrics(eval_pred):
    """Compute the thresholded accuracy for an evaluation result.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``.

    Returns:
        A dict with the single key ``"accuracy_thresh"``.
    """
    predictions, labels = eval_pred
    # Pass the unpacked `labels`; the previous code referenced the undefined
    # name `label`, which raised NameError.
    return {"accuracy_thresh": accuracy_thresh(predictions, labels)}

In [None]:
batch_size = 8

# Training configuration: a single epoch, with an evaluation pass per epoch.
args = TrainingArguments(
    output_dir="jigsaw",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
)

In [None]:
# Wire the custom trainer to the encoded training and validation splits.
multi_trainer = MultilabelTrainer(
    model=model,
    args=args,
    train_dataset=ds_encoded["train_dataset"],
    eval_dataset=ds_encoded["val_dataset"],
    tokenizer=tokenizer,
)

In [None]:
ds_encoded["val_dataset"]

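Apparently the label is not found in the data? The column does exist in the dataset, but the Trainer's data collator renames `label` to `labels` before the batch reaches `compute_loss`, so the key to look up inside the batch is `labels`.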

In [None]:
multi_trainer.evaluate()

In [None]:
# there must be something wrong about the datastructure

In [None]:
multi_trainer.train()

my resource: https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb

# Error message

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[30], line 1
----> 1 multi_trainer.evaluate()

File c:\Users\norma\.conda\envs\EN\Lib\site-packages\transformers\trainer.py:2934, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
   2931 start_time = time.time()
   2933 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 2934 output = eval_loop(
   2935     eval_dataloader,
   2936     description="Evaluation",
   2937     # No point gathering the predictions if there are no metrics, otherwise we defer to
   2938     # self.args.prediction_loss_only
   2939     prediction_loss_only=True if self.compute_metrics is None else None,
   2940     ignore_keys=ignore_keys,
   2941     metric_key_prefix=metric_key_prefix,
   2942 )
   2944 total_batch_size = self.args.eval_batch_size * self.args.world_size
   2945 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:

File c:\Users\norma\.conda\envs\EN\Lib\site-packages\transformers\trainer.py:3123, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   3120         batch_size = observed_batch_size
   3122 # Prediction step
-> 3123 loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
   3124 inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None
...
--> 241     return self.data[item]
    242 elif self._encodings is not None:
    243     return self._encodings[item]

KeyError: 'label'
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...

# Other Approach

In [None]:
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import SequenceClassifierOutput

In [None]:
class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier that computes a BCE-with-logits loss over its labels."""

    def __init__(self, config):
        super().__init__(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder and classification head, optionally computing the loss.

        All encoder-related arguments are forwarded unchanged to ``self.bert``.

        Args:
            labels: Optional targets. When given, they are cast to float and
                reshaped to ``(-1, num_labels)`` for the BCE-with-logits loss.
            return_dict: When ``None``, falls back to
                ``self.config.use_return_dict``.

        Returns:
            A ``SequenceClassifierOutput`` when ``return_dict`` is truthy;
            otherwise a tuple of the logits and the remaining encoder outputs,
            with the loss prepended when labels were given.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # The second element of the encoder output is taken as the pooled representation.
        pooled = self.dropout(encoder_out[1])
        logits = self.classifier(pooled)

        if labels is None:
            loss = None
        else:
            criterion = torch.nn.BCEWithLogitsLoss()
            flat_logits = logits.view(-1, self.num_labels)
            flat_targets = labels.float().view(-1, self.num_labels)
            loss = criterion(flat_logits, flat_targets)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_out.hidden_states,
                attentions=encoder_out.attentions,
            )

        # Tuple form: logits followed by whatever the encoder returned after index 1.
        tail = (logits,) + encoder_out[2:]
        if loss is None:
            return tail
        return (loss, *tail)

In [None]:
num_labels = 3
# BertForMultilabelSequenceClassification builds a BERT encoder (`self.bert`),
# so it has to be initialised from a BERT checkpoint. `model_ckpt` names a
# DistilBERT checkpoint, whose weights do not match this architecture and
# would leave the encoder without its pretrained weights.
# NOTE(review): bert-base-uncased is expected to share the DistilBERT uncased
# vocabulary, so the already tokenized `ds_encoded` stays usable - confirm.
bert_ckpt = "bert-base-uncased"
model = BertForMultilabelSequenceClassification.from_pretrained(
    bert_ckpt,
    num_labels=num_labels,
).to(device)

In [None]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):
    """Share of entries whose thresholded prediction equals the boolean target.

    Args:
        y_pred: NumPy array of predictions.
        y_true: NumPy array of targets, interpreted as booleans.
        thresh: Cut-off; a prediction is positive when strictly greater.
        sigmoid: Whether to pass ``y_pred`` through a sigmoid first.

    Returns:
        Mean agreement between predictions and targets as a Python float.
    """
    predicted = torch.from_numpy(y_pred)
    expected = torch.from_numpy(y_true).bool()
    if sigmoid:
        predicted = predicted.sigmoid()
    is_positive = predicted > thresh
    agreement = (is_positive == expected).float()
    return agreement.mean().item()

In [None]:
def compute_metrics(eval_pred):
    """Report the thresholded accuracy for a ``(predictions, labels)`` pair."""
    preds, targets = eval_pred
    score = accuracy_thresh(preds, targets)
    return {"accuracy_thresh": score}

In [None]:
batch_size = 8

# Same training setup as before: one epoch, evaluated once per epoch.
args = TrainingArguments(
    output_dir="jigsaw",
    evaluation_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [None]:
# Standard Trainer; the loss comes from the model's own forward pass.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_encoded["train_dataset"],
    eval_dataset=ds_encoded["val_dataset"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.evaluate()

In [None]:
trainer.train()