In [None]:
import random

import torch

# Single seed shared by every random number generator seeded in this notebook.
seed = 42

# Seed Python's RNG, then torch on CPU, then torch on all CUDA devices.
for seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

- 'eval_loss': 0.6905103921890259: This is the evaluation loss. It represents a measure of how well the model's predictions match the true labels in the validation dataset. Lower values indicate better performance.

- 'eval_accuracy_thresh': 0.46666666865348816: This is the thresholded accuracy computed by `accuracy_thresh`; it is not itself a threshold. The logits are passed through a sigmoid, each label is predicted positive when its probability exceeds the fixed threshold of 0.5, and the metric is the fraction of individual label predictions that match the true labels. In this case, about 46.7% of the label predictions are correct.

- 'eval_runtime': 0.3652: This is the time it took to perform the evaluation in seconds. It measures how long it took to evaluate the model on the validation dataset.

- 'eval_samples_per_second': 27.384: This is the number of validation samples processed per second during evaluation. It indicates the evaluation speed.

- 'eval_steps_per_second': 5.477: This is the number of evaluation steps (batches) processed per second. In some cases, evaluation is performed in batches to make it more memory-efficient and faster.

In [None]:
import numpy as np
import torch
from datasets import load_from_disk

In [None]:
from torch import cuda
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

device = "cuda" if cuda.is_available() else "cpu"

In [None]:
ds = load_from_disk("bld/python/TrainTest/TrainTest_data/")

In [None]:
ds

In [None]:
ds["train_dataset"][0]

In [None]:
def zero_one_translation(dataset):
    """Binarise each example's ``label`` list: values above 0.5 become 1, all others 0."""

    def binarise(example):
        return {"label": [int(value > 0.5) for value in example["label"]]}

    return dataset.map(binarise)

In [None]:
# Binarise the labels (0/1) in every split of the DatasetDict `ds`.
# Rationale: hard 0/1 targets make it harder for the model to predict the extreme cases. A
# continuous target would do better on the not-so-hard cases, but that seems less interesting;
# the zero-shot labels are also assumed to be correct for this model.
# The helper is reused so the thresholding rule lives in exactly one place.
for split in ("train_dataset", "val_dataset", "test_dataset"):
    ds[split] = zero_one_translation(ds[split])

In [None]:
ds

## Main Part

Fine-tuning
There are two ways we can implement multi-label classification:

- Creating a custom BERT model that overrides the forward method
- Creating a custom Trainer that overrides the compute_loss method

In [None]:
# The custom model defined later in this notebook subclasses BertForSequenceClassification,
# so the checkpoint has to be a BERT checkpoint. Loading "distilbert-base-uncased" into a BERT
# class matches none of the stored weight names, which leaves the whole encoder randomly
# initialised (transformers only emits a warning about the model-type mismatch).
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# alternative tokenizer


def tokenize(batch):
    """Tokenize the ``sequence`` entries of ``batch`` with padding and truncation enabled."""
    texts = batch["sequence"]
    return tokenizer(texts, truncation=True, padding=True)


# batch_size=None hands each split to `tokenize` as a single batch, so `padding=True`
# pads every example of a split to the same length.
ds_encoded = ds.map(tokenize, batched=True, batch_size=None)
# Return PyTorch tensors for every column. A column-restricted set_format call used to precede
# this one but was immediately overridden by it, so the dead call has been removed.
ds_encoded.set_format("torch")
ds_encoded

my resource: https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb

# Other Approach

In [None]:
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import SequenceClassifierOutput

In [None]:
# specify a model (Task 1 Exercise 9)
class BertForMultilabelSequenceClassification(BertForSequenceClassification):
    """BERT sequence classifier whose loss treats every label as an independent binary target.

    The forward pass runs the BERT encoder, applies dropout and the classification
    head, and, when labels are given, scores the logits with ``BCEWithLogitsLoss``.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Return logits (and the BCE loss when ``labels`` is given).

        With ``return_dict`` falsy a tuple ``(loss?, logits, *extras)`` is returned,
        otherwise a ``SequenceClassifierOutput``. ``return_dict=None`` falls back to
        ``self.config.use_return_dict``.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the encoder output is used as the pooled sentence representation.
        logits = self.classifier(self.dropout(encoder_out[1]))

        if labels is None:
            loss = None
        else:
            # Flatten both sides to (-1, num_labels) and compare as floats.
            criterion = torch.nn.BCEWithLogitsLoss()
            flat_logits = logits.view(-1, self.num_labels)
            flat_targets = labels.float().view(-1, self.num_labels)
            loss = criterion(flat_logits, flat_targets)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_out.hidden_states,
                attentions=encoder_out.attentions,
            )

        tail = (logits,) + encoder_out[2:]
        return tail if loss is None else (loss, *tail)

In [None]:
# One output logit per label.
num_labels = 3
model = BertForMultilabelSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

### accuracy and so on

In [None]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):
    y_pred = torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    if sigmoid:
        y_pred = y_pred.sigmoid()
    return ((y_pred > thresh) == y_true.bool()).float().mean().item()

In [None]:
def compute_metrics(eval_pred):
    """Unpack ``(predictions, labels)`` and report their thresholded accuracy."""
    logits, targets = eval_pred
    score = accuracy_thresh(logits, targets)
    return {"accuracy_thresh": score}

### actual part resumed

In [None]:
batch_size = 8

args = TrainingArguments(
    # Explicit checkpoint/log directory: older transformers releases require this argument
    # and raise a TypeError when it is missing.
    output_dir="trainer_output",
    # NOTE(review): newer transformers releases name this `eval_strategy` — confirm against
    # the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# Trainer for the custom multi-label model: fits on the train split, evaluates on the
# validation split and reports the thresholded accuracy.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=ds_encoded["val_dataset"],
    train_dataset=ds_encoded["train_dataset"],
)

In [None]:
ds_encoded

In [None]:
ds_encoded["train_dataset"]["label"]

In [None]:
trainer.evaluate()

In [None]:
trainer.train()

### not important addition

In [None]:
class MultilabelTrainer(Trainer):
    """Trainer that scores logits with ``BCEWithLogitsLoss`` (independent binary labels)."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the multi-label BCE loss for one batch.

        Args:
            model: Model to run; its output must expose ``logits``.
            inputs: Batch dict; the ``"labels"`` entry is popped so that it is not
                forwarded to the model.
            return_outputs: When True, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted because newer ``Trainer`` versions pass it to
                ``compute_loss``; unused here. Defaults to None so existing callers
                are unaffected.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        num_labels = self.model.config.num_labels
        loss = loss_fct(
            logits.view(-1, num_labels),
            labels.float().view(-1, num_labels),
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
# Fresh stock classification model for the custom-trainer variant.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)
model = model.to(device)

In [None]:
# Same data and metric as before; the multi-label loss now comes from
# MultilabelTrainer.compute_loss rather than from the model itself.
multi_trainer = MultilabelTrainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=ds_encoded["val_dataset"],
    train_dataset=ds_encoded["train_dataset"],
)

In [None]:
multi_trainer.evaluate()

In [None]:
multi_trainer.train()

### use the last hidden state for pics

In [None]:
def extract_states(batch, model):
    """Attach the attention-masked mean of the last hidden state to ``batch``.

    Args:
        batch: Mapping with ``"input_ids"`` and ``"attention_mask"`` entries of the
            same rectangular ``(batch, tokens)`` shape (lists, arrays or tensors).
        model: Callable model. If it exposes ``base_model`` (as Hugging Face models
            with a task head do), that encoder is called instead, so classification
            models work as well as bare encoders. The output of the call must
            provide ``last_hidden_state``.

    Returns:
        The same ``batch`` object with a new ``"hidden_state"`` entry: a NumPy array
        of shape ``(batch, hidden_dim)`` averaged over the non-padding tokens.
    """
    # Task-head models return logits only; their encoder is what yields last_hidden_state.
    encoder = getattr(model, "base_model", model)

    # Run on whatever device the encoder lives on; parameter-free callables stay on CPU.
    first_param = (
        next(encoder.parameters(), None) if hasattr(encoder, "parameters") else None
    )
    device = first_param.device if first_param is not None else torch.device("cpu")

    # as_tensor avoids the copy (and warning) torch.tensor produces for existing tensors.
    input_ids = torch.as_tensor(batch["input_ids"]).to(device)
    attention_mask = torch.as_tensor(batch["attention_mask"]).to(device)

    with torch.no_grad():
        output = encoder(input_ids, attention_mask)
    lhs = output.last_hidden_state.cpu().numpy()

    valid = attention_mask.cpu().numpy().astype(bool)
    # Broadcast the (batch, tokens) mask over the hidden dimension; padding is masked out.
    padding = np.broadcast_to(~valid[:, :, None], lhs.shape)
    masked_mean = np.ma.array(lhs, mask=padding).mean(axis=1).data

    batch["hidden_state"] = masked_mean
    return batch

In [None]:
batch = ds_encoded["train_dataset"][:2]

In [None]:
# Pass the loaded model object; the checkpoint name is just a string and cannot be called.
extract_states(batch, model)

In [None]:
# `ds_encoded` is the tokenized DatasetDict built earlier in this notebook; the previous name
# `emotions_encoded` is never defined here and raised a NameError.
last_states = ds_encoded.map(
    extract_states,
    batched=True,
    batch_size=1000,
    fn_kwargs={"model": model},
)

In [None]:
ds_encoded

In [None]:
# The splits in this notebook are named "train_dataset" / "test_dataset"; the bare keys
# "train" and "test" do not exist and raised a KeyError.
X_train = np.array(last_states["train_dataset"]["hidden_state"])
X_test = np.array(last_states["test_dataset"]["hidden_state"])
y_train = np.array(last_states["train_dataset"]["label"])
y_test = np.array(last_states["test_dataset"]["label"])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

# Assumes each example carries several 0/1 labels (the label column is a list), so `y_train`
# is 2-D. A bare LogisticRegression only accepts a 1-D target and raises on such input;
# MultiOutputClassifier fits one logistic regression per label column instead.
logit = MultiOutputClassifier(LogisticRegression())
logit.fit(X_train, y_train)
# For multi-label targets this score is the subset accuracy: every label of an example must match.
logit.score(X_test, y_test)

## use the fine tuned model

In [None]:
custom_text = "I am glad the class is almost over."

In [None]:
# Switch to evaluation mode so dropout is disabled and the prediction is deterministic.
model.eval()
input_tensor = tokenizer.encode(custom_text, return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(input_tensor).logits.cpu()

In [None]:
import scipy

In [None]:
# The model is trained with BCEWithLogitsLoss (independent labels), so each logit gets its own
# sigmoid. A softmax would wrongly force the label probabilities to sum to 1.
probs = scipy.special.expit(logits.flatten().numpy())
probs