In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [1]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer

model_name = "google-bert/bert-base-uncased"

In [2]:
from datasets import load_dataset

dataset = load_dataset("lenatr99/Slovene_SuperGLUE_COPA")

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)



In [4]:
CONTEXT_COL = "premise"
QUESTION_COL = "question"
CHOICE_1_COL = "choice1"
CHOICE_2_COL = "choice2"


def preprocess_function(examples):
    """
    The preprocessing function needs to:
    1. Make two copies of the CONTEXT_COL field and combine each of them with QUESTION_COL to recreate how a sentence starts.
    2. Combine QUESTION_COL with each of the two possible choices.
    3. Flatten these two lists so you can tokenize them, and then unflatten them afterward so each example has a corresponding input_ids, attention_mask, and labels field.
    """

    question_headers = examples[QUESTION_COL]

    # Repeat each premise two times to go with the two choice possibilities.
    first_sentences = [[context] * 2 for context in examples[CONTEXT_COL]]
    # Grab all choices possible for each context.
    second_sentences = [
        [f"{header} {examples[choice][i]}" for choice in [CHOICE_1_COL, CHOICE_2_COL]] for i, header
        in enumerate(question_headers)
    ]

    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])

    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    return {k: [v[i: i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

In [5]:
tokenized_dataset = dataset.map(preprocess_function,
                                remove_columns=['idx', 'premise', 'question', 'choice1', 'choice2'], batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [7]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    label_ids = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(label_ids, preds, average="weighted")
    acc = accuracy_score(label_ids, preds)
    return {"accuracy": acc, "f1": f1}

In [8]:
from transformers import set_seed

set_seed(42)

model = AutoModelForMultipleChoice.from_pretrained(model_name)
model.config.use_cache = False

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from peft import LoraConfig, get_peft_model, TaskType

lora_alpha = 32
lora_rank_dropout = 0.1
lora_module_dropout = 0.0
lora_r = 16

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=lora_r,
    lora_alpha=lora_alpha,
    bias="none",
    base_model_name_or_path=model_name
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 590,593 || all params: 110,073,602 || trainable%: 0.5365


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [11]:
from transformers import TrainingArguments

new_model_name = "lora_fine_tuned_copa"

training_args = TrainingArguments(
    output_dir=new_model_name,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-3,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy='steps',
    max_steps=400,
    use_cpu=False,
    load_best_model_at_end=True
)

In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['eval'],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

max_steps is given, it will override any value given in num_train_epochs


In [13]:
trainer.train()

  0%|          | 0/400 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.7084, 'grad_norm': 1.1963245868682861, 'learning_rate': 0.002625, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931522488594055, 'eval_accuracy': 0.49, 'eval_f1': 0.49045904590459044, 'eval_runtime': 1.8875, 'eval_samples_per_second': 52.981, 'eval_steps_per_second': 6.887, 'epoch': 1.0}
{'loss': 0.6986, 'grad_norm': 3.266575813293457, 'learning_rate': 0.0022500000000000003, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931469440460205, 'eval_accuracy': 0.61, 'eval_f1': 0.6095657455208017, 'eval_runtime': 0.4689, 'eval_samples_per_second': 213.257, 'eval_steps_per_second': 27.723, 'epoch': 2.0}
{'loss': 0.6975, 'grad_norm': 1.3292813301086426, 'learning_rate': 0.001875, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931471228599548, 'eval_accuracy': 0.51, 'eval_f1': 0.4904, 'eval_runtime': 0.4668, 'eval_samples_per_second': 214.23, 'eval_steps_per_second': 27.85, 'epoch': 3.0}
{'loss': 0.6953, 'grad_norm': 1.432462453842163, 'learning_rate': 0.0015, 'epoch': 4.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931471228599548, 'eval_accuracy': 0.54, 'eval_f1': 0.5331541959487391, 'eval_runtime': 0.4653, 'eval_samples_per_second': 214.911, 'eval_steps_per_second': 27.938, 'epoch': 4.0}
{'loss': 0.7156, 'grad_norm': 1.379629135131836, 'learning_rate': 0.0011250000000000001, 'epoch': 5.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931471228599548, 'eval_accuracy': 0.54, 'eval_f1': 0.5194766194766194, 'eval_runtime': 0.4632, 'eval_samples_per_second': 215.895, 'eval_steps_per_second': 28.066, 'epoch': 5.0}
{'loss': 0.7023, 'grad_norm': 1.3663792610168457, 'learning_rate': 0.00075, 'epoch': 6.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931471228599548, 'eval_accuracy': 0.5, 'eval_f1': 0.4971440228478172, 'eval_runtime': 0.4646, 'eval_samples_per_second': 215.23, 'eval_steps_per_second': 27.98, 'epoch': 6.0}
{'loss': 0.6998, 'grad_norm': 1.1478468179702759, 'learning_rate': 0.000375, 'epoch': 7.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931471228599548, 'eval_accuracy': 0.47, 'eval_f1': 0.4711184805547181, 'eval_runtime': 0.4647, 'eval_samples_per_second': 215.17, 'eval_steps_per_second': 27.972, 'epoch': 7.0}
{'loss': 0.7122, 'grad_norm': 1.7578575611114502, 'learning_rate': 0.0, 'epoch': 8.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931471228599548, 'eval_accuracy': 0.5, 'eval_f1': 0.49255890863993385, 'eval_runtime': 0.4646, 'eval_samples_per_second': 215.228, 'eval_steps_per_second': 27.98, 'epoch': 8.0}
{'train_runtime': 50.8965, 'train_samples_per_second': 62.873, 'train_steps_per_second': 7.859, 'train_loss': 0.7037102890014648, 'epoch': 8.0}


TrainOutput(global_step=400, training_loss=0.7037102890014648, metrics={'train_runtime': 50.8965, 'train_samples_per_second': 62.873, 'train_steps_per_second': 7.859, 'total_flos': 130275882637824.0, 'train_loss': 0.7037102890014648, 'epoch': 8.0})

In [14]:
trainer.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.6931471228599548,
 'eval_accuracy': 0.5,
 'eval_f1': 0.49255890863993385,
 'eval_runtime': 0.5401,
 'eval_samples_per_second': 185.14,
 'eval_steps_per_second': 24.068,
 'epoch': 8.0}

In [15]:
trainer.push_to_hub(new_model_name)



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lenatr99/lora_fine_tuned_copa/commit/94a0699a2d587527c788281bdc3c81e7e8b50408', commit_message='lora_fine_tuned_copa', commit_description='', oid='94a0699a2d587527c788281bdc3c81e7e8b50408', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
# Example
choice1 = "Naveličala sta se prepirov."
choice2 = "Izogibala sta se razgovoru o težavi."
prompt = "Odločila sta se skleniti kompromis."
question = "cause"

In [17]:
# We need to set the seed, otherwise some weights of the model are initialized differently every time, and consequently the result can be different each time as well
set_seed(42)

adapter_name = "lenatr99/" + new_model_name

tokenizer = AutoTokenizer.from_pretrained(adapter_name)
inputs = tokenizer([[prompt, f"{question} {choice1}"], [prompt, f"{question} {choice2}"]], return_tensors="pt",
                   padding=True)
labels = torch.tensor(0).unsqueeze(0)

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [18]:
model = AutoModelForMultipleChoice.from_pretrained(adapter_name)
outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
logits = outputs.logits

adapter_config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

In [19]:
# Print prediction
logits.argmax().item()

0