In [None]:
!pip install -U "transformers==4.40.0" --upgrade
!pip install accelerate bitsandbytes
!pip install datasets

Collecting transformers==4.40.0
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.2
    Uninstalling transformers-4.40.2:
      Successfully uninstalled transformers-4.40.2
Successfully installed transformers-4.40.0
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cach

In [None]:
import torch as t
import time
import datasets

# Load the two splits of the "art" dataset used below: "train" for
# fine-tuning and "validation" for evaluation.
train_data, test_data = (
  datasets.load_dataset("art", split=split_name)
  for split_name in ("train", "validation")
)

Downloading readme:   0%|          | 0.00/6.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/209k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.98M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1532 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/169654 [00:00<?, ? examples/s]

In [None]:
from dataclasses import dataclass

@dataclass
class Config:
  """Hyperparameters and settings shared by the cells of this notebook."""

  # Passed to TrainingArguments as both the per-device train and eval batch size.
  batch_size: int
  # Checkpoint identifier handed to `from_pretrained` for the tokenizer and the model.
  model: str
  # Seed passed to `set_seed` and to the shuffle of the tokenized training split.
  seed: int
  # Passed to TrainingArguments as `learning_rate`.
  lr: float
  # Passed to TrainingArguments as `num_train_epochs`.
  epochs: int
  # LoRA settings. Not read by any cell visible here; presumably the adapter
  # rank and scaling factor for a later section (TODO confirm).
  lora_r: int
  lora_alpha: float

# Settings for this run.
config = Config(
  batch_size=64,
  model="google-bert/bert-base-uncased",
  seed=42,
  lr=3e-5,
  epochs=3,
  # LoRA: alpha is set to twice the rank, which is a common choice.
  lora_r=16,
  lora_alpha=32,
)

In [None]:
def set_seed(seed: int) -> None:
  """Seed every random number generator this notebook relies on.

  Seeds NumPy's global RNG, Python's `random` module and PyTorch (CPU and
  every CUDA device), switches cuDNN to deterministic mode, exports
  `PYTHONHASHSEED`, and prints a confirmation line.

  Args:
    seed: Value used for every generator.
  """
  import os
  import random

  import numpy as np

  np.random.seed(seed)
  random.seed(seed)
  t.manual_seed(seed)
  # Seed all GPUs rather than only the current device, so multi-GPU runs are
  # covered too. This is silently ignored when CUDA is not available.
  t.cuda.manual_seed_all(seed)
  # When running on the CuDNN backend, two further options must be set
  t.backends.cudnn.deterministic = True
  t.backends.cudnn.benchmark = False
  # Set a fixed value for the hash seed.
  # NOTE: hash randomization is decided at interpreter start-up, so this only
  # affects subprocesses launched from here, not the already-running kernel.
  os.environ["PYTHONHASHSEED"] = str(seed)
  print(f"Random seed set as {seed}")

# Apply the configured seed before the model is created.
set_seed(config.seed)

Random seed set as 42


In [None]:
from transformers import AutoTokenizer, BertForMultipleChoice

# Load the tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(config.model)

# Print a few tokenizer properties, one per line, framed by blank lines.
print("\n".join((
  "",
  f"{tokenizer.name_or_path=}",
  f"{tokenizer.vocab_size=}",
  f"{tokenizer.model_max_length=}",
  "",
)))



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


tokenizer.name_or_path='google-bert/bert-base-uncased'
tokenizer.vocab_size=30522
tokenizer.model_max_length=512



In [None]:
def load_blank_model(config):
  """Load a fresh `BertForMultipleChoice` after releasing any previous one.

  Removes the notebook-level `optimizer` and `model` variables if they exist,
  runs the garbage collector, and empties PyTorch's CUDA cache before the new
  checkpoint is loaded.

  Args:
    config: Object whose `model` attribute is the checkpoint identifier
      passed to `BertForMultipleChoice.from_pretrained`.

  Returns:
    The newly loaded model. As a side effect the same object is also bound to
    the global name `model`.
  """
  import gc

  # `model` is assigned below; declaring it global keeps rebinding the
  # notebook-level variable rather than creating a local one.
  global model

  # Delete references to a previously loaded model (no error if absent).
  # NOTE: any other object that still references the old model keeps it alive.
  notebook_globals = globals()
  notebook_globals.pop("optimizer", None)
  notebook_globals.pop("model", None)

  # Free up GPU memory. Collect garbage first: tensors kept alive only by
  # reference cycles would otherwise still be allocated when the cache is emptied.
  gc.collect()
  t.cuda.empty_cache()

  model = BertForMultipleChoice.from_pretrained(config.model)

  return model

# Instantiate the model that is later handed to the Trainer.
model = load_blank_model(config)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_function(examples):
  """Tokenize a batch of examples into two-choice model inputs.

  For every example the two observations are joined with a space to form the
  context, and that context is paired once with each of the two hypotheses.
  The module-level `tokenizer` encodes all pairs in a single call; the flat
  encodings are then regrouped so each example owns a list of two encodings.

  Args:
    examples: Batch as a mapping of column name to list, containing the
      columns `observation_1`, `observation_2`, `hypothesis_1`,
      `hypothesis_2` and `label`.

  Returns:
    A dict with every field returned by the tokenizer regrouped per example,
    plus `label` holding each input label minus one.
  """
  num_choices = 2

  contexts = [
    obs1 + " " + obs2
    for obs1, obs2 in zip(examples['observation_1'], examples['observation_2'])
  ]
  # Build the flat lists directly. Flattening with sum(list_of_lists, [])
  # copies the accumulator at every step and is quadratic in the batch size.
  first_sentences = [context for context in contexts for _ in range(num_choices)]
  second_sentences = [
    hypothesis
    for pair in zip(examples['hypothesis_1'], examples['hypothesis_2'])
    for hypothesis in pair
  ]

  encoded = tokenizer(first_sentences, second_sentences, truncation=True)
  # Consecutive chunks of `num_choices` flat encodings belong to one example.
  output = {
    key: [values[i:i + num_choices] for i in range(0, len(values), num_choices)]
    for key, values in encoded.items()
  }

  # Adjust labels to be zero-based
  output['label'] = [label - 1 for label in examples['label']]

  return output

# Tokenize both splits in batches; only the training split is shuffled,
# using the configured seed.
tokenized_train = (
  train_data
  .map(preprocess_function, batched=True)
  .shuffle(seed=config.seed)
)
tokenized_test = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/169654 [00:00<?, ? examples/s]

Map:   0%|          | 0/1532 [00:00<?, ? examples/s]

In [None]:
#Source https://www.kaggle.com/code/bennyfung/bert-model-for-multiple-choice#4.-Model-Fine-tuning-for-Bert-Multiple-Choice-LLM

from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union

@dataclass
class DataCollatorForMultipleChoice:
  """Collate multiple-choice features into padded, per-choice tensors.

  Each incoming feature holds one list entry per answer choice for every
  encoded field, plus a `label` (or `labels`) entry. The choices of all
  features are flattened, padded together by `tokenizer.pad`, and reshaped
  back to `(batch_size, num_choices, -1)`.
  """

  # Tokenizer whose `pad` method builds the padded tensors.
  tokenizer: PreTrainedTokenizerBase
  # The three settings below are forwarded unchanged to `tokenizer.pad`.
  padding: Union[bool, str, PaddingStrategy] = True
  max_length: Optional[int] = None
  pad_to_multiple_of: Optional[int] = None

  def __call__(self, features):
    """Build one padded batch from a list of feature dicts.

    Args:
      features: Non-empty list of dicts; every dict has an `input_ids` entry
        with one item per choice and a `label` or `labels` entry.

    Returns:
      Dict of tensors reshaped to `(batch_size, num_choices, -1)` plus a
      `labels` tensor of dtype long.
    """
    label_name = "label" if "label" in features[0].keys() else "labels"
    # Read the labels instead of pop()-ing them: the caller's feature dicts
    # stay intact, so the same features can be collated more than once.
    labels = [feature[label_name] for feature in features]
    batch_size = len(features)
    num_choices = len(features[0]["input_ids"])
    # One flat entry per (example, choice) pair, built in a single pass
    # rather than by the quadratic sum(list_of_lists, []).
    flattened_features = [
        {k: v[i] for k, v in feature.items() if k != label_name}
        for feature in features
        for i in range(num_choices)
    ]

    batch = self.tokenizer.pad(
        flattened_features,
        padding=self.padding,
        max_length=self.max_length,
        pad_to_multiple_of=self.pad_to_multiple_of,
        return_tensors="pt",
    )

    # Un-flatten
    batch = {
        k: v.reshape(batch_size, num_choices, -1) for k, v in batch.items()
    }

    # Add back labels
    batch["labels"] = t.tensor(labels, dtype=t.long)

    return batch

In [None]:
import numpy as np
from datasets import load_metric
from transformers import TrainingArguments, Trainer

# Collator that pads and regroups the per-choice encodings of each batch.
data_collator = DataCollatorForMultipleChoice(tokenizer)

def compute_metrics(eval_pred):
  """Compute accuracy and binary F1 from the model's evaluation output.

  Both metrics are computed directly with NumPy. This avoids
  `datasets.load_metric`, which is deprecated, downloads and executes a
  remote metric script, and would be re-loaded on every evaluation.

  Args:
    eval_pred: Pair `(logits, labels)`. The predicted class is the argmax of
      `logits` over the last axis.

  Returns:
    Dict with `accuracy` (fraction of predictions equal to the label) and
    `f1` (F1 score of class 1; 0.0 when there are no true positives, false
    positives or false negatives), both as Python floats.
  """
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  labels = np.asarray(labels)

  accuracy = float(np.mean(predictions == labels))

  # Binary F1 with class 1 as the positive class, i.e. the defaults of the
  # Hugging Face "f1" metric (average="binary", pos_label=1).
  true_pos = int(np.sum((predictions == 1) & (labels == 1)))
  false_pos = int(np.sum((predictions == 1) & (labels != 1)))
  false_neg = int(np.sum((predictions != 1) & (labels == 1)))
  denominator = 2 * true_pos + false_pos + false_neg
  f1 = 2 * true_pos / denominator if denominator else 0.0

  return {"accuracy": accuracy, "f1": f1}

# Training hyperparameters, all taken from the shared `config`.
training_args = TrainingArguments(
    num_train_epochs=config.epochs,
    output_dir="out",
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    optim="adamw_torch",
    learning_rate=config.lr,
    # Evaluate on the eval dataset at the end of every epoch. Without this the
    # default strategy is "no" and metrics are only produced by an explicit
    # `trainer.evaluate()` call. `evaluation_strategy` is the argument name in
    # the transformers version pinned by this notebook (4.40.0).
    evaluation_strategy="epoch",
    # Make the Trainer's own seeding follow the notebook-wide seed.
    seed=config.seed,
)

# Wire the model, both tokenized splits, the collator and the metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator = data_collator,
    tokenizer=tokenizer,  # padding itself is done by `data_collator`, which was built from this same tokenizer; NOTE(review): the Trainer presumably only keeps this one to save it with checkpoints - confirm
    compute_metrics=compute_metrics,  # called whenever the Trainer runs an evaluation (e.g. `trainer.evaluate()`); whether that also happens during training depends on `training_args`
)

In [None]:
# Fine-tune the model on `tokenized_train` with the settings in `training_args`.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.412
1000,0.3128
1500,0.2809
2000,0.259
2500,0.2426
3000,0.167
3500,0.1346
4000,0.1355
4500,0.1314


Step,Training Loss
500,0.412
1000,0.3128
1500,0.2809
2000,0.259
2500,0.2426
3000,0.167
3500,0.1346
4000,0.1355
4500,0.1314
5000,0.1226


TrainOutput(global_step=7953, training_loss=0.1646202161269234, metrics={'train_runtime': 1967.3611, 'train_samples_per_second': 258.703, 'train_steps_per_second': 4.042, 'total_flos': 2.5978665796784304e+16, 'train_loss': 0.1646202161269234, 'epoch': 3.0})

In [None]:
# Score the fine-tuned model on the validation split (`tokenized_test`);
# the returned dict includes the values produced by `compute_metrics`.
trainer.evaluate()

  load_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 2.1126210689544678,
 'eval_accuracy': 0.6351174934725848,
 'eval_f1': 0.6319947333772219,
 'eval_runtime': 4.7068,
 'eval_samples_per_second': 325.488,
 'eval_steps_per_second': 5.099,
 'epoch': 3.0}

{'eval_loss': 2.1126210689544678,
 'eval_accuracy': 0.6351174934725848,
 'eval_f1': 0.6319947333772219,
 'eval_runtime': 4.7068,
 'eval_samples_per_second': 325.488,
 'eval_steps_per_second': 5.099,
 'epoch': 3.0}