In [1]:
# Define import
import os
import re
import torch
import gc
import ctypes
import numpy as np
import evaluate
from datasets import load_dataset
from torch.nn import MSELoss
from transformers import (
    PreTrainedTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BertTokenizerFast,
    DataCollatorWithPadding,
)
from tokenizers import Tokenizer
from transformers import DataCollatorWithPadding

In [2]:
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
# GLUE task name -> name(s) of the text column(s) handed to the tokenizer.
# Insertion order matters: this mapping is also passed to the fine-tuning loop
# as the list of tasks, which are then visited in exactly this order.
task_to_fields = dict(
    qnli=("question", "sentence"),
    rte=("sentence1", "sentence2"),
    mnli=("premise", "hypothesis"),
    stsb=("sentence1", "sentence2"),
    sst2=("sentence",),
    qqp=("question1", "question2"),
    cola=("sentence",),
    mrpc=("sentence1", "sentence2"),
    wnli=("sentence1", "sentence2"),
)

# GLUE task name -> number of outputs of the sequence-classification head.
# stsb uses a single output because it is trained with an MSE (regression) loss
# elsewhere in this notebook.
task_to_num_labels = dict(
    rte=2,
    sst2=2,
    qqp=2,
    mnli=3,
    qnli=2,
    cola=2,
    mrpc=2,
    stsb=1,
    wnli=2,
)

In [4]:
def sample_dataset(dataset, task_name, train_sample=2000, val_sample=200, seed=None):
    """Shrink a GLUE dataset mapping to a small random subset, in place.

    Args:
        dataset: Mapping of split name -> split object. Each split must support
            ``len()``, ``shuffle(seed=...)`` and ``select(indices)``.
        task_name: GLUE task name. ``"mnli"`` is special-cased because its
            validation data lives in ``validation_matched`` and
            ``validation_mismatched`` instead of ``validation``.
        train_sample: Maximum number of training rows to keep.
        val_sample: Maximum number of rows to keep in each validation split.
        seed: Optional seed forwarded to ``shuffle`` so the subset is
            reproducible. ``None`` (the default) keeps the shuffle unseeded.

    Returns:
        The same ``dataset`` object, with ``train`` and the validation split(s)
        replaced by their sampled versions and every ``test*`` split removed.

    Raises:
        KeyError: If an expected split is missing from ``dataset``.
    """

    def _subsample(split, limit):
        # Clamp per split: each split has its own length, so a limit that fits
        # one validation split must not be reused blindly for another.
        limit = min(limit, len(split))
        return split.shuffle(seed=seed).select(range(limit))

    if task_name == "mnli":
        validation_splits = ("validation_matched", "validation_mismatched")
    else:
        validation_splits = ("validation",)

    dataset["train"] = _subsample(dataset["train"], train_sample)
    for split_name in validation_splits:
        dataset[split_name] = _subsample(dataset[split_name], val_sample)

    # Drop every test split (MNLI names them test_matched / test_mismatched) so
    # later stages do not spend time on data that is never used.
    # The names are materialised first because the mapping is mutated in the loop.
    for split_name in [name for name in dataset if name.startswith("test")]:
        del dataset[split_name]

    return dataset

In [5]:
# Smoke-test the sampling helper on a single task (MRPC) and display the result.
task_name = "mrpc"
mrpc_dataset = sample_dataset(load_dataset("glue", task_name), task_name=task_name)
mrpc_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 200
    })
})

In [6]:
# tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
# model = BertForSequenceClassification.from_pretrained(
#     "bert-base-uncased", num_labels=3, torch_dtype=torch.float16
# )
# print(model.name_or_path)
# print(model.config.num_labels)
# model.config.num_labels = 2
# print(model.config.num_labels)

In [7]:
def tokenize_function(examples, tokenizer, task_name):
    """Tokenize a batch of examples for the given GLUE task.

    The text column names are looked up in ``task_to_fields``. A task with one
    column is encoded as single sentences; otherwise the first two columns are
    encoded as sentence pairs. Inputs are truncated to 512 tokens.

    Raises:
        ValueError: If ``task_name`` has no (non-empty) entry in
            ``task_to_fields``.
    """
    columns = task_to_fields.get(task_name)
    if not columns:
        raise ValueError(f"Task {task_name} not found in task_to_fields dictionary.")

    # One positional argument -> single-sentence input, two -> sentence pair.
    texts = [examples[column] for column in columns[:2]]
    return tokenizer(*texts, truncation=True, max_length=512)

In [8]:
def preprocess_dataset(dataset, tokenizer, task_name):
    """Return ``dataset`` with tokenizer outputs added via a batched ``map``.

    ``dataset`` only needs a ``map(fn, batched=True)`` method; whatever that
    call returns is passed straight back to the caller.
    """

    def _tokenize_batch(batch):
        # Bind tokenizer and task so ``map`` only has to supply the batch.
        return tokenize_function(batch, tokenizer, task_name)

    return dataset.map(_tokenize_batch, batched=True)

In [9]:
def compute_loss_for_task(model, inputs, task_name):
    """Compute the training loss for one batch of a GLUE task.

    Args:
        model: Callable invoked as ``model(**kwargs)``; its result must expose
            ``.logits`` (used for ``"stsb"``) or ``.loss`` (all other tasks).
        inputs: Mapping of keyword arguments for the model. For ``"stsb"`` it
            must contain a ``"labels"`` tensor. The mapping is never modified.
        task_name: GLUE task name; only ``"stsb"`` is treated as regression.

    Returns:
        For ``"stsb"``: the mean-squared error between the flattened logits and
        the flattened labels. Otherwise: the ``loss`` attribute the model
        computed itself.

    Raises:
        KeyError: If ``task_name`` is ``"stsb"`` and ``inputs`` has no
            ``"labels"`` entry.
    """
    if task_name == "stsb":
        # STS-B (regression) requires MSE loss.
        labels = inputs["labels"]
        # Build a label-free copy instead of popping from ``inputs``: the caller
        # keeps using the same batch afterwards and must still find its labels.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        logits = model(**model_inputs).logits
        # Compare in float32 so half-precision logits or integer labels do not
        # lead to a dtype mismatch inside the loss.
        return MSELoss()(logits.view(-1).float(), labels.view(-1).float())

    # Classification tasks (default): the model computes the loss from labels.
    return model(**inputs).loss

In [10]:
class CustomTrainer(Trainer):
    """``Trainer`` that picks the loss according to the GLUE task being trained.

    ``"stsb"`` is trained with a mean-squared-error loss on the flattened
    logits; every other task uses the loss the model computes from the labels.
    """

    def __init__(self, *args, task_name=None, **kwargs):
        """Forward all arguments to ``Trainer`` and remember ``task_name``."""
        super().__init__(*args, **kwargs)
        self.task_name = task_name

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the loss for one batch, optionally with the model outputs.

        Args:
            model: The model to run.
            inputs: Mapping of model keyword arguments, including ``"labels"``.
                It is not modified.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only ``loss``.
            num_items_in_batch: Accepted for compatibility with ``Trainer``
                versions that pass this extra argument; unused here.

        The model is run exactly once and its outputs are reused for the return
        value, so evaluation does not pay for a second forward pass. This is
        why the forward pass is done here rather than through
        ``compute_loss_for_task``, which returns only the loss.
        """
        if self.task_name == "stsb":
            # STS-B (regression): the model is called without labels and the
            # MSE loss is computed here from its logits.
            labels = inputs["labels"]
            model_inputs = {
                key: value for key, value in inputs.items() if key != "labels"
            }
            outputs = model(**model_inputs)
            loss = MSELoss()(outputs.logits.view(-1), labels.view(-1))
        else:
            # Classification tasks: the model computes the loss from the labels.
            outputs = model(**inputs)
            loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

In [11]:
def setup_trainer(
    model,
    dataset,
    tokenizer,
    data_collator,
    task_name,
    batch_size=32,
    learning_rate=5e-5,
    num_train_epochs=1,
):
    """Build a ``CustomTrainer`` for one GLUE task.

    Args:
        model: Model to fine-tune. The last ``/``-separated segment of
            ``model.name_or_path`` names the output directory under
            ``./results``.
        dataset: Mapping with a ``"train"`` split and a ``"validation"`` split
            (``"validation_matched"`` for ``"mnli"``).
        tokenizer: Tokenizer handed to the trainer.
        data_collator: Collator used to assemble batches.
        task_name: GLUE task name; selects the evaluation split and the loss.
        batch_size: Per-device batch size for both training and evaluation.
            Lower it if the GPU runs out of memory.
        learning_rate: Initial learning rate.
        num_train_epochs: Number of passes over the training split.

    Returns:
        The configured ``CustomTrainer`` (not yet trained).
    """
    model_name = model.name_or_path.split("/")[-1]
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        eval_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        weight_decay=3e-5,
        logging_dir="./logs",
        logging_steps=10,
        # Mixed precision needs a CUDA device; fall back to full precision
        # instead of failing at construction time on CPU-only machines.
        fp16=torch.cuda.is_available(),
    )

    # MNLI has no plain "validation" split; evaluate on the matched one.
    eval_split = "validation_matched" if task_name == "mnli" else "validation"

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset[eval_split],
        tokenizer=tokenizer,
        data_collator=data_collator,
        task_name=task_name,
    )
    return trainer

In [12]:
def compute_metrics(trainer, dataset, task_name, file_path):
    """Score the trained model on the validation data and append the results to a file.

    Args:
        trainer: Trainer whose ``predict`` method and ``model.name_or_path``
            are used.
        dataset: Mapping with a ``"validation"`` split, or, for ``"mnli"``,
            ``"validation_matched"`` and ``"validation_mismatched"``.
        task_name: GLUE task name; also selects the GLUE metric configuration.
        file_path: Text file that receives one tab-separated line per evaluated
            split: model name, split label, metric dict.

    Returns:
        For ``"mnli"``: the tuple ``(matched_result, mismatched_result)``.
        For every other task: the single metric dict.
    """
    model_name = "-".join(trainer.model.name_or_path.split("/")[-2:])
    metric = evaluate.load("glue", task_name)

    def _score_split(split_name):
        # Predict on one split and turn the raw model outputs into metric input.
        output = trainer.predict(dataset[split_name])
        if task_name == "stsb":
            # stsb is configured with a single regression output in this
            # notebook; argmax over it would always be 0, so the raw scores
            # are passed to the metric instead.
            predictions = output.predictions.reshape(-1)
        else:
            predictions = np.argmax(output.predictions, axis=1)
        return metric.compute(predictions=predictions, references=output.label_ids)

    if task_name == "mnli":
        # MNLI is evaluated on both of its validation sets.
        labelled_results = [
            (f"{task_name}_matched", _score_split("validation_matched")),
            (f"{task_name}_mismatched", _score_split("validation_mismatched")),
        ]
    else:
        labelled_results = [(task_name, _score_split("validation"))]

    # Append so that results of several tasks accumulate in the same file.
    with open(file_path, "a", encoding="utf-8") as f:
        for label, result in labelled_results:
            f.write(f"{model_name}\t{label}\t{result}\n")

    if task_name == "mnli":
        return labelled_results[0][1], labelled_results[1][1]
    return labelled_results[0][1]

In [13]:
def free_memory():
    """Best-effort release of memory held by Python, the CUDA cache and glibc.

    Runs the garbage collector, empties PyTorch's CUDA cache when a CUDA device
    is available, and asks libc to trim its heap. The native-library steps are
    skipped when the libraries cannot be loaded, so the function is safe to
    call on any platform. Returns ``None``.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        # Only loads the CUDA driver library; the handle is not used further.
        ctypes.CDLL("libcuda.so", mode=ctypes.RTLD_GLOBAL)
    except OSError:
        # No CUDA driver library on this machine; nothing to do.
        pass

    try:
        libc = ctypes.CDLL("libc.so.6")  # clearing cache
        libc.malloc_trim(0)
    except (OSError, AttributeError):
        # libc.so.6 is missing or has no malloc_trim: trimming is optional.
        pass

In [14]:
def fine_tune_on_all_tasks(task_names, task_to_num_labels, file_path):
    """Fine-tune BERT on a sequence of GLUE tasks, one after another.

    The first task starts from ``bert-base-uncased``. Every later task starts
    from the weights saved after the previous task, with a classification head
    re-initialised when the number of labels differs. After each task the
    validation metrics are appended to ``file_path``.

    Args:
        task_names: Iterable of GLUE task names (a dict keyed by task name
            works too); tasks are processed in iteration order.
        task_to_num_labels: Mapping of task name -> number of model outputs.
        file_path: Results file. An existing file is deleted first so each run
            starts with a clean log.

    Raises:
        KeyError: If a task is missing from ``task_to_num_labels``.
    """
    if os.path.exists(file_path):
        os.remove(file_path)

    base_model = "bert-base-uncased"
    tokenizer = BertTokenizerFast.from_pretrained(base_model)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

    # Directory holding the weights saved after the previous task, if any.
    previous_stage_dir = None

    for task_name in task_names:
        num_labels = task_to_num_labels[task_name]
        if previous_stage_dir is None:
            model = BertForSequenceClassification.from_pretrained(
                base_model, num_labels=num_labels
            )
        else:
            # The head size may differ between tasks, so mismatched classifier
            # weights are re-initialised instead of raising.
            model = BertForSequenceClassification.from_pretrained(
                previous_stage_dir,
                num_labels=num_labels,
                ignore_mismatched_sizes=True,
            )
        print(f"Fine-tuning on {task_name} with {num_labels} labels.")
        print(f"Loading dataset {task_name} ...")
        dataset = load_dataset("glue", task_name)
        print("Sampling dataset ...")
        dataset = sample_dataset(dataset, task_name)
        print("Tokenizing dataset ...")
        dataset = preprocess_dataset(dataset, tokenizer, task_name)
        trainer = setup_trainer(model, dataset, tokenizer, data_collator, task_name)
        print("Training model ...")
        trainer.train()
        print("Evaluating model ...")
        compute_metrics(trainer, dataset, task_name, file_path)

        # Save to an explicit, task-specific path so the next stage loads
        # exactly these weights rather than whatever a directory listing
        # happens to return first.
        previous_stage_dir = f"results/{base_model}/{task_name}"
        trainer.save_model(previous_stage_dir)

        # Drop references to this stage before the next model is loaded so two
        # models do not have to fit in (GPU) memory at the same time.
        del trainer, model, dataset
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    print("Done!")

In [15]:
# Run the whole pipeline: every task listed in task_to_fields is fine-tuned in
# turn and its validation metrics are written to results.txt.
fine_tune_on_all_tasks(
    task_names=task_to_fields,
    task_to_num_labels=task_to_num_labels,
    file_path="results.txt",
)
# print(os.listdir("results/bert-base-uncased/"))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine-tuning on qnli with 2 labels.
Loading dataset qnli ...
Sampling dataset ...
Tokenizing dataset ...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training model ...


  0%|          | 0/63 [00:00<?, ?it/s]

{'loss': 0.6938, 'grad_norm': 3.9766900539398193, 'learning_rate': 4.2063492063492065e-05, 'epoch': 0.16}
{'loss': 0.5947, 'grad_norm': 4.678134441375732, 'learning_rate': 3.412698412698413e-05, 'epoch': 0.32}
{'loss': 0.5486, 'grad_norm': 5.106545925140381, 'learning_rate': 2.6190476190476192e-05, 'epoch': 0.48}


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 59.12 MiB is free. Including non-PyTorch memory, this process has 6.89 GiB memory in use. Of the allocated memory 6.14 GiB is allocated by PyTorch, and 250.16 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

1