<a href="https://colab.research.google.com/github/TheoLpr/NLI_study/blob/main/Multi_task_Training_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library setup

In [None]:
# Environment setup: upgrade transformers, install the pinned nlp 0.2.0
# release, and install datasets. Each command runs in the notebook's shell.
!pip install -q --upgrade transformers
!pip install nlp==0.2.0
!pip install datasets



In [None]:
!pip uninstall accelerate

!pip install accelerate==0.20.3

Found existing installation: accelerate 0.20.3
Uninstalling accelerate-0.20.3:
  Would remove:
    /usr/local/bin/accelerate
    /usr/local/bin/accelerate-config
    /usr/local/bin/accelerate-launch
    /usr/local/lib/python3.10/dist-packages/accelerate-0.20.3.dist-info/*
    /usr/local/lib/python3.10/dist-packages/accelerate/*
Proceed (Y/n)? y
  Successfully uninstalled accelerate-0.20.3
Collecting accelerate==0.20.3
  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [None]:
import numpy as np
import torch
import torch.nn as nn
import transformers
import nlp
import logging
logging.basicConfig(level=logging.INFO)

## Data loading


In [None]:
import datasets
# Load the "snli" dataset; the following cells index `data` by the
# "train", "validation" and "test" split names.
data=datasets.load_dataset("snli")


Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

In [None]:
def _has_gold_label(_ex):
    """Return True for examples whose "label" field is not -1."""
    return _ex["label"] != -1


# Remove the label == -1 rows from every split. The training split is then
# cut down to the first 50000 rows that survive the filter.
labelled_train = data["train"].filter(_has_gold_label)
train_data = labelled_train.select(range(50000))
val_data = data["validation"].filter(_has_gold_label)
test_data = data["test"].filter(_has_gold_label)

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
from datasets import DatasetDict

In [None]:
# Group the three filtered SNLI splits under their split names.
snli = DatasetDict()
snli["train"] = train_data
snli["validation"] = val_data
snli["test"] = test_data

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; later cells read datasets from and
# write checkpoints/predictions to paths under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from datasets import load_from_disk

# Columns stored with the saved datasets that are stripped after loading.
_saved_token_columns = ["input_ids", "attention_mask"]

# Train and validation splits of the "inverted" task come from datasets
# previously saved on Drive.
train_data2 = load_from_disk("/content/gdrive/MyDrive/Colab Notebooks/train_data_inv_bert")
train_data2 = train_data2.remove_columns(_saved_token_columns)

val_data2 = load_from_disk("/content/gdrive/MyDrive/Colab Notebooks/val_data_inv_bert")
val_data2 = val_data2.remove_columns(_saved_token_columns)

# The test split is the same object as the filtered SNLI test split.
test_data2 = test_data







In [None]:
# Group the splits of the inverted task under their split names.
snli_inv = DatasetDict()
snli_inv["train"] = train_data2
snli_inv["validation"] = val_data2
snli_inv["test"] = test_data2

In [None]:
# Task name -> DatasetDict; the task names are reused as keys throughout the notebook.
dataset_dict = dict(snli=snli, snli_inv=snli_inv)

In [None]:
# Show the first training example of every task, followed by a blank line.
for task_name, dataset in dataset_dict.items():
    first_example = dataset["train"][0]
    print(task_name, first_example, "", sep="\n")

snli
{'premise': 'A person on a horse jumps over a broken down airplane.', 'hypothesis': 'A person is training his horse for a competition.', 'label': 1}

snli_inv
{'hypothesis': 'A person on a horse jumps over a broken down airplane.', 'premise': 'A person is training his horse for a competition.', 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]), 'label': tensor(2)}



## Creating a Multi-task Model


In [None]:
class MultitaskModel(transformers.PreTrainedModel):
    """Collection of single-task models that share one encoder, selected by task name."""

    def __init__(self, encoder, taskmodels_dict):
        """
        Keep a reference to the shared encoder and register the task models.

        The class derives from PreTrainedModel (initialised with a default
        PretrainedConfig) so that it can take better advantage of Trainer
        features.
        """
        super().__init__(transformers.PretrainedConfig())

        self.encoder = encoder
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)

    @classmethod
    def create(cls, model_name, model_type_dict, model_config_dict):
        """
        Build a MultitaskModel from single-task model classes and configs.

        One model is instantiated per task via `from_pretrained`. The encoder
        of the first model is kept, and it replaces the encoder attribute of
        every later model, so all task models end up sharing it.

        `model_type_dict` and `model_config_dict` are both keyed by task name.
        """
        encoder = None
        task_models = {}
        for task_name in model_type_dict:
            task_model = model_type_dict[task_name].from_pretrained(
                model_name,
                config=model_config_dict[task_name],
            )
            attr_name = cls.get_encoder_attr_name(task_model)
            if encoder is not None:
                # Later models: swap their own encoder for the shared one.
                setattr(task_model, attr_name, encoder)
            else:
                # First model: its encoder becomes the shared one.
                encoder = getattr(task_model, attr_name)
            task_models[task_name] = task_model
        return cls(encoder=encoder, taskmodels_dict=task_models)

    @classmethod
    def get_encoder_attr_name(cls, model):
        """
        Return the attribute name under which `model` stores its encoder.

        The name depends on the model architecture and is derived from the
        prefix of the model's class name. A KeyError is raised for class
        names that match none of the known prefixes.
        """
        known_prefixes = (
            ("Bert", "bert"),
            ("Roberta", "roberta"),
            ("Albert", "albert"),
        )
        model_class_name = model.__class__.__name__
        for prefix, attr_name in known_prefixes:
            if model_class_name.startswith(prefix):
                return attr_name
        raise KeyError(f"Add support for new model {model_class_name}")

    def forward(self, task_name, **kwargs):
        """Forward the keyword arguments to the model registered for `task_name`."""
        task_model = self.taskmodels_dict[task_name]
        return task_model(**kwargs)

In [None]:
model_name = "bert-base-cased"

# Both tasks use the same sequence-classification model class and a config
# with 3 labels; each task gets its own config object.
_task_names = ("snli", "snli_inv")
_model_type_dict = {
    _name: transformers.AutoModelForSequenceClassification for _name in _task_names
}
_model_config_dict = {
    _name: transformers.AutoConfig.from_pretrained(model_name, num_labels=3)
    for _name in _task_names
}

multitask_model = MultitaskModel.create(
    model_name=model_name,
    model_type_dict=_model_type_dict,
    model_config_dict=_model_config_dict,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [None]:
# Sanity check that every task model wraps the *same* encoder: the storage
# address of the word-embedding weights must be identical for the shared
# encoder and for the encoder attribute of each task model.
#
# The encoder attribute name ("bert", "roberta", "albert", ...) is resolved
# through MultitaskModel.get_encoder_attr_name, so the check runs for every
# supported architecture instead of only for "roberta-" checkpoints.
shared_ptr = multitask_model.encoder.embeddings.word_embeddings.weight.data_ptr()
print(shared_ptr)
for task_name, task_model in multitask_model.taskmodels_dict.items():
    encoder_attr = MultitaskModel.get_encoder_attr_name(task_model)
    task_encoder = getattr(task_model, encoder_attr)
    task_ptr = task_encoder.embeddings.word_embeddings.weight.data_ptr()
    print(task_ptr)
    # Fail loudly if a task model ended up with its own copy of the encoder.
    assert task_ptr == shared_ptr, f"{task_name} does not share the encoder weights"

Exercise for the reader: add a check for other model architectures =)


## Processing our task data

We have created a dictionary of NLP datasets above, but we need to do a little more work to convert the respective task data into model inputs.

We'll start by first getting the tokenizer corresponding to our model.

In [None]:
# Load the tokenizer for the same `model_name` checkpoint the model was created from.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [None]:
max_length = 128


def _encode_sentence_pairs(example_batch, first_column, second_column):
    """
    Tokenize a batch of sentence pairs and attach the labels.

    The pair is built as (example_batch[first_column], example_batch[second_column]).
    Every pair is padded to `max_length` and truncated if longer. Truncation is
    requested explicitly: combining `max_length` with the deprecated
    `pad_to_max_length=True` only truncated through a fallback that emits a
    warning.

    Returns the tokenizer output with an extra "labels" entry copied from
    example_batch["label"].
    """
    inputs = list(zip(example_batch[first_column], example_batch[second_column]))
    features = tokenizer.batch_encode_plus(
        inputs,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    features["labels"] = example_batch["label"]
    return features


def convert_to_snli_features(example_batch):
    """Encode a batch as (premise, hypothesis) pairs."""
    return _encode_sentence_pairs(example_batch, "premise", "hypothesis")


def convert_to_snli_inv_features(example_batch):
    """Encode a batch as (hypothesis, premise) pairs, i.e. in the reverse order."""
    return _encode_sentence_pairs(example_batch, "hypothesis", "premise")


# Task name -> function that turns a raw batch into model features.
convert_func_dict = {
    "snli": convert_to_snli_features,
    "snli_inv": convert_to_snli_inv_features,
}

In [None]:
# Columns handed to the model as torch tensors.
# They are derived from the tokenizer's `model_input_names` rather than being
# hard-coded: for BERT this keeps `token_type_ids`, which marks the two
# sentences of each pair. The previous fixed list
# (input_ids / attention_mask / labels) silently dropped it, so the model
# never saw the segment ids. Tokenizers that do not produce token_type_ids
# simply do not list it.
model_input_columns = list(tokenizer.model_input_names) + ["labels"]
columns_dict = {
    "snli": model_input_columns,
    "snli_inv": model_input_columns,
}

# features_dict[task_name][phase] -> tokenized dataset formatted as torch tensors.
features_dict = {}
for task_name, dataset in dataset_dict.items():
    features_dict[task_name] = {}
    for phase, phase_dataset in dataset.items():
        phase_features = phase_dataset.map(
            convert_func_dict[task_name],
            batched=True,
            load_from_cache_file=False,
        )
        phase_features.set_format(
            type="torch",
            columns=columns_dict[task_name],
        )
        features_dict[task_name][phase] = phase_features
        # Row counts before and after conversion, printed once per split
        # (the original printed the identical line twice).
        print(task_name, phase, len(phase_dataset), len(phase_features))

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


snli train 50000 50000
snli train 50000 50000


Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

snli validation 9842 9842
snli validation 9842 9842


Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

snli test 9824 9824
snli test 9824 9824


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

snli_inv train 50000 50000
snli_inv train 50000 50000


Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

snli_inv validation 9842 9842
snli_inv validation 9842 9842


Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

snli_inv test 9824 9824
snli_inv test 9824 9824


## Preparing a multi-task data loader and Trainer


In [None]:
from transformers.data.data_collator import DataCollator, InputDataClass
from typing import List, Union, Dict

In [None]:
import dataclasses
from torch.utils.data.dataloader import DataLoader
from transformers.training_args import is_torch_tpu_available
#from transformers.trainer import _get_train_sampler

from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler





class StrIgnoreDevice(str):
    """
    A string that tolerates being "moved" to a device.

    This is a hack. The Trainer is going to call .to(device) on every input
    value, but we need to pass in an additional `task_name` string.
    Giving the string a no-op `to` prevents that call from throwing an error.
    """
    def to(self, device=None, *args, **kwargs):
        """
        Return the string itself, ignoring every argument.

        Besides `device`, any extra positional or keyword arguments (for
        example the `dtype` / `non_blocking` options that `Tensor.to` takes)
        are accepted and discarded, so the object can stand in for a tensor
        whichever way `.to` is invoked.
        """
        return self


class DataLoaderWithTaskname:
    """
    Decorates the batches of an underlying data loader with a task name.
    """
    def __init__(self, task_name, data_loader):
        self.task_name = task_name
        self.data_loader = data_loader

        # Expose the wrapped loader's batch size and dataset on the wrapper.
        wrapped = data_loader
        self.batch_size = wrapped.batch_size
        self.dataset = wrapped.dataset

    def __len__(self):
        # Same number of batches as the wrapped loader.
        wrapped = self.data_loader
        return len(wrapped)

    def __iter__(self):
        # Each batch from the wrapped loader gets a "task_name" entry before
        # being yielded. The name is wrapped in StrIgnoreDevice.
        for tagged_batch in self.data_loader:
            tagged_batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield tagged_batch


class MultitaskDataloader:
    """
    Iterable that interleaves the batches of several single-task data
    loaders in random order.
    """
    def __init__(self, dataloader_dict):
        self.dataloader_dict = dataloader_dict
        self.task_name_list = list(dataloader_dict)

        # Number of batches per task, plus the total number of examples.
        self.num_batches_dict = {}
        total_examples = 0
        for name, loader in dataloader_dict.items():
            self.num_batches_dict[name] = len(loader)
            total_examples += len(loader.dataset)

        # Placeholder list: it only has the combined length of the task
        # datasets, its items are all None.
        self.dataset = [None] * total_examples

    def __len__(self):
        # Total number of batches over all tasks.
        return sum(count for count in self.num_batches_dict.values())

    def __iter__(self):
        """
        Yield one batch at a time, each drawn from a randomly scheduled task.

        Every task index appears in the schedule once per batch of that task
        (size-proportional sampling). The schedule is shuffled with
        `np.random.shuffle`, and batches of a given task are consumed in the
        order its own loader produces them.
        """
        schedule = np.array([
            task_index
            for task_index, name in enumerate(self.task_name_list)
            for _ in range(self.num_batches_dict[name])
        ])
        np.random.shuffle(schedule)

        iterators = {}
        for name, loader in self.dataloader_dict.items():
            iterators[name] = iter(loader)

        for task_index in schedule:
            chosen_task = self.task_name_list[task_index]
            yield next(iterators[chosen_task])

class MultitaskTrainer(transformers.Trainer):
    """Trainer whose `train_dataset` is a mapping from task name to dataset."""

    def get_single_train_dataloader(self, task_name, train_dataset):
        """
        Build the training loader of one task.

        The dataset is sampled with a RandomSampler in batches of
        `self.args.train_batch_size`; the resulting loader is wrapped so that
        every batch also carries the task name.

        Raises ValueError when the trainer has no train_dataset.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # No collate_fn is passed here, so the DataLoader falls back to its
        # default collation (the trainer's data_collator is not used).
        loader = DataLoader(
            train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=RandomSampler(train_dataset),
        )
        return DataLoaderWithTaskname(task_name=task_name, data_loader=loader)

    def get_train_dataloader(self):
        """
        Return a MultitaskDataloader over one loader per task.

        The returned object is not a torch DataLoader; it is an iterable
        whose generator samples batches from the individual task loaders.
        """
        per_task_loaders = {}
        for task_name, task_dataset in self.train_dataset.items():
            per_task_loaders[task_name] = self.get_single_train_dataloader(
                task_name, task_dataset
            )
        return MultitaskDataloader(per_task_loaders)

## Training part



In [None]:
from transformers import DefaultDataCollator

In [None]:
# Task name -> training split of that task's tokenized features.
train_dataset = {
    task: task_features["train"] for task, task_features in features_dict.items()
}

# Checkpoints are written to the mounted Drive folder every 3000 steps.
training_args = transformers.TrainingArguments(
    output_dir="/content/gdrive/MyDrive/Colab Notebooks",
    overwrite_output_dir=True,
    learning_rate=1e-5,
    do_train=True,
    num_train_epochs=1,
    # Adjust batch size if this doesn't fit on the Colab GPU
    per_device_train_batch_size=16,
    save_steps=3000,
)

trainer = MultitaskTrainer(
    model=multitask_model,
    args=training_args,
    data_collator=DefaultDataCollator(),
    train_dataset=train_dataset,
)
trainer.train()



Step,Training Loss
500,0.8804
1000,0.6754
1500,0.6039
2000,0.5669
2500,0.5431
3000,0.5075
3500,0.4909
4000,0.4873
4500,0.4765
5000,0.4669


TrainOutput(global_step=6250, training_loss=0.5467273046875, metrics={'train_runtime': 2181.4925, 'train_samples_per_second': 45.84, 'train_steps_per_second': 2.865, 'total_flos': 6578012620800000.0, 'train_loss': 0.5467273046875, 'epoch': 1.0})

In [None]:
# The whole Trainer object is serialized to Drive and immediately read back
# into `trainer2`, which the prediction cells below use.
# NOTE(review): torch.load unpickles arbitrary Python objects; only load
# files you created yourself.
trainer_path = "/content/gdrive/MyDrive/Colab Notebooks/Multi_model_bert.pth"
torch.save(trainer, trainer_path)

trainer2 = torch.load(trainer_path)

In [None]:
# Run the reloaded trainer over the validation split of every task and keep
# the raw prediction output per task name.
eval_batch_size = 16
preds_dict = {}
for task_name, dataset in features_dict.items():
    validation_dataset = {
        task_name: DataLoaderWithTaskname(
            task_name,
            data_loader=DataLoader(dataset["validation"], batch_size=eval_batch_size)
        )
    }
    val_dataloader = MultitaskDataloader(validation_dataset)
    # Hack: something is expecting the dataloader to have a batch size, which is available in the nested dataloader
    val_dataloader.batch_size = eval_batch_size

    preds_dict[task_name] = trainer2.prediction_loop(
        val_dataloader,
        # Label the run with the task actually being evaluated; this was
        # hard-coded to "snli" for every task.
        description=f"Validation: {task_name}"
    )

In [None]:
# Accuracy on the SNLI validation split: fraction of examples whose arg-max
# class equals the gold label.
# NOTE(review): predictions[1] is presumably the logits (index 0 being the
# loss) - confirm against the Trainer version in use.
snli_val_pred = np.argmax(preds_dict["snli"].predictions[1], axis=1)
snli_val_gold = snli["validation"]['label']
snli_acc = np.mean(snli_val_pred == snli_val_gold)


In [None]:
# Display the SNLI validation accuracy computed in the previous cell.
snli_acc

In [None]:
# Run the reloaded trainer over the test split of every task and keep the raw
# prediction output per task name.
# Note: this rebinds `preds_dict`, replacing the validation predictions.
eval_batch_size = 16
preds_dict = {}
for task_name, dataset in features_dict.items():
    test_dataset = {
        task_name: DataLoaderWithTaskname(
            task_name,
            data_loader=DataLoader(dataset["test"], batch_size=eval_batch_size)
        )
    }
    test_dataloader = MultitaskDataloader(test_dataset)
    # Hack: something is expecting the dataloader to have a batch size, which is available in the nested dataloader
    test_dataloader.batch_size = eval_batch_size

    preds_dict[task_name] = trainer2.prediction_loop(
        test_dataloader,
        # Label the run with the task actually being evaluated; this was
        # hard-coded to "snli" for every task.
        description=f"Test: {task_name}"
    )

In [None]:
# Persist `preds_dict` (as filled by the most recently run prediction cell) to Drive.
torch.save(preds_dict,"/content/gdrive/MyDrive/Colab Notebooks/test_preds_multimodel_bert")