<a href="https://colab.research.google.com/github/TheoLpr/NLI_study/blob/main/Multi_task_Training_full_probas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Library setup

In [None]:
# Upgrade transformers to the newest available release (no version pin).
!pip install -q --upgrade transformers
# Legacy `nlp` package, pinned; it is imported in the setup cell further down.
!pip install nlp==0.2.0
# Hugging Face `datasets`, used below to load SNLI and to build DatasetDict objects.
!pip install datasets



In [None]:
# -y answers pip's "Proceed (Y/n)?" confirmation automatically, so the cell
# does not block on interactive input during "Run all".
!pip uninstall -y accelerate

# Reinstall the pinned accelerate release.
!pip install accelerate==0.20.3

Found existing installation: accelerate 0.20.3
Uninstalling accelerate-0.20.3:
  Would remove:
    /usr/local/bin/accelerate
    /usr/local/bin/accelerate-config
    /usr/local/bin/accelerate-launch
    /usr/local/lib/python3.10/dist-packages/accelerate-0.20.3.dist-info/*
    /usr/local/lib/python3.10/dist-packages/accelerate/*
Proceed (Y/n)? y
  Successfully uninstalled accelerate-0.20.3
Collecting accelerate==0.20.3
  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/gdrive')
import numpy as np
import torch
import torch.nn as nn
import transformers
import nlp
import logging
logging.basicConfig(level=logging.INFO)

Mounted at /content/gdrive


## Data loading


In [None]:
import datasets
# Load SNLI through the `datasets` library; the "train", "validation" and
# "test" splits of this object are indexed by the cells below.
data=datasets.load_dataset("snli")


Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

In [None]:
from datasets import DatasetDict

In [None]:
def _parse_probas(serialized):
    """Turn a stringified probability vector into a list of floats.

    The CSV stores each soft label as text such as
    "[0.41255094 0.58744906 0.        ]".  The surrounding brackets are
    removed first and the remainder is split on whitespace (commas are
    tolerated too), so extra padding after "[" or before "]" cannot produce
    an empty token.

    Args:
        serialized: the string found in the "probas_label" column.

    Returns:
        list[float]: the parsed probabilities, in their original order.

    Raises:
        ValueError: if a token cannot be converted to float.
    """
    cleaned = serialized.strip().strip("[]").replace(",", " ")
    return [float(token) for token in cleaned.split()]


# Soft labels for each split, read from CSV files on the mounted Drive.
train_df = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/train_snli_probas.csv")
test_df = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/test_snli_probas.csv")
val_df = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/val_snli_probas.csv")

# "index" is used below as a row position inside the matching SNLI split.
train_data_probas = train_df[["probas_label", "index"]]
test_data_probas = test_df[["probas_label", "index", "gold_label"]]
val_data_probas = val_df[["probas_label", "index", "gold_label"]]

# The probability vectors were loaded as strings; convert them to float lists.
l_train_probs = [_parse_probas(raw) for raw in train_data_probas["probas_label"]]
l_val_probs = [_parse_probas(raw) for raw in val_data_probas["probas_label"]]
l_test_probs = [_parse_probas(raw) for raw in test_data_probas["probas_label"]]

# Keep only the rows referenced by the CSVs; each selection is computed once
# and reused for both the soft-label dataset and the original hard labels.
train_subset = data["train"].select(list(train_data_probas["index"]))
val_subset = data["validation"].select(list(val_data_probas["index"]))

# Replace the integer "label" column with the soft-label vectors.
train_data = train_subset.remove_columns("label").add_column("label", l_train_probs)
val_data = val_subset.remove_columns("label").add_column("label", l_val_probs)

# The test split drops examples whose label is -1 instead of selecting by index.
# NOTE(review): real_test_labels below is built with select(index) while
# test_data is built with this filter; the two only line up if the CSV rows
# follow the order of the filtered split -- confirm against how the CSV was made.
test_data = (
    data["test"]
    .filter(lambda _ex: _ex["label"] != -1)
    .remove_columns("label")
    .add_column("label", l_test_probs)
)

# Original integer labels for the same rows.
real_train_labels = train_subset["label"]
real_val_labels = val_subset["label"]
real_test_labels = data["test"].select(list(test_data_probas["index"]))["label"]


snli = DatasetDict({"train": train_data, "validation": val_data, "test": test_data})

Flattening the indices:   0%|          | 0/36776 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/9986 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/9824 [00:00<?, ? examples/s]

In [None]:
from datasets import load_from_disk

# Datasets for the second ("snli_inv") task, previously saved to Drive.
# Their stored tokenizer outputs are dropped here; the feature-building cell
# further down tokenizes every split again.
train_data2=load_from_disk("/content/gdrive/MyDrive/Colab Notebooks/train_data_inv_probas").remove_columns(["input_ids","attention_mask"])
# The test split is not loaded from disk: it is the very same object as the
# regular SNLI test split built above.
# NOTE(review): the first snli_inv training example printed below has its two
# sentences swapped relative to the snli one, whereas this test split is not
# swapped. convert_to_snli_inv_features encodes (hypothesis, premise) for all
# splits, so train/validation and test may end up with different sentence
# orders -- confirm which order is intended.
test_data2=test_data
val_data2=load_from_disk("/content/gdrive/MyDrive/Colab Notebooks/val_data_inv_probas").remove_columns(["input_ids","attention_mask"])





In [None]:
snli

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 36776
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9986
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
})

In [None]:
# Splits for the second task; "test" is the same dataset object as snli["test"].
snli_inv=DatasetDict({"train":train_data2, "validation":val_data2, "test":test_data2})

In [None]:
snli_inv["train"]

Dataset({
    features: ['hypothesis', 'premise', 'label'],
    num_rows: 36776
})

In [None]:
# Task name -> DatasetDict. The same two keys are used for the model heads,
# the conversion functions and the column lists defined later in the notebook.
dataset_dict = {
    "snli": snli,
    "snli_inv": snli_inv
}

In [None]:
# Sanity check: show the first training example of every task, each followed
# by an empty line.
for task_name, dataset in dataset_dict.items():
    print(task_name, dataset["train"][0], "", sep="\n")

snli
{'premise': 'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background.', 'hypothesis': 'An elderly man sits in a small shop.', 'label': [0.41255094, 0.58744906, 0.0]}

snli_inv
{'hypothesis': 'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background.', 'premise': 'An elderly man sits in a small shop.', 'label': tensor([0.0036, 0.9876, 0.0088])}



## Creating a Multi-task Model


In [None]:
class MultitaskModel(transformers.PreTrainedModel):
    """Bundle of single-task models that all use one shared encoder."""

    def __init__(self, encoder, taskmodels_dict):
        """
        Store the shared encoder and register the per-task models.

        Subclassing PreTrainedModel (with a default PretrainedConfig) lets
        the object be handed to the Trainer like any other model.
        """
        super().__init__(transformers.PretrainedConfig())

        self.encoder = encoder
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)

    @classmethod
    def create(cls, model_name, model_type_dict, model_config_dict):
        """
        Build a MultitaskModel from single-task model classes and configs.

        Every task model is instantiated with `from_pretrained`; the encoder
        of the first one is kept and then assigned onto each following model,
        so all task models end up referencing the same encoder module.
        """
        shared_encoder = None
        taskmodels_dict = {}
        for task_name, model_type in model_type_dict.items():
            task_model = model_type.from_pretrained(
                model_name,
                config=model_config_dict[task_name],
            )
            encoder_attr = cls.get_encoder_attr_name(task_model)
            if shared_encoder is not None:
                # Later models: swap their own encoder for the shared one.
                setattr(task_model, encoder_attr, shared_encoder)
            else:
                # First model: its encoder becomes the shared one.
                shared_encoder = getattr(task_model, encoder_attr)
            taskmodels_dict[task_name] = task_model
        return cls(encoder=shared_encoder, taskmodels_dict=taskmodels_dict)

    @classmethod
    def get_encoder_attr_name(cls, model):
        """
        Return the attribute name under which `model` stores its encoder.

        The name depends on the architecture and is looked up from the
        prefix of the model's class name; unknown prefixes raise KeyError.
        """
        model_class_name = model.__class__.__name__
        known_prefixes = (
            ("Bert", "bert"),
            ("Roberta", "roberta"),
            ("Albert", "albert"),
        )
        for class_prefix, attr_name in known_prefixes:
            if model_class_name.startswith(class_prefix):
                return attr_name
        raise KeyError(f"Add support for new model {model_class_name}")

    def forward(self, task_name, **kwargs):
        """Run the model registered for `task_name` on the given keyword inputs."""
        task_model = self.taskmodels_dict[task_name]
        return task_model(**kwargs)

In [None]:
model_name = "roberta-base"

# Both tasks use a 3-label sequence-classification head; each task gets its
# own config object loaded from the same checkpoint name.
multitask_model = MultitaskModel.create(
    model_name=model_name,
    model_type_dict={
        task: transformers.AutoModelForSequenceClassification
        for task in ("snli", "snli_inv")
    },
    model_config_dict={
        task: transformers.AutoConfig.from_pretrained(model_name, num_labels=3)
        for task in ("snli", "snli_inv")
    },
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the storage address of the word-embedding weights as seen from the
# shared encoder and from each task model; identical values mean the weights
# are shared rather than copied.
if not model_name.startswith("roberta-"):
    print("Exercise for the reader: add a check for other model architectures =)")
else:
    print(
        multitask_model.encoder.embeddings.word_embeddings.weight.data_ptr(),
        multitask_model.taskmodels_dict["snli"].roberta.embeddings.word_embeddings.weight.data_ptr(),
        multitask_model.taskmodels_dict["snli_inv"].roberta.embeddings.word_embeddings.weight.data_ptr(),
        sep="\n",
    )

139523156332608
139523156332608
139523156332608


## Processing our task data

We have created a dictionary of NLP datasets above, but we need to do a little more work to convert the respective task data into model inputs.

We'll start by first getting the tokenizer corresponding to our model.

In [None]:
# Tokenizer matching the checkpoint named by `model_name`; used by the
# feature-conversion functions below.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
max_length = 128


def _encode_sentence_pairs(example_batch, first_key, second_key):
    """Tokenize a batch of sentence pairs and attach the labels.

    Args:
        example_batch: batch dict holding the two text columns and "label".
        first_key: column used as the first sequence of each pair.
        second_key: column used as the second sequence of each pair.

    Returns:
        The tokenizer output, padded and truncated to `max_length`, with an
        additional "labels" entry copied from example_batch["label"].
    """
    features = tokenizer(
        list(example_batch[first_key]),
        list(example_batch[second_key]),
        max_length=max_length,
        # Explicit replacements for the deprecated `pad_to_max_length=True`
        # and for the implicit 'longest_first' truncation the tokenizer
        # warned about.
        padding="max_length",
        truncation=True,
    )
    features["labels"] = example_batch["label"]
    return features


def convert_to_snli_features(example_batch):
    """Encode each example as the pair (premise, hypothesis)."""
    return _encode_sentence_pairs(example_batch, "premise", "hypothesis")


def convert_to_snli_inv_features(example_batch):
    """Encode each example as the pair (hypothesis, premise).

    NOTE(review): the snli_inv train/validation splits appear to store the
    two sentences already swapped, while the snli_inv test split is the
    unswapped SNLI test set; confirm that encoding (hypothesis, premise)
    yields the intended sentence order for every split.
    """
    return _encode_sentence_pairs(example_batch, "hypothesis", "premise")


# Task name -> function passed to Dataset.map for that task.
convert_func_dict = {
    "snli": convert_to_snli_features,
    "snli_inv": convert_to_snli_inv_features,
}

In [None]:

snli_inv

DatasetDict({
    train: Dataset({
        features: ['hypothesis', 'premise', 'label'],
        num_rows: 36776
    })
    validation: Dataset({
        features: ['hypothesis', 'premise', 'label'],
        num_rows: 9986
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
})

In [None]:
# Columns exposed as torch tensors for each task.
columns_dict = {
    "snli": ['input_ids', 'attention_mask', 'labels'],
    "snli_inv": ['input_ids', 'attention_mask', 'labels'],
}

# features_dict[task][phase] holds the tokenized dataset for that task/split.
features_dict = {}
for task_name, dataset in dataset_dict.items():
    task_features = {}
    features_dict[task_name] = task_features
    for phase, phase_dataset in dataset.items():
        # Always recompute the tokenization instead of reading a cached result.
        encoded = phase_dataset.map(
            convert_func_dict[task_name],
            batched=True,
            load_from_cache_file=False,
        )
        task_features[phase] = encoded
        # Row counts are printed before and after set_format.
        print(task_name, phase, len(phase_dataset), len(encoded))
        encoded.set_format(
            type="torch",
            columns=columns_dict[task_name],
        )
        print(task_name, phase, len(phase_dataset), len(encoded))

Map:   0%|          | 0/36776 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


snli train 36776 36776
snli train 36776 36776


Map:   0%|          | 0/9986 [00:00<?, ? examples/s]

snli validation 9986 9986
snli validation 9986 9986


Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

snli test 9824 9824
snli test 9824 9824


Map:   0%|          | 0/36776 [00:00<?, ? examples/s]

snli_inv train 36776 36776
snli_inv train 36776 36776


Map:   0%|          | 0/9986 [00:00<?, ? examples/s]

snli_inv validation 9986 9986
snli_inv validation 9986 9986


Map:   0%|          | 0/9824 [00:00<?, ? examples/s]

snli_inv test 9824 9824
snli_inv test 9824 9824


## Preparing a multi-task data loader and Trainer


In [None]:
from transformers.data.data_collator import DataCollator, InputDataClass
from typing import List, Union, Dict

In [None]:
import dataclasses
from torch.utils.data.dataloader import DataLoader
from transformers.training_args import is_torch_tpu_available
#from transformers.trainer import _get_train_sampler

from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler





class StrIgnoreDevice(str):
    """
    A str subclass with a no-op `.to(device)` method.

    This is a hack: the Trainer calls `.to(device)` on every value of an
    input batch, and the batch also carries the `task_name` string. Giving
    the string a `.to` method keeps that call from raising an error.
    """

    def to(self, device):
        """Ignore `device` and hand back this very string object."""
        return self


class DataLoaderWithTaskname:
    """
    Thin wrapper that tags every batch of a DataLoader with a task name.
    """

    def __init__(self, task_name, data_loader):
        self.task_name, self.data_loader = task_name, data_loader

        # Mirror the attributes of the wrapped loader that callers read
        # directly from a data loader.
        self.batch_size, self.dataset = data_loader.batch_size, data_loader.dataset

    def __len__(self):
        """Number of batches of the wrapped loader."""
        wrapped = self.data_loader
        return len(wrapped)

    def __iter__(self):
        """Yield the wrapped loader's batches with a "task_name" entry added."""
        for batch in self.data_loader:
            # The batch object itself is modified and passed on.
            batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield batch


class MultitaskDataloader:
    """
    Iterable that interleaves the batches of several single-task data
    loaders in random order.
    """

    def __init__(self, dataloader_dict):
        self.dataloader_dict = dataloader_dict

        # Number of batches contributed by each task.
        self.num_batches_dict = {}
        for task_name, dataloader in self.dataloader_dict.items():
            self.num_batches_dict[task_name] = len(dataloader)

        self.task_name_list = list(self.dataloader_dict)

        # Placeholder list whose only meaningful property is its length:
        # the total number of examples over all tasks.
        total_examples = sum(
            len(dataloader.dataset)
            for dataloader in self.dataloader_dict.values()
        )
        self.dataset = [None] * total_examples

    def __len__(self):
        """Total number of batches over all tasks."""
        return sum(self.num_batches_dict.values())

    def __iter__(self):
        """
        Yield one batch at a time, each taken from a randomly chosen task.

        Every task index appears in the schedule once per batch of that
        task, and the schedule is shuffled with numpy's global RNG, so
        tasks are sampled proportionally to their size and each task loader
        is consumed exactly once per pass.
        """
        schedule = np.array([
            task_index
            for task_index, task_name in enumerate(self.task_name_list)
            for _ in range(self.num_batches_dict[task_name])
        ])
        np.random.shuffle(schedule)

        iterators = {
            task_name: iter(dataloader)
            for task_name, dataloader in self.dataloader_dict.items()
        }
        for task_index in schedule:
            chosen_task = self.task_name_list[task_index]
            yield next(iterators[chosen_task])

class MultitaskTrainer(transformers.Trainer):
    """Trainer whose training data loader mixes batches from several tasks."""

    def get_single_train_dataloader(self, task_name, train_dataset):
        """
        Build the loader for one task: randomly sampled batches of
        `train_dataset`, each tagged with `task_name`.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_sampler = RandomSampler(train_dataset)
        # No collate_fn is passed, so the DataLoader's default collation is used.
        task_loader = DataLoader(
            train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
        )
        return DataLoaderWithTaskname(task_name=task_name, data_loader=task_loader)

    def get_train_dataloader(self):
        """
        Return a MultitaskDataloader over one loader per entry of
        `self.train_dataset` (a task name -> dataset mapping). The result is
        not a real DataLoader but an iterable that samples batches from the
        per-task loaders.
        """
        per_task_loaders = {}
        for task_name, task_dataset in self.train_dataset.items():
            per_task_loaders[task_name] = self.get_single_train_dataloader(
                task_name, task_dataset
            )
        return MultitaskDataloader(per_task_loaders)

## Training part



In [None]:
from transformers import DefaultDataCollator

In [None]:
# Task name -> tokenized training split.
train_dataset = {
    task_name: dataset["train"]
    for task_name, dataset in features_dict.items()
}

training_args = transformers.TrainingArguments(
    output_dir="/content/gdrive/MyDrive/Colab Notebooks",
    overwrite_output_dir=True,
    learning_rate=1e-5,
    do_train=True,
    num_train_epochs=1,
    # Adjust batch size if this doesn't fit on the Colab GPU
    per_device_train_batch_size=16,
    save_steps=3000,
)

trainer = MultitaskTrainer(
    model=multitask_model,
    args=training_args,
    data_collator=DefaultDataCollator(),
    train_dataset=train_dataset,
)
trainer.train()


In [None]:
# Serializes the entire Trainer object (not only the model weights) to Drive;
# the next cell reloads it as `trainer2`.
torch.save(trainer, "/content/gdrive/MyDrive/Colab Notebooks/model_full_probas.pth")

In [None]:
# The file holds a pickled Trainer object rather than a plain state dict, so
# it needs full unpickling: newer PyTorch releases default to
# weights_only=True, which refuses to load such objects.
# Full unpickling can execute arbitrary code -- only load files you created.
trainer2= torch.load(
    "/content/gdrive/MyDrive/Colab Notebooks/model_full_probas.pth",
    weights_only=False,
)

In [None]:
# Batch size used for every test loader (and reported to the prediction loop).
test_batch_size = 16

# Task name -> output of the prediction loop on that task's test split.
preds_dict = {}
for task_name, dataset in features_dict.items():
  # Single-task MultitaskDataloader, so each batch carries its "task_name".
  test_dataset = {
    task_name: DataLoaderWithTaskname(
        task_name,
        data_loader=DataLoader(dataset["test"], batch_size=test_batch_size)
    )
  }
  test_dataloader = MultitaskDataloader(test_dataset)
  # Hack: something is expecting the dataloader to have a batch size, which is available in the nested dataloader
  test_dataloader.batch_size = test_batch_size

  preds_dict[task_name] = trainer2.prediction_loop(
      test_dataloader,
      # Label the run with the task actually being evaluated (this used to be
      # hard-coded to "snli" for both tasks).
      description=f"Test: {task_name}"
  )

Not all data has been set. Are you sure you passed all values?
Not all data has been set. Are you sure you passed all values?


In [None]:
snli_inv

DatasetDict({
    train: Dataset({
        features: ['hypothesis', 'premise', 'label'],
        num_rows: 36776
    })
    validation: Dataset({
        features: ['hypothesis', 'premise', 'label'],
        num_rows: 9986
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
})

In [None]:
# Persist the per-task test predictions (the whole `preds_dict`) to Drive.
torch.save(preds_dict,"/content/gdrive/MyDrive/Colab Notebooks/test_preds_multimodel_full_probas")