In [1]:
#We will be using packages: transformers, torch, nlp, datasets
#!pip install accelerate==0.3.0 transformers==4.10.3 nlp==0.4.0 datasets==1.9.0 torch==1.13.0
!pip install accelerate transformers nlp datasets torch==1.13.0

Defaulting to user installation because normal site-packages is not writeable


In [18]:
import json
import pandas as pd
import numpy as np
import transformers
import datasets
from datasets import load_dataset, dataset_dict
import nlp
import dataclasses
import matplotlib.pyplot as plt
import seaborn as sns
import transformers

In [19]:
#torch 
import torch
import torch.nn  as nn
from torch.utils.data.dataloader import DataLoader
#from transformers.training_args import is_tpu_available
#from transformers.trainer.file_utils import  is_torch_tpu_available as is_tpu_available
#from transformers.trainer import get_tpu_sampler
from transformers import DataCollator
from transformers.data.data_collator import InputDataClass
#from transformers.data.data_collator import DataCollator, InputDataClass
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
import torch_xla
#import torch_xla.debug.profiler as xp
#import torch_xla.utils.keyd_queue as kq
#import torch_xla.utils.utils as xu
#import torch_xla.core.xla_model as xm
from typing import List, Union, Dict


In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
#Loading datasets 
dataset_dict = {
    "mnli": datasets.load_dataset('glue', 'mnli'),
    "commonsense_qa": datasets.load_dataset('commonsense_qa',name='commonsense_qa'),
    "stsb_multi_mt": datasets.load_dataset("stsb_multi_mt",name="en")
}

Reusing dataset glue (/home/rjansevanrensburg/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Using custom data configuration commonsense_qa
Reusing dataset commonsense_qa (/home/rjansevanrensburg/.cache/huggingface/datasets/commonsense_qa/commonsense_qa/0.1.0/1ca2d7b680c5bd93c0dc85f9cb65c0c8817e759ff82e405b28de54e83efa80f7)
Reusing dataset stsb_multi_mt (/home/rjansevanrensburg/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/bc6de0eaa8d97c28a4c22a07e851b05879ae62c60b0b69dd6b331339e8020f07)


In [4]:
for key, value in  dataset_dict.items():
    print("task:", key)
    print(dataset_dict[key]['train'][2])
    print("\n")

task: mnli
{'premise': 'One of our number will carry out your instructions minutely.', 'hypothesis': 'A member of my team will execute your orders with immense precision.', 'label': 0, 'idx': 2}


task: commonsense_qa
{'answerKey': 'A', 'question': 'To locate a choker not located in a jewelry box or boutique where would you go?', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['jewelry store', 'neck', 'jewlery box', 'jewelry box', 'boutique']}}


task: stsb_multi_mt
{'sentence1': 'A man is spreading shreded cheese on a pizza.', 'sentence2': 'A man is spreading shredded cheese on an uncooked pizza.', 'similarity_score': 3.799999952316284}




In [5]:
class MultitaskModel(transformers.PreTrainedModel):

    def __init__(self, encoder, taskmodels_dict):
        
        """
        Setting MultitaskModel up as a PretrainedModel allows us
        to take better advantage of Trainer features
        """
        super().__init__(transformers.PretrainedConfig())

        self.encoder = encoder
        self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)
    
    @classmethod
    def create_model(cls, model_name, model_type_dict, model_config_dict):
        """
        This creates a MultitaskModel using the model class and config objects
        from single-task models. 

        We do this by creating each single-task model, and having them share
        the same encoder transformer.
        """
        shared_encoder = None
        task_models_dict = {}
        for task, model_type in model_type_dict.items():
            print(task)
            print(model_type)
            model = model_type.from_pretrained(
                model_name, 
                config=model_config_dict[task],
            )
            if shared_encoder is None:
                shared_encoder = getattr(model, cls.get_encoder_attr_name(model))
            else:
                setattr(model, cls.get_encoder_attr_name(model), shared_encoder)
            task_models_dict[task] = model
        return cls(encoder=shared_encoder, taskmodels_dict=task_models_dict)
    
        
    @classmethod              
    def get_encoder_attr_name(cls, model):
        
        
        """
        Each encoder has its attributes according to model architecture: BERT, Roberta,Alberta 
        This function gets attribute of the encoder.
        """
        model_class_name = model.__class__.__name__
        if model_class_name.startswith('Bert'):
            return 'bert'
        if model_class_name.startswith('Roberta'):
            return 'roberta'
        if model_class_name.startswith('Albert'):
            return 'albert'
        if model_class_name.startswith():
            return 
            
        else:
                raise KeyError(f"Add support for new model {model_class_name}")
                
        
    def forward(self, task, **kwargs):
        return self.taskmodels_dict[task](**kwargs)

In [16]:
model_name = 'bert-base-uncased'
multitask_model=MultitaskModel.create_model(model_name=model_name, 
         model_type_dict={
                 "mnli" : transformers.AutoModelForSequenceClassification,
                 "commonsense_qa": transformers.AutoModelForMultipleChoice,
                 "stsb_multi_mt"          : transformers.AutoModelForSequenceClassification
         },
         model_config_dict={
                 "mnli" : transformers.AutoConfig.from_pretrained(model_name,num_labels=1),
                 "commonsense_qa": transformers.AutoConfig.from_pretrained(model_name),
                 "stsb_multi_mt"          : transformers.AutoConfig.from_pretrained(model_name,num_labels=1)
             
         }                                   
            )

mnli
<class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

commonsense_qa
<class 'transformers.models.auto.modeling_auto.AutoModelForMultipleChoice'>


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

stsb_multi_mt
<class 'transformers.models.auto.modeling_auto.AutoModelForSequenceClassification'>


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
tokenizer=transformers.AutoTokenizer.from_pretrained(model_name)

In [8]:
max_length=256
def convert_to_mnli_inputs(example_batch):
    
    inputs=list(zip(example_batch['premise'],example_batch['hypothesis']))
    features=tokenizer.batch_encode_plus(inputs, max_length=max_length, 
                                         pad_to_max_length=True,truncation=True)
    features['labels']=example_batch['label']
    return features

def convert_to_stsb_multi_mt_inputs(example_batch):
    
    inputs=list(zip(example_batch['sentence1'],example_batch['sentence2']))
    features=tokenizer.batch_encode_plus(inputs, max_length=max_length, 
                                         pad_to_max_length=True,truncation=True)
    features['labels']=example_batch['similarity_score']
    return features

def convert_to_commonsense_qa_inputs(example_batch):
    
    number_examples=len(example_batch['question'])
    number_choices=len(example_batch['choices'][0]['text'])
    features={}
    for example in range(number_examples):
        choices=tokenizer.batch_encode_plus(
            list(zip([example_batch['question'][example]]*number_choices,
                        example_batch['choices'][example]['text'])),
            max_length=max_length, pad_to_max_length=True ,truncation=True
                    )
        for k,v in choices.items():
            if k not in  features:
                features[k]=[]
            features[k].append(v)
    labels2id={char: x for x, char in enumerate('ABCDE')}
    if example_batch['answerKey'][0]:
        features['labels']= [labels2id[ans] for ans in example_batch['answerKey']]
    else:
        features['labels']=[0]*number_examples
    return features

#Construct the featurized input data
featurized_funct_dict={
                 "mnli"          : convert_to_mnli_inputs,
                 "commonsense_qa": convert_to_commonsense_qa_inputs,
                 "stsb_multi_mt" : convert_to_stsb_multi_mt_inputs
}

column_dict={
                 "mnli"          : ['input_ids', 'attention_mask','labels'],
                 "commonsense_qa": ['input_ids', 'attention_mask','labels'],
                 "stsb_multi_mt" : ['input_ids', 'attention_mask','labels']
}
#Featurizing datasets
features_dict={}
for  task, dataset in dataset_dict.items():
    print("--------------task---------:",task)
    features_dict[task]={}
    for phase, phase_dataset in dataset.items():
       
        features_dict[task][phase]=phase_dataset.map(featurized_funct_dict[task],
                                                     batched=True, 
                                                     load_from_cache_file=False)
        print(task, phase, len(phase_dataset), len(features_dict[task][phase]))
        features_dict[task][phase].set_format(
            type='torch',
            columns=column_dict[task]
        )
        print(task, phase, len(phase_dataset), len(features_dict[task][phase]))



--------------task---------: mnli


  0%|          | 0/393 [00:00<?, ?ba/s]



mnli train 392702 392702
mnli train 392702 392702


  0%|          | 0/10 [00:00<?, ?ba/s]

mnli validation_matched 9815 9815
mnli validation_matched 9815 9815


  0%|          | 0/10 [00:00<?, ?ba/s]

mnli validation_mismatched 9832 9832
mnli validation_mismatched 9832 9832


  0%|          | 0/10 [00:00<?, ?ba/s]

mnli test_matched 9796 9796
mnli test_matched 9796 9796


  0%|          | 0/10 [00:00<?, ?ba/s]

mnli test_mismatched 9847 9847
mnli test_mismatched 9847 9847
--------------task---------: commonsense_qa


  0%|          | 0/10 [00:00<?, ?ba/s]

commonsense_qa train 9741 9741
commonsense_qa train 9741 9741


  0%|          | 0/2 [00:00<?, ?ba/s]

commonsense_qa validation 1221 1221
commonsense_qa validation 1221 1221


  0%|          | 0/2 [00:00<?, ?ba/s]

commonsense_qa test 1140 1140
commonsense_qa test 1140 1140
--------------task---------: stsb_multi_mt


  0%|          | 0/6 [00:00<?, ?ba/s]

stsb_multi_mt train 5749 5749
stsb_multi_mt train 5749 5749


  0%|          | 0/2 [00:00<?, ?ba/s]

stsb_multi_mt test 1379 1379
stsb_multi_mt test 1379 1379


  0%|          | 0/2 [00:00<?, ?ba/s]

stsb_multi_mt dev 1500 1500
stsb_multi_mt dev 1500 1500


In [9]:
class StrIgnoreDevice(str):
    """
    This is a hack. The Trainer is going call .to(device) on every input
    value, but we need to pass in an additional `task_name` string.
    This prevents it from throwing an error
    """
    def to(self, device):
        return self        

In [41]:
class NLPDataCollator:#(DataCollator):
    def collate_batch(self, features:List[Union[InputDataClass,Dict]]) -> Dict[str,torch.Tensor]:
        first=features[0]
        if isinstance(first,dict):
            
            #featurized dataset are in the form of list of dictionaries
            #Adapt the DataCollator to have a list of dictionary
            if "labels" in first and first["labels"] is not None:
                if first["labels"].dtype == torch.int64:
                    labels = torch.tensor(
                        [f["labels"] for f in features], dtype=torch.float
                    )
                else:
                    labels = torch.tensor(
                        [f["labels"] for f in features], dtype=torch.float
                    )
                batch = {"labels": labels}
            for k, v in first.items():
                if k != "labels" and v is not None and not isinstance(v, str):
                    batch[k] = torch.stack([f[k] for f in features])
            return batch
        else:
            # otherwise, revert to using the default collate_batch
            return DefaultDataCollator().collate_batch(features)
        
    def __call__(self, features:List[Union[InputDataClass,Dict]]) -> Dict[str,torch.Tensor]:
        first=features[0]
        if isinstance(first,dict):
            
            #featurized dataset are in the form of list of dictionaries
            #Adapt the DataCollator to have a list of dictionary
            if "labels" in first and first["labels"] is not None:
                if first["labels"].dtype == torch.int64:
                    labels = torch.tensor(
                        [f["labels"] for f in features], dtype=torch.float
                    )
                else:
                    labels = torch.tensor(
                        [f["labels"] for f in features], dtype=torch.float
                    )
                batch = {"labels": labels}
            for k, v in first.items():
                if k != "labels" and v is not None and not isinstance(v, str):
                    batch[k] = torch.stack([f[k] for f in features])
            return batch
        else:
            # otherwise, revert to using the default collate_batch
            return DefaultDataCollator().collate_batch(features)

                    


In [37]:
#Class to load data with its task name. Decorator for changing Dataloader function to use a task name
class DataLoaderTaskname:
    def __init__(self, task, data_loader):
        self.task=task
        self.data_loader=data_loader
        self.batch_size=data_loader.batch_size
        self.dataset=data_loader.dataset
        
    def __len__(self):
        return len(self.data_loader)
        
    def __iter__(self):
        for batch in self.data_loader:
            batch["task"]=StrIgnoreDevice(self.task)
            yield batch

In [38]:
#Class to combine several data loaders into a single "data loader" 
class MultitaskDataLoader:
    
    def __init__(self, dataloader_dict):
        self.dataloader_dict =dataloader_dict
        self.num_batches_dict= {task:len(dataloader) for task, dataloader in self.dataloader_dict.items()}
        self.task_lst        =list(self.dataloader_dict)
        self.dataset = [None] * sum(len(dataloader.dataset) for dataloader in self.dataloader_dict.values())
    def __len__(self):
        return sum(self.num_batches_dict.values())
    
    def __iter__(self):
        """
        For each batch, sample a task, and get a batch from the respective task Dataloader.

        We use size-proportional sampling, but you could easily modify this
        to sample from some-other distribution.
        """
        task_choice_list = []
        for i, task in enumerate(self.task_lst):
            task_choice_list += [i] * self.num_batches_dict[task]
        task_choice_list = np.array(task_choice_list)
        np.random.shuffle(task_choice_list)
        dataloader_iter_dict = {
            task: iter(dataloader) 
            for task, dataloader in self.dataloader_dict.items()
        }
        for task_choice in task_choice_list:
            task = self.task_lst[task_choice]
            yield next(dataloader_iter_dict[task]) 

In [39]:
#class to set up the trainer 
class MultitaskTrainer(transformers.Trainer):
    
    def single_task_dataloader(self,task,train_dataset):
        """
        returns the single task data loader of a given task 
        """
        if self.train_dataset is None:
            raise ValueError("Trainer needs a dataset...:(")
        #if torch.cuda.is_available():#is_tpu_available():
            #train_sampler=get_tpu_sampler(train_dataset)
        else:
            train_sampler=(RandomSampler(train_dataset)  if self.args.local_rank== -1 
                                                            else DistributedSampler(train_dataset))
    
        data_loader=DataLoaderTaskname(task=task,data_loader=DataLoader(train_dataset,
                                                                       batch_size=self.args.train_batch_size,
                                                                       sampler=train_sampler,
                                                                       collate_fn=self.data_collator.collate_batch
                                                                      ))   
        #if torch.cuda.is_available():#is_tpu_available():
            #data_loader=pl.ParallelLoader(
                #data_loader, [self.args.device]
            #).per_device_loader(self.args.device)
        return data_loader
    
    
    def get_train_dataloader(self):
        """
        Returns a MultitaskDataLoader, which is not actually a Dataloader
        but an iterable that returns a generator that samples from each 
        task Dataloader
        """
        return MultitaskDataLoader({
            task: self.single_task_dataloader(task, task_dataset)
            for task, task_dataset in self.train_dataset.items()
        })

In [42]:
train_dataset = {
    task: dataset["train"] 
    for task, dataset in features_dict.items()
}
trainer = MultitaskTrainer(
    model=multitask_model,
    args=transformers.TrainingArguments(
        output_dir="./models/multitask_model",
        overwrite_output_dir=True,
        learning_rate=1e-5,
        do_train=True,
        num_train_epochs=2,
        # Adjust batch size if this doesn't fit on the Colab GPU
        per_device_train_batch_size=8,  
        save_steps=3000,
    ),
    data_collator=NLPDataCollator(),
    train_dataset=train_dataset,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 408192
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 102050


Step,Training Loss


TypeError: expected Tensor as element 0 in argument 0, but got list