In [4]:
# train pretrained RoBERTa for sequence classification, NLI
# SNLI, MNLI, ANLI datasets for training
# code ref: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py

In [5]:
from pathlib import Path

PRETRAINED_MODEL_PATH = str(Path('.checkpoints') / 'snli' / 'save')
MODEL_CACHE_DIR = str(Path('.model'))
DATASET_CACHE_DIR = str(Path('.datasets'))
TRAINER_OUTPUT_DIR = str(Path('.checkpoints') / 'mnli')

In [6]:
from datasets import load_dataset

mnli = load_dataset('nyu-mll/multi_nli', cache_dir=DATASET_CACHE_DIR)

In [7]:
import random
from math import ceil
from typing import Any, Dict, List

from datasets import Dataset, concatenate_datasets
from tqdm.contrib import tenumerate
from transformers import PreTrainedTokenizer


def binarize_labels(
      dataset: Dataset
    , labels_to_pos: List[Any]
    , labels_to_neg: List[Any]
    , pos_label: int = 1
    , neg_label: int = 0
    , sample_seed: int = 42
    , shuffle_seed: int = 42
) -> Dataset:
  
    assert 'label' in dataset.features
    assert set(labels_to_pos).isdisjoint(labels_to_neg)
    random.seed(sample_seed)

    pos_label2indices: Dict[Any, List] = {}
    neg_label2indices: Dict[Any, List] = {}
    for index, label in tenumerate(dataset['label']):
        if label in labels_to_pos:
            pos_label2indices.setdefault(label, []) \
                             .append(index)
        if label in labels_to_neg:
            neg_label2indices.setdefault(label, []) \
                             .append(index)
 
    pos_num = sum(len(indices) for indices in pos_label2indices.values())
    neg_num = sum(len(indices) for indices in neg_label2indices.values())
    sample_ratio = min(pos_num, neg_num) / max(pos_num, neg_num)

    if pos_num < neg_num:
        for label, indices in neg_label2indices.items():
            sample_size = ceil(sample_ratio * len(indices))
            neg_label2indices[label] = random.sample(indices, sample_size)
    else:
        for label, indices in pos_label2indices.items():
            sample_size = ceil(sample_ratio * len(indices))
            pos_label2indices[label] = random.sample(indices, sample_size)

    def _map_labels_to_pos(batch):
        batch['label'] = [pos_label for _ in range(len(batch['label']))]
        return batch
    
    def _map_labels_to_neg(batch):
        batch['label'] = [neg_label for _ in range(len(batch['label']))]
        return batch

    dataset_balanced_binarized = concatenate_datasets(
              [dataset.select(indices)
                      .map(_map_labels_to_pos, batched=True, num_proc=4) 
               for indices in pos_label2indices.values()] 
            + [dataset.select(indices)
                      .map(_map_labels_to_neg, batched=True, num_proc=4) 
               for indices in neg_label2indices.values()]
        )

    return dataset_balanced_binarized.shuffle(seed=shuffle_seed)


def tokenize_premises_and_hypotheses(
      batch: Dict[str, List]
    , tokenizer: PreTrainedTokenizer
):
    # assumes all labels in the batch are available in `label_to_id`

    return tokenizer(
          text=batch['premise']
        , text_pair=batch['hypothesis']
        , truncation=True
        , max_length=tokenizer.model_max_length
        , padding=False                          # pad later dynamically with collator
        , return_attention_mask=True
        , return_token_type_ids=True
    )

In [8]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

# make sure to `entailment` is the SECOND for positive class
label_list = [ 'not_entailment', 'entailment' ]
label_to_id = { v: i for i, v in enumerate(label_list) }
id_to_label = { v: k for k, v in label_to_id.items() }

config = AutoConfig.from_pretrained(
      pretrained_model_name_or_path=PRETRAINED_MODEL_PATH
    , num_labels=len(label_list)
    , finetuning_task='text-classification'
    , cache_dir=MODEL_CACHE_DIR
    , revision='main'
)

tokenizer = AutoTokenizer.from_pretrained(
      pretrained_model_name_or_path=PRETRAINED_MODEL_PATH
    , cache_dir=MODEL_CACHE_DIR
    , revision='main'
    , use_fast_tokenizer=True
)

model = AutoModelForSequenceClassification.from_pretrained(
      pretrained_model_name_or_path=PRETRAINED_MODEL_PATH
    , config=config
    , cache_dir=MODEL_CACHE_DIR
    , revision='main'
)
model.config.label2id = label_to_id
model.config.id2label = id_to_label

In [9]:
mnli_labels_to_pos = [0]     # `entailment` 
mnli_labels_to_neg = [1, 2]  # `neutral`, `contradiction` 

mnli_train = binarize_labels(
              mnli['train']
            , labels_to_pos=mnli_labels_to_pos
            , labels_to_neg=mnli_labels_to_neg
            , pos_label=label_to_id['entailment']
            , neg_label=label_to_id['not_entailment']
      ) \
      .map(
              lambda batch: tokenize_premises_and_hypotheses(batch, tokenizer)
            , batched=True
            , num_proc=4
      )

mnli_eval = binarize_labels(
              mnli['validation_matched']
            , labels_to_pos=mnli_labels_to_pos
            , labels_to_neg=mnli_labels_to_neg
            , pos_label=label_to_id['entailment']
            , neg_label=label_to_id['not_entailment']
      ) \
      .map(
              lambda batch: tokenize_premises_and_hypotheses(batch, tokenizer)
            , batched=True
            , num_proc=4
      )

mnli_eval = concatenate_datasets([
            binarize_labels(
                    mnli['validation_matched']
                  , labels_to_pos=mnli_labels_to_pos
                  , labels_to_neg=mnli_labels_to_neg
                  , pos_label=label_to_id['entailment']
                  , neg_label=label_to_id['not_entailment']
            ) \
            .map(
                  lambda batch: tokenize_premises_and_hypotheses(batch, tokenizer)
                  , batched=True
                  , num_proc=4
            ),
            mnli_eval
      ])

  0%|          | 0/392702 [00:00<?, ?it/s]

Map (num_proc=4):   0%|          | 0/130899 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/65449 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/65451 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/261799 [00:00<?, ? examples/s]

  0%|          | 0/9815 [00:00<?, ?it/s]

Map (num_proc=4):   0%|          | 0/3479 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1715 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1765 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6959 [00:00<?, ? examples/s]

  0%|          | 0/9815 [00:00<?, ?it/s]

Map (num_proc=4):   0%|          | 0/6959 [00:00<?, ? examples/s]

In [10]:
# check dataset balance

from collections import Counter
print(Counter(mnli_train['label']))
print(Counter(mnli_eval['label']))

Counter({0: 130900, 1: 130899})
Counter({0: 6960, 1: 6958})


In [11]:
import evaluate
import numpy
import torch
from transformers import EvalPrediction, Trainer, TrainingArguments


use_mixed_precision = True and torch.cuda.is_available()
print(f'Using mixed precision: {use_mixed_precision}')

training_args = TrainingArguments(
          output_dir=TRAINER_OUTPUT_DIR
        , overwrite_output_dir=True         # to overwrite the output directory
        , do_train=True
        , do_eval=True
        , eval_strategy='epoch'             # to evaluate every epoch
        , save_strategy='epoch'             # to save the model every epoch
        , learning_rate=1e-5                # equivalent to DocNLI
        , num_train_epochs=10.0             # equivalent to 2 * DocNLI
        , per_device_train_batch_size=16
        , gradient_accumulation_steps=1     # batch_size ~ this * per_device_train_epoch_batch_size
        , per_device_eval_batch_size=16
        , fp16=use_mixed_precision          # to use mixed precision training
    )

metrics = evaluate.combine([
          evaluate.load('accuracy')
        , evaluate.load('precision')
        , evaluate.load('recall')
        , evaluate.load('f1')
    ])

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else \
            p.predictions
    preds = numpy.argmax(preds, axis=1)
    result = metrics.compute(predictions=preds, references=p.label_ids)
    return result

Using mixed precision: True


In [12]:
from transformers.data import DataCollatorWithPadding

data_collator = None
if training_args.fp16:
    data_collator = DataCollatorWithPadding(tokenizer)

mnli_trainer = Trainer(
          model=model
        , args=training_args
        , train_dataset=mnli_train
        , eval_dataset=mnli_eval
        , compute_metrics=compute_metrics
        , processing_class=tokenizer
        , data_collator=data_collator
    )

In [13]:
import os
import torch

try:
    train_result = mnli_trainer.train(resume_from_checkpoint=None)
    mnli_trainer.save_model(output_dir=os.path.join(TRAINER_OUTPUT_DIR, 'save'))
    mnli_trainer.save_metrics('train', train_result.metrics)

except KeyboardInterrupt:
    # HACK: when you interrrpt the training, GPU may not be initialized properly
    del model
    del mnli_trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    raise KeyboardInterrupt('Training interrupted by user.')

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3141,0.255432,0.900345,0.918306,0.878844,0.898142
2,0.2725,0.268533,0.900632,0.938907,0.856999,0.896085
3,0.2423,0.271467,0.913637,0.927765,0.897097,0.912173
4,0.2122,0.333555,0.910763,0.937806,0.879851,0.907904
5,0.1564,0.313831,0.911984,0.934121,0.886462,0.909667
6,0.1426,0.41509,0.918666,0.933611,0.901408,0.917227
7,0.1236,0.45624,0.911194,0.936527,0.88215,0.908526
8,0.0891,0.422946,0.916655,0.936202,0.894222,0.914731
9,0.0885,0.507304,0.915074,0.937046,0.889911,0.91287
10,0.0672,0.519323,0.916655,0.931784,0.899109,0.915155
