In [4]:
# Fine-tune a pretrained RoBERTa model for sequence classification on NLI (natural language inference)
# Training data: the SNLI, MNLI, and ANLI datasets
# code ref: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py

In [5]:
# Name handed to `from_pretrained` for the config, model, and tokenizer
PRETRAINED_MODEL_NAME = 'roberta-large'
# Passed as `cache_dir` when the datasets are loaded
DATASET_CACHE_DIR = '.datasets/'
# Passed as `output_dir` to the training arguments
TRAINER_OUTPUT_DIR = '.checkpoints/'
# Misspelled alias retained so existing references keep working
TRAINER_OUTPUR_DIR = TRAINER_OUTPUT_DIR

In [3]:
from datasets import load_dataset

# Load the three NLI corpora, handing DATASET_CACHE_DIR to each call as `cache_dir`
snli, mnli, anli = (
    load_dataset(repo_id, cache_dir=DATASET_CACHE_DIR)
    for repo_id in ('stanfordnlp/snli', 'nyu-mll/multi_nli', 'facebook/anli')
)

In [None]:
from typing import Any, Dict, List
from transformers import PreTrainedTokenizer

def tokenize_premises_and_hypotheses(
    batch: Dict[str, List],
    tokenizer: PreTrainedTokenizer,
    label_to_id: Dict[Any, int],
):
    """Encode premise/hypothesis pairs and remap their labels.

    Args:
        batch: Columnar batch holding 'premise', 'hypothesis' and 'label' lists.
        tokenizer: Tokenizer that is called on the two text columns; its
            `model_max_length` is used as the truncation/padding length.
        label_to_id: Lookup from the batch's label values to target ids.
            Every label in the batch is assumed to be present.

    Returns:
        The object returned by the tokenizer, with a 'label' entry holding
        the remapped ids added to it.

    Raises:
        KeyError: If a label in the batch has no entry in `label_to_id`.
    """
    # Every pair is truncated and padded to the tokenizer's maximum length
    encoded = tokenizer(
        batch['premise'],
        batch['hypothesis'],
        truncation=True,
        max_length=tokenizer.model_max_length,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=True,
    )
    encoded['label'] = list(map(label_to_id.__getitem__, batch['label']))
    return encoded

def are_labels_available(
      batch: Dict[str, List]
    , label_to_id: Dict[Any, int]
) -> List[bool]:
    """Flag which examples in a batch carry a label known to `label_to_id`.

    Args:
        batch: Columnar batch holding a 'label' list.
        label_to_id: Lookup whose keys are the labels considered available.

    Returns:
        One boolean per entry of `batch['label']`: True when that label is a
        key of `label_to_id`, False otherwise.
    """
    # Test membership directly instead of comparing a looked-up value against
    # a -1 sentinel, so a label that legitimately maps to -1 still counts as
    # available.
    return [label in label_to_id for label in batch['label']]

In [5]:
from transformers import RobertaConfig, RobertaForSequenceClassification

# Keep `entailment` in the SECOND position so that it is the positive class
label_list = ['not_entailment', 'entailment']
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

# Classification config sized to the label list
config = RobertaConfig.from_pretrained(
    pretrained_model_name_or_path=PRETRAINED_MODEL_NAME,
    num_labels=len(label_list),
    finetuning_task='text-classification',
    problem_type='single_label_classification',
)

model = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=PRETRAINED_MODEL_NAME,
    config=config,
)
# Store both label lookups on the model's config
model.config.label2id = label_to_id
model.config.id2label = id_to_label

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path=PRETRAINED_MODEL_NAME,
)

# Per-dataset lookup from the dataset's own label id to the binary target id:
# id 0 becomes `entailment`, ids 1 and 2 both become `not_entailment`
snli_label_to_id = {
    0: label_to_id['entailment'],
    1: label_to_id['not_entailment'],
    2: label_to_id['not_entailment'],
}
mnli_label_to_id = {
    0: label_to_id['entailment'],
    1: label_to_id['not_entailment'],
    2: label_to_id['not_entailment'],
}
anli_label_to_id = {
    0: label_to_id['entailment'],
    1: label_to_id['not_entailment'],
    2: label_to_id['not_entailment'],
}


# Batch filters: keep only the examples whose label has a mapping
def snli_filter(batch):
    return are_labels_available(batch, snli_label_to_id)


def mnli_filter(batch):
    return are_labels_available(batch, mnli_label_to_id)


def anli_filter(batch):
    return are_labels_available(batch, anli_label_to_id)


# Batch tokenizers: encode the text pairs and remap the labels
def snli_tokenize(batch):
    return tokenize_premises_and_hypotheses(batch, tokenizer, snli_label_to_id)


def mnli_tokenize(batch):
    return tokenize_premises_and_hypotheses(batch, tokenizer, mnli_label_to_id)


def anli_tokenize(batch):
    return tokenize_premises_and_hypotheses(batch, tokenizer, anli_label_to_id)


# Filter first, then tokenize with four worker processes
snli_tokenized = (
    snli
    .filter(snli_filter, batched=True)
    .map(snli_tokenize, batched=True, num_proc=4)
)
mnli_tokenized = (
    mnli
    .filter(mnli_filter, batched=True)
    .map(mnli_tokenize, batched=True, num_proc=4)
)
anli_tokenized = (
    anli
    .filter(anli_filter, batched=True)
    .map(anli_tokenize, batched=True, num_proc=4)
)

In [7]:
from datasets import concatenate_datasets

# Training sets; the ANLI r1/r2/r3 training splits are merged into one
snli_train = snli_tokenized['train']
mnli_train = mnli_tokenized['train']
anli_train = concatenate_datasets(
    [anli_tokenized[split] for split in ('train_r1', 'train_r2', 'train_r3')]
)

# Evaluation sets; MNLI merges its matched and mismatched validation splits
snli_eval = snli_tokenized['validation']
mnli_eval = concatenate_datasets(
    [mnli_tokenized[split] for split in ('validation_matched', 'validation_mismatched')]
)
anli_eval = concatenate_datasets(
    [anli_tokenized[split] for split in ('dev_r1', 'dev_r2', 'dev_r3')]
)

# Test sets (only SNLI and ANLI are selected here)
snli_test = snli_tokenized['test']
anli_test = concatenate_datasets(
    [anli_tokenized[split] for split in ('test_r1', 'test_r2', 'test_r3')]
)

In [8]:
from transformers import  EvalPrediction, TrainingArguments, Trainer
from transformers.data import default_data_collator
import evaluate
import numpy
import torch

# Mixed precision is requested only when CUDA is available; changing the
# leading `True` to `False` switches it off unconditionally
use_mixed_precision = True and torch.cuda.is_available()
print(f'Using mixed precision: {use_mixed_precision}')

training_args = TrainingArguments(
    output_dir=TRAINER_OUTPUR_DIR,
    overwrite_output_dir=True,      # overwrite the output directory
    eval_strategy='epoch',          # evaluate every epoch
    save_strategy='epoch',          # save the model every epoch
    learning_rate=5e-5,
    num_train_epochs=3.0,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    fp16=use_mixed_precision,       # use mixed precision training
)

# One combined metric object covering accuracy, precision, recall and F1
metrics = evaluate.combine(
    [evaluate.load(metric_name) for metric_name in ('accuracy', 'precision', 'recall', 'f1')]
)

def compute_metrics(p: EvalPrediction):
    """Score a set of predictions with the combined `metrics` object.

    Args:
        p: Evaluation result exposing `predictions` and `label_ids`. When
            `predictions` is a tuple, only its first element is used.

    Returns:
        The result of `metrics.compute` on the arg-max (along axis 1) of the
        predictions, compared against `p.label_ids`.
    """
    outputs = p.predictions
    if isinstance(outputs, tuple):
        outputs = outputs[0]
    predicted_ids = numpy.argmax(outputs, axis=1)
    return metrics.compute(predictions=predicted_ids, references=p.label_ids)

Using mixed precision: False


In [13]:
from os import listdir
from os.path import isdir
from transformers.trainer_utils import get_last_checkpoint

# Look for a checkpoint left in the output directory by an earlier run.
last_checkpoint = None
if isdir(training_args.output_dir):
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    # A non-empty directory that holds no checkpoint is an error only when
    # overwriting has not been enabled; with `overwrite_output_dir=True` the
    # directory is accepted as-is.
    if (
        last_checkpoint is None
        and not training_args.overwrite_output_dir
        and len(listdir(training_args.output_dir)) > 0
    ):
        raise ValueError(
            'Output directory ({}) already exists and is not empty. '
            'Set overwrite_output_dir=True in TrainingArguments to overcome.'
            .format(training_args.output_dir)
        )

In [None]:
# Cap this run by step count and evaluate on a fixed step interval
training_args.max_steps = 50_000
training_args.eval_strategy = 'steps'
training_args.eval_steps = 5_000

snli_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=snli_train,
    eval_dataset=snli_eval,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    data_collator=default_data_collator,
)

import torch

try:
    snli_trainer.train(resume_from_checkpoint=None)
except KeyboardInterrupt:
    # HACK: after an interrupted run the GPU may not be initialized properly,
    # so drop the model and trainer and release cached CUDA memory before
    # re-raising
    del model, snli_trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    raise KeyboardInterrupt('Training interrupted by user.')



Step,Training Loss,Validation Loss


KeyboardInterrupt: Training interrupted by user.