In [3]:
# Fine-tune a pretrained RoBERTa model for sequence classification on NLI (natural language inference).
# Training data: the SNLI, MNLI and ANLI datasets.
# Code reference: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py

In [4]:
from datasets import load_dataset

# All three NLI corpora are loaded with the same local cache directory.
DATASETS_CACHE_DIR = '.datasets/'

snli = load_dataset(
      'stanfordnlp/snli'
    , cache_dir=DATASETS_CACHE_DIR
)
mnli = load_dataset(
      'nyu-mll/multi_nli'
    , cache_dir=DATASETS_CACHE_DIR
)
anli = load_dataset(
      'facebook/anli'
    , cache_dir=DATASETS_CACHE_DIR
)

In [5]:
from typing import Any, Dict, List, Optional, Union
from transformers import PreTrainedTokenizer

def tokenize_premises_and_hypotheses(
      batch: Dict[str, List]
    , tokenizer: 'PreTrainedTokenizer'
    , label_to_id: Dict[Any, int]
    , max_length: Optional[int] = None
    , padding: Union[bool, str] = 'max_length'
):
    """Tokenize a batch of premise/hypothesis pairs and remap its labels.

    Args:
        batch: columnar batch; its `premise`, `hypothesis` and `label` columns are read.
        tokenizer: tokenizer called on the (premise, hypothesis) text pairs.
        label_to_id: mapping from the labels found in `batch['label']` to the target ids.
            Every label in the batch must be a key of this mapping.
        max_length: length passed to the tokenizer as `max_length`.
            Defaults to `tokenizer.model_max_length` when left as `None`.
        padding: padding strategy forwarded to the tokenizer. Defaults to `'max_length'`.

    Returns:
        The tokenizer output with an extra `label` entry holding the remapped labels.

    Raises:
        KeyError: if a label in `batch['label']` is missing from `label_to_id`.
    """
    # NOTE(review): with the defaults every example is presumably padded all the way to
    # `tokenizer.model_max_length`; a smaller `max_length` or `padding=False` plus a padding
    # data collator may be much cheaper -- confirm against the training setup before changing.
    if max_length is None:
        max_length = tokenizer.model_max_length

    tokenized_batch = tokenizer(
          batch['premise']
        , batch['hypothesis']
        , truncation=True
        , max_length=max_length
        , padding=padding
        , return_attention_mask=True
        , return_token_type_ids=True
    )
    tokenized_batch['label'] = [label_to_id[label] for label in batch['label']]
    return tokenized_batch

def are_labels_available(
      batch: Dict[str, List]
    , label_to_id: Dict[Any, int]
) -> List[bool]:
    """Flag, per example, whether its label can be remapped through `label_to_id`.

    Args:
        batch: columnar batch; only its `label` column is read.
        label_to_id: mapping whose keys are the labels considered available.

    Returns:
        One boolean per entry of `batch['label']`, in order: `True` when the label is a key
        of `label_to_id`, `False` otherwise.
    """
    # Plain membership test: a label counts as available whenever it is a key of the mapping,
    # whatever id it maps to (a `.get(label, -1) != -1` check would reject a label mapped to -1).
    return [label in label_to_id for label in batch['label']]


In [6]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(
      pretrained_model_name_or_path='roberta-large'
)

# Convert the 3-way NLI labels into a binary classification task:
# entailment keeps id 0, neutral and contradiction are both mapped to id 1.
# Original label ids:
#   - SNLI: 0 = entailment, 1 = neutral, 2 = contradiction, see: https://huggingface.co/datasets/stanfordnlp/snli
#   - MNLI: 0 = entailment, 1 = neutral, 2 = contradiction, see: https://huggingface.co/datasets/nyu-mll/multi_nli
#   - ANLI: 0 = entailment, 1 = neutral, 2 = contradiction, see: https://huggingface.co/datasets/facebook/anli
snli_label_to_id = { 0: 0, 1: 1, 2: 1 }
mnli_label_to_id = { 0: 0, 1: 1, 2: 1 }
anli_label_to_id = { 0: 0, 1: 1, 2: 1 }

def filter_and_tokenize(dataset, tokenizer, label_to_id):
    """Drop examples whose label is not a key of `label_to_id`, then tokenize and relabel the rest.

    Args:
        dataset: object exposing batched `.filter(...)` and `.map(...)` (here: the loaded corpora).
        tokenizer: tokenizer forwarded to `tokenize_premises_and_hypotheses`.
        label_to_id: mapping from the original label ids to the target label ids.

    Returns:
        The result of the filter step followed by the map step.
    """
    # Filtering first guarantees that the mapping step never meets a label without a target id.
    with_known_labels = dataset.filter(
          lambda batch: are_labels_available(batch, label_to_id)
        , batched=True
    )
    return with_known_labels.map(
          lambda batch: tokenize_premises_and_hypotheses(batch, tokenizer, label_to_id)
        , batched=True
    )

snli_tokenized = filter_and_tokenize(snli, tokenizer, snli_label_to_id)
mnli_tokenized = filter_and_tokenize(mnli, tokenizer, mnli_label_to_id)
anli_tokenized = filter_and_tokenize(anli, tokenizer, anli_label_to_id)


In [7]:
from transformers import RobertaConfig, RobertaForSequenceClassification

# Binary label vocabulary: position in `label_list` is the class id.
label_list = [ 'entailment', 'not_entailment' ]
id_to_label = dict(enumerate(label_list))
label_to_id = { name: idx for idx, name in id_to_label.items() }

# Start from the `roberta-large` configuration, sized for the binary single-label task.
config = RobertaConfig.from_pretrained(
      'roberta-large'
    , num_labels=len(label_list)
    , finetuning_task='text-classification'
    , problem_type='single_label_classification'
)

model = RobertaForSequenceClassification.from_pretrained(
      'roberta-large'
    , config=config
)

# Record the readable label names on the model's config.
model.config.label2id = label_to_id
model.config.id2label = id_to_label

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from datasets import concatenate_datasets

# ANLI is split into three rounds; each of its splits is the concatenation of all rounds.
anli_rounds = ('r1', 'r2', 'r3')

# --- training splits ---
snli_train = snli_tokenized['train']
mnli_train = mnli_tokenized['train']
anli_train = concatenate_datasets(
    [anli_tokenized[f'train_{anli_round}'] for anli_round in anli_rounds]
)

# --- evaluation splits (MNLI: matched followed by mismatched) ---
snli_eval = snli_tokenized['validation']
mnli_eval = concatenate_datasets(
    [mnli_tokenized[split] for split in ('validation_matched', 'validation_mismatched')]
)
anli_eval = concatenate_datasets(
    [anli_tokenized[f'dev_{anli_round}'] for anli_round in anli_rounds]
)

# --- test splits (none is built for MNLI here) ---
snli_test = snli_tokenized['test']
anli_test = concatenate_datasets(
    [anli_tokenized[f'test_{anli_round}'] for anli_round in anli_rounds]
)

In [14]:
from transformers import TrainingArguments
import evaluate

# Training setup: write to ./output, evaluate once per epoch, train for 5 epochs.
# Every other option is left at the library default.
args = TrainingArguments(output_dir='./output', eval_strategy='epoch', num_train_epochs=5)

# One combined metric object covering accuracy, precision, recall and F1.
metrics = evaluate.combine(
    [evaluate.load(metric_name) for metric_name in ('accuracy', 'precision', 'recall', 'f1')]
)

# TODO ref: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py#L584 ~~

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]