In [None]:
# Fine-tune a pretrained RoBERTa model for sequence classification on NLI (natural language inference).
# Training data: the SNLI, MNLI, and ANLI datasets.
# code ref: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py

In [1]:
from datasets import load_dataset

# Load the three NLI corpora, one after another; `cache_dir` points every
# load at the same local '.datasets/' directory.
snli = load_dataset(path='stanfordnlp/snli', cache_dir='.datasets/')
mnli = load_dataset(path='nyu-mll/multi_nli', cache_dir='.datasets/')
anli = load_dataset(path='facebook/anli', cache_dir='.datasets/')

Generating train_r1 split:   0%|          | 0/16946 [00:00<?, ? examples/s]

Generating dev_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_r2 split:   0%|          | 0/45460 [00:00<?, ? examples/s]

Generating dev_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_r3 split:   0%|          | 0/100459 [00:00<?, ? examples/s]

Generating dev_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Generating test_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [None]:
from typing import Any, Dict, List
from transformers import PreTrainedTokenizer

def tokenize_premises_and_hypotheses(
    batch: Dict[str, List],
    tokenizer: PreTrainedTokenizer,
    label_to_id: Dict[Any, int],
):
    """Encode a batch of premise/hypothesis pairs and remap their labels.

    Args:
        batch: Columnar batch; its 'premise', 'hypothesis' and 'label'
            columns are read.
        tokenizer: Callable tokenizer. It receives the premises and the
            hypotheses as its two positional arguments, with truncation
            enabled, `padding='max_length'`, and `max_length` taken from
            `tokenizer.model_max_length`. Attention masks and token type ids
            are requested explicitly.
        label_to_id: Lookup from the raw labels found in the batch to the ids
            stored in the result.

    Returns:
        The object returned by `tokenizer`, with its 'label' entry set to the
        list of remapped labels.

    Every label in the batch must be a key of `label_to_id`: the lookup is a
    plain subscript, so a label missing from a regular dict raises KeyError.
    """
    encoding_options = {
        'truncation': True,
        'max_length': tokenizer.model_max_length,
        'padding': 'max_length',
        'return_attention_mask': True,
        'return_token_type_ids': True,
    }
    encoded = tokenizer(batch['premise'], batch['hypothesis'], **encoding_options)

    # Translate the labels only after tokenizing, one label at a time.
    mapped_labels = []
    for raw_label in batch['label']:
        mapped_labels.append(label_to_id[raw_label])
    encoded['label'] = mapped_labels

    return encoded

def are_labels_available(
    batch: Dict[str, List],
    label_to_id: Dict[Any, int],
):
    """Report, per example, whether its label is a key of `label_to_id`.

    Args:
        batch: Columnar batch; only its 'label' column is read.
        label_to_id: Mapping whose keys are the labels considered available.

    Returns:
        A list of booleans, one per entry of `batch['label']`, in the same
        order. It is used in this notebook as the mask for a batched
        `filter` call.
    """
    # Test key membership directly. The previous `.get(label, -1) != -1`
    # check wrongly reported a label as unavailable whenever it was mapped
    # to the id -1, because the sentinel collided with a real value.
    return [label in label_to_id for label in batch['label']]


In [3]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Collapse each corpus to a binary task: label 0 (entailment) stays 0, while
# labels 1 and 2 both become 1. Any label that is not a key of the mapping is
# removed by the filter step below.
snli_label_to_id = {0: 0, 1: 1, 2: 1}
mnli_label_to_id = {0: 0, 1: 1, 2: 1}
anli_label_to_id = {0: 0, 1: 1, 2: 1}

# For every corpus: first drop examples whose label has no mapping, then
# tokenize the remaining premise/hypothesis pairs and remap their labels.
snli_tokenized = (
    snli
    .filter(lambda examples: are_labels_available(examples, snli_label_to_id), batched=True)
    .map(lambda examples: tokenize_premises_and_hypotheses(examples, tokenizer, snli_label_to_id), batched=True)
)
mnli_tokenized = (
    mnli
    .filter(lambda examples: are_labels_available(examples, mnli_label_to_id), batched=True)
    .map(lambda examples: tokenize_premises_and_hypotheses(examples, tokenizer, mnli_label_to_id), batched=True)
)
anli_tokenized = (
    anli
    .filter(lambda examples: are_labels_available(examples, anli_label_to_id), batched=True)
    .map(lambda examples: tokenize_premises_and_hypotheses(examples, tokenizer, anli_label_to_id), batched=True)
)


Filter:   0%|          | 0/16946 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/45460 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100459 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/16946 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/45460 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

In [4]:
from transformers import RobertaConfig, RobertaForSequenceClassification

# The two target classes; a label's position in this list is its class id.
label_list = ['entailment', 'not_entailment']
label_to_id = {name: idx for idx, name in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))

# Load the 'roberta-large' configuration with one output per entry of
# `label_list`, set up for single-label classification.
config = RobertaConfig.from_pretrained(
    'roberta-large',
    num_labels=len(label_list),
    finetuning_task='text-classification',
    problem_type='single_label_classification',
)

model = RobertaForSequenceClassification.from_pretrained('roberta-large', config=config)

# Record the label names on the model's config, in both directions.
model.config.label2id = label_to_id
model.config.id2label = id_to_label

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Training configuration: outputs under './output', evaluation strategy
# 'epoch', 5 training epochs.
args = TrainingArguments(
    output_dir='./output',
    eval_strategy='epoch',
    num_train_epochs=5,
)

# Training splits of the tokenized corpora; for ANLI only the 'train_r3'
# split is selected here.
snli_train, mnli_train, anli_train = (
    snli_tokenized['train'],
    mnli_tokenized['train'],
    anli_tokenized['train_r3'],
)

# TODO: continue from the reference script, from L584 onward: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py#L584