In [1]:
# Fine-tune a pretrained RoBERTa model for sequence classification on natural language inference (NLI).
# Training datasets: SNLI, MNLI, ANLI
# Code reference: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py

In [2]:
from pathlib import Path

# Name of the pretrained checkpoint that every `from_pretrained` call below loads.
PRETRAINED_MODEL_NAME = 'roberta-large'

# Relative directories, stored as plain strings:
#   MODEL_CACHE_DIR    -> `cache_dir` for the config / tokenizer / model downloads
#   DATASET_CACHE_DIR  -> `cache_dir` for `load_dataset`
#   TRAINER_OUTPUT_DIR -> `output_dir` of the Trainer (checkpoints, metrics)
MODEL_CACHE_DIR = str(Path('.model'))
DATASET_CACHE_DIR = str(Path('.datasets'))
TRAINER_OUTPUT_DIR = str(Path('.checkpoints', 'snli'))

In [3]:
from datasets import load_dataset

# Load every split of SNLI, caching the files under DATASET_CACHE_DIR.
snli = load_dataset(
      path='stanfordnlp/snli'
    , cache_dir=DATASET_CACHE_DIR
)

In [4]:
import random
from math import ceil
from typing import Any, Dict, List

import numpy
from datasets import Dataset, concatenate_datasets
from tqdm.contrib import tenumerate
from transformers import PreTrainedTokenizer


# def binarize_labels(
#       dataset: Dataset
#     , labels_to_pos: List[Any]
#     , labels_to_neg: List[Any]
#     , pos_label: int = 1
#     , neg_label: int = 0
#     , sample_seed: int = 42
#     , shuffle_seed: int = 42
# ) -> Dataset:
  
#     assert 'label' in dataset.features
#     assert set(labels_to_pos).isdisjoint(labels_to_neg)
#     random.seed(sample_seed)

#     pos_label2indices: Dict[Any, List] = {}
#     neg_label2indices: Dict[Any, List] = {}
#     for index, label in tenumerate(dataset['label']):
#         if label in labels_to_pos:
#             pos_label2indices.setdefault(label, []) \
#                              .append(index)
#         if label in labels_to_neg:
#             neg_label2indices.setdefault(label, []) \
#                              .append(index)
 
#     pos_num = sum(len(indices) for indices in pos_label2indices.values())
#     neg_num = sum(len(indices) for indices in neg_label2indices.values())
#     sample_ratio = min(pos_num, neg_num) / max(pos_num, neg_num)

#     if pos_num < neg_num:
#         for label, indices in neg_label2indices.items():
#             sample_size = ceil(sample_ratio * len(indices))
#             neg_label2indices[label] = random.sample(indices, sample_size)
#     else:
#         for label, indices in pos_label2indices.items():
#             sample_size = ceil(sample_ratio * len(indices))
#             pos_label2indices[label] = random.sample(indices, sample_size)

#     def _map_labels_to_pos(batch):
#         batch['label'] = [pos_label for _ in range(len(batch['label']))]
#         return batch
    
#     def _map_labels_to_neg(batch):
#         batch['label'] = [neg_label for _ in range(len(batch['label']))]
#         return batch

#     dataset_balanced_binarized = concatenate_datasets(
#               [dataset.select(indices)
#                       .map(_map_labels_to_pos, batched=True, num_proc=4) 
#                for indices in pos_label2indices.values()] 
#             + [dataset.select(indices)
#                       .map(_map_labels_to_neg, batched=True, num_proc=4) 
#                for indices in neg_label2indices.values()]
#         )

#     return dataset_balanced_binarized.shuffle(seed=shuffle_seed)


def tokenize_premises_and_hypotheses(
      batch: Dict[str, List]
    , tokenizer: PreTrainedTokenizer
):
    """Encode a batch of (premise, hypothesis) pairs as paired sequences.

    Args:
        batch: Column-oriented batch; must contain the keys 'premise' and
            'hypothesis' (read below as the first and second sequence).
        tokenizer: Callable tokenizer exposing `model_max_length`.

    Returns:
        Whatever `tokenizer(...)` returns for the batch. Attention masks and
        token type ids are requested explicitly; nothing is padded here.
    """
    premises = batch['premise']
    hypotheses = batch['hypothesis']

    encode_options = dict(
          truncation=True                        # cut pairs longer than `max_length`
        , max_length=tokenizer.model_max_length
        , padding=False                          # padding is left to the data collator
        , return_attention_mask=True
        , return_token_type_ids=True
    )

    return tokenizer(
          text=premises
        , text_pair=hypotheses
        , **encode_options
    )


def are_labels_available(
      batch: Dict[str, list]
    , avaliable_labels: List[Any]
):
    """Return a boolean mask marking which examples carry an accepted label.

    Args:
        batch: Column-oriented batch; its 'label' column is inspected.
        avaliable_labels: Label values to keep (parameter name kept as-is
            for compatibility with existing callers).

    Returns:
        A numpy boolean array, one entry per element of `batch['label']`,
        True where the label is contained in `avaliable_labels`.
    """
    labels = numpy.asarray(batch['label'])
    keep_mask = numpy.isin(labels, avaliable_labels)
    return keep_mask

In [5]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification


# Class names, indexed by class id.
# NOTE(review): the order must match the dataset's integer label encoding —
# verify against `snli['train'].features['label'].names`.
label_list = [ 'entailment', 'neutral', 'contradiction' ]

# name -> id and id -> name lookups derived from the list order
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

# Model configuration: a classification head with one output per NLI class.
config = AutoConfig.from_pretrained(
      pretrained_model_name_or_path=PRETRAINED_MODEL_NAME
    , num_labels=len(label_list)
    , finetuning_task='text-classification'
    , cache_dir=MODEL_CACHE_DIR
    , revision='main'
)

# NOTE: the keyword accepted by `AutoTokenizer.from_pretrained` is `use_fast`.
# `use_fast_tokenizer` is only the CLI argument name in the reference script and
# is not recognised by the API, so it never selected the tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
      pretrained_model_name_or_path=PRETRAINED_MODEL_NAME
    , cache_dir=MODEL_CACHE_DIR
    , revision='main'
    , use_fast=True
)

# The classification head is not part of the pretrained checkpoint, so it is
# newly initialised here (see the warning printed below) and must be trained.
model = AutoModelForSequenceClassification.from_pretrained(
      pretrained_model_name_or_path=PRETRAINED_MODEL_NAME
    , config=config
    , cache_dir=MODEL_CACHE_DIR
    , revision='main'
)

# Store human-readable class names in the model config so that they are saved
# together with the fine-tuned weights.
model.config.label2id = label_to_id
model.config.id2label = id_to_label

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Single-argument adapters for `Dataset.map` / `Dataset.filter`:
#   func_m tokenizes a batch with the tokenizer loaded above,
#   func_f keeps only the examples whose label is 0, 1 or 2
#   (presumably this drops examples without a gold label — verify against the dataset card).
func_m = lambda batch: tokenize_premises_and_hypotheses(batch, tokenizer)
func_f = lambda batch: are_labels_available(batch, [0, 1, 2])

# Same tokenize-then-filter pipeline for both splits, train first.
snli_train, snli_eval = (
    snli[split_name]
        .map(func_m, batched=True, num_proc=4)
        .filter(func_f, batched=True, num_proc=4)
    for split_name in ('train', 'validation')
)

In [7]:
# check dataset balance

from collections import Counter
# One Counter per split (train first, then validation): label id -> number of examples.
for split_dataset in (snli_train, snli_eval):
    print(Counter(split_dataset['label']))


Counter({0: 183416, 2: 183187, 1: 182764})
Counter({0: 3329, 2: 3278, 1: 3235})


In [8]:
import evaluate
import numpy
import torch
from transformers import EvalPrediction, Trainer, TrainingArguments


# fp16 mixed precision is only enabled when a CUDA device is available;
# replace the leading `True` with `False` to force full precision.
use_mixed_precision = True and torch.cuda.is_available()
print(f'Using mixed precision: {use_mixed_precision}')

# The evaluation schedule is declared here (every 1,000 optimizer steps) so that
# the arguments object is validated once at construction and its declared values
# match the schedule the training run actually uses.
training_args = TrainingArguments(
          output_dir=TRAINER_OUTPUT_DIR
        , overwrite_output_dir=True         # to overwrite the output directory
        , do_train=True
        , do_eval=True
        , eval_strategy='steps'             # to evaluate every `eval_steps` optimizer steps
        , eval_steps=1_000
        , save_strategy='epoch'             # to save the model every epoch
        , learning_rate=1e-5                # equivalent to DocNLI
        , weight_decay=1e-2                 # to regularize
        , num_train_epochs=5.0              # equivalent to 2 * DocNLI
        , per_device_train_batch_size=32
        , gradient_accumulation_steps=1     # batch_size ~ this * per_device_train_batch_size
        , per_device_eval_batch_size=32
        , fp16=use_mixed_precision          # to use mixed precision training
    )

# Accuracy metric used by `compute_metric` during evaluation.
metric = evaluate.load('accuracy')

def compute_metric(p: EvalPrediction):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        p: Evaluation output exposing `predictions` (a score array, or a tuple
            whose first element is the score array) and `label_ids`.

    Returns:
        The result of `metric.compute` for the arg-max class of every row
        compared against `p.label_ids`.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        # only the first element of a tuple output is scored
        scores = scores[0]

    predicted_ids = numpy.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=p.label_ids)

Using mixed precision: True


In [9]:
# from os import listdir
# from os.path import isdir

# from transformers.trainer_utils import get_last_checkpoint


# last_checkpoint = None
# if isdir(training_args.output_dir):
#     last_checkpoint = get_last_checkpoint(training_args.output_dir)
#     if last_checkpoint is None and len(listdir(training_args.output_dir)) > 0:
#         raise ValueError(
#                 'Output directory ({}) already exists and is not empty. ' \
#                 'Use --overwrite_output_dir to overcome.'
#                 .format(training_args.output_dir)
#             )

In [10]:
from transformers.data import DataCollatorWithPadding

# Batches are padded dynamically by the collator. Under fp16 the padded length is
# rounded up to a multiple of 8 (as in the reference script) so that matrix shapes
# are Tensor Core friendly; without this argument the explicit collator would be
# identical to the one Trainer creates by default from `processing_class`.
# With `data_collator=None`, Trainer falls back to that default.
data_collator = None
if training_args.fp16:
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# Evaluate every 1,000 optimizer steps.
training_args.eval_strategy = 'steps'
training_args.eval_steps = 1_000


snli_trainer = Trainer(
          model=model
        , args=training_args
        , train_dataset=snli_train
        , eval_dataset=snli_eval
        , compute_metrics=compute_metric
        , processing_class=tokenizer
        , data_collator=data_collator
    )

In [11]:
import os
import torch

# Destination of the final model: a 'save' sub-directory of the trainer output directory.
final_model_dir = os.path.join(TRAINER_OUTPUT_DIR, 'save')

try:
    train_result = snli_trainer.train(resume_from_checkpoint=None)
    snli_trainer.save_model(output_dir=final_model_dir)
    snli_trainer.save_metrics('train', train_result.metrics)

except KeyboardInterrupt:
    # HACK: after a manual interrupt, drop the notebook's references to the model
    # and the trainer and release PyTorch's cached CUDA memory. Both names are
    # deleted, so the cells that create them must be re-run before training again.
    del model, snli_trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    raise KeyboardInterrupt('Training interrupted by user.')

Step,Training Loss,Validation Loss,Accuracy
1000,0.4391,0.324461,0.884881
2000,0.369,0.302922,0.894635
3000,0.3365,0.263451,0.909063
4000,0.3202,0.269988,0.91201
5000,0.3209,0.259936,0.903881
6000,0.3109,0.243715,0.9177
7000,0.3079,0.250886,0.91709
8000,0.3017,0.243359,0.916074
9000,0.298,0.232902,0.923085
10000,0.2997,0.247248,0.918919


KeyboardInterrupt: Training interrupted by user.