In [1]:
from datasets import load_dataset, load_metric
import numpy as np

In [2]:
# Load the "rte" configuration of the GLUE dataset; the bare expression on the
# last line renders the resulting DatasetDict (its splits and columns) as output.
raw_datasets = load_dataset("glue", "rte")
raw_datasets

Found cached dataset glue (/home/przemek/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [3]:
# Inspect the column schema of the training split, including the label names.
raw_datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'not_entailment'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [4]:
# Peek at the first three values of the 'sentence1' column.
raw_datasets['train']['sentence1'][:3]

['No Weapons of Mass Destruction Found in Iraq Yet.',
 'A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI.',
 'Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients.']

In [5]:
# Name of the pretrained checkpoint used for the config, tokenizer and model below.
# Alternative checkpoint, currently disabled:
#checkpoint = 'distilbert-base-cased'
checkpoint = 'bert-base-cased'

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
  Trainer, TrainingArguments, AutoConfig

In [7]:
config = AutoConfig.from_pretrained(checkpoint)

# Label mappings exactly as loaded from the checkpoint, before any change.
print(f'{config.id2label = }')
print(f'{config.label2id = }')

# Replace them with the class names, keeping the same ids as the dataset's
# ClassLabel feature (entailment -> 0, not_entailment -> 1).
target_map = {'entailment': 0, 'not_entailment': 1}
config.id2label = {
    label_id: label_name for label_name, label_id in target_map.items()
}
config.label2id = target_map

# The same two mappings after the override.
print(f'{config.id2label = }')
print(f'{config.label2id = }')


config.id2label = {0: 'LABEL_0', 1: 'LABEL_1'}
config.label2id = {'LABEL_0': 0, 'LABEL_1': 1}
config.id2label = {0: 'entailment', 1: 'not_entailment'}
config.label2id = {'entailment': 0, 'not_entailment': 1}


In [8]:
# Tokenizer belonging to the chosen checkpoint; the config object is passed along.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, config=config)

In [9]:
# Encode the first training pair as a single two-segment input.
tokenizer(
    text=raw_datasets['train']['sentence1'][0],
    text_pair=raw_datasets['train']['sentence2'][0],
)

{'input_ids': [101, 1302, 20263, 1104, 8718, 14177, 17993, 17107, 1107, 5008, 6355, 119, 102, 20263, 1104, 8718, 14177, 17993, 17107, 1107, 5008, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
# Encode the first training pair and keep the encoding in a named variable.
# This is computed explicitly rather than read from IPython's `_`, which holds
# the output of whichever cell ran last and therefore breaks when cells are
# re-run out of order or after another cell has produced output.
result = tokenizer(
    raw_datasets['train']['sentence1'][0],
    raw_datasets['train']['sentence2'][0])
result.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
# Decode the ids back to text to see where the special tokens were inserted.
tokenizer.decode(result['input_ids'])

'[CLS] No Weapons of Mass Destruction Found in Iraq Yet. [SEP] Weapons of Mass Destruction Found in Iraq. [SEP]'

In [13]:
# Sequence-classification model built from the checkpoint with the config prepared earlier.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [14]:
# Fine-tuning configuration: evaluate, save a checkpoint and log once per epoch.
training_args = TrainingArguments(
  output_dir='training_dir_2_5',
  evaluation_strategy='epoch',
  save_strategy='epoch',
  # Log at every epoch boundary rather than every 78 steps: 78 steps equals one
  # epoch only for the batch size / device count of the recorded run
  # (390 steps over 5 epochs), so a fixed step count drifts on other setups.
  logging_strategy='epoch',
  num_train_epochs=5,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=64,
)

In [15]:
# Sanity check of the GLUE/RTE metric on a toy example (2 of 3 predictions match).
# NOTE(review): `metric` is not referenced by any later cell shown here;
# compute_metrics relies on numpy and sklearn instead. The cell output also
# echoes the load_metric line, which looks like a warning was emitted —
# presumably a deprecation notice; confirm and consider dropping this cell.
metric = load_metric("glue", "rte")
metric.compute(predictions=[1, 0, 1], references=[1, 0, 0])


  metric = load_metric("glue", "rte")


{'accuracy': 0.6666666666666666}

In [16]:
from sklearn.metrics import f1_score

def compute_metrics(logits_and_labels):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        logits_and_labels: A pair that unpacks into ``(logits, labels)``.
            The predicted class is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``'accuracy'`` (fraction of predictions equal to
        the labels) and ``'f1'`` (``f1_score`` with its default settings).
    """
    scores, gold = logits_and_labels
    predicted = np.argmax(scores, axis=-1)

    accuracy = np.mean(gold == predicted)
    f1 = f1_score(y_true=gold, y_pred=predicted)
    return {'accuracy': accuracy, 'f1': f1}

In [17]:
def tokenize_fn(examples):
    """Tokenize sentence pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping that provides ``'sentence1'`` and ``'sentence2'``
            entries.

    Returns:
        Whatever ``tokenizer`` returns for the two inputs with
        ``truncation=True``.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [18]:
# Run tokenize_fn over the dataset, handing it batches of examples (batched=True).
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Loading cached processed dataset at /home/przemek/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-ccfe7b958a26f069.arrow
Loading cached processed dataset at /home/przemek/.cache/huggingface/datasets/glue/rte/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-57e0fbc25bfe9f79.arrow


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [19]:
# Bundle the model, its training configuration, the tokenizer, the metric
# function and the train/validation splits into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

In [20]:
# Run fine-tuning; the output table reports losses, accuracy and F1 per epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6733,0.623263,0.635379,0.516746
2,0.49,0.639029,0.635379,0.551111
3,0.2445,0.750539,0.66787,0.629032
4,0.1152,1.109786,0.700361,0.702509
5,0.0655,1.175997,0.66787,0.651515




TrainOutput(global_step=390, training_loss=0.3177151557726738, metrics={'train_runtime': 143.7654, 'train_samples_per_second': 86.599, 'train_steps_per_second': 2.713, 'total_flos': 1199491440281880.0, 'train_loss': 0.3177151557726738, 'epoch': 5.0})

In [21]:
# List the checkpoint directories written to the training output directory.
!ls training_dir_2_5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
checkpoint-156	checkpoint-234	checkpoint-312	checkpoint-390	checkpoint-78


In [22]:
from transformers import pipeline
# Load the checkpoint saved at step 312 into a text-classification pipeline.
# With 390 steps over 5 epochs, step 312 is the end of epoch 4 — the epoch with
# the highest validation accuracy and F1 in the training table.
# NOTE(review): device=0 presumably selects the first GPU — confirm one is
# available wherever this cell is run.
p = pipeline('text-classification', model='training_dir_2_5/checkpoint-312', device=0)

In [23]:
# Classify one sentence pair; 'text' and 'text_pair' carry the two sentences.
p({'text': 'I went to the store', 'text_pair': 'I am a bird'})

{'label': 'not_entailment', 'score': 0.9802959561347961}

In [29]:
# Another pair; the predicted label's score is only ~0.55 here, i.e. the model
# is close to undecided.
p({'text': 'I like cheese', 'text_pair': 'I eat it'})

{'label': 'entailment', 'score': 0.5525752902030945}