In [1]:
from huggingface_hub import notebook_login
from datasets import load_dataset, load_metric
from torch.utils.data import Dataset, DataLoader
import torch
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

The **GLUE Benchmark** is a group of nine classification tasks on sentences or pairs of sentences which are:

**CoLA** (Corpus of Linguistic Acceptability) Determine if a sentence is grammatically correct or not.is a dataset containing sentences labeled grammatically correct or not.

**MNLI** (Multi-Genre Natural Language Inference) Determine if a sentence entails, contradicts or is unrelated to a given hypothesis. (This dataset has two versions, one with the validation and test set coming from the same distribution, another called mismatched where the validation and test use out-of-domain data.)

**MRPC** (Microsoft Research Paraphrase Corpus) Determine if two sentences are paraphrases from one another or not.

**QNLI** (Question-answering Natural Language Inference) Determine if the answer to a question is in the second sentence or not. (This dataset is built from the SQuAD dataset.)

**QQP** (Quora Question Pairs2) Determine if two questions are semantically equivalent or not.

**RTE** (Recognizing Textual Entailment) Determine if a sentence entails a given hypothesis or not.

**SST-2** (Stanford Sentiment Treebank) Determine if the sentence has a positive or negative sentiment.

**STS-B** (Semantic Textual Similarity Benchmark) Determine the similarity of two sentences with a score from 1 to 5.

**WNLI** (Winograd Natural Language Inference) Determine if a sentence with an anonymous pronoun and a sentence with this pronoun replaced are entailed or not. (This dataset is built from the Winograd Schema Challenge dataset.)

**Metrics are:** 

- for CoLA: Matthews Correlation Coefficient

- for MNLI (matched or mismatched): Accuracy

- for MRPC: Accuracy and F1 score

- for QNLI: Accuracy

- for QQP: Accuracy and F1 score

- for RTE: Accuracy

- for SST-2: Accuracy

- for STS-B: Pearson Correlation Coefficient and Spearman's_Rank_Correlation_Coefficient


- for WNLI: Accuracy

In [2]:
glue_tasks = ["cola", "mnli", "mnli-mm",
              "mrpc", "qnli","qqp",
              "rte", "sst2", "stsb",
              "wnli"]

Notebook is designed to run on any model checkpoint from the Model Hub as long as that model has a version with a classification head. 

Depending on you model and the GPU you are using, you might need to adjust the batch size to avoid out-of-memory errors.

In [3]:
task = "cola"
model_cp = "distilbert-base-uncased"
batch_size=16

In [4]:
# get the dataset for a task, and its corresponding metric

actual_task = "mnli" if task == 'mnli-mm' else task

dataset = load_dataset('glue', actual_task)
metric = load_metric('glue', actual_task)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/cola (download: 368.14 KiB, generated: 596.73 KiB, post-processed: Unknown size, total: 964.86 KiB) to /root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/377k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [6]:
dataset['train'][0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

In [7]:
metric 
# depending on the dataset, the metric is changing and the correlation calculated are 
# different. 
# correlations available = "pearson", "spearman", "Matthews"

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [8]:
fake_preds = np.random.randint(0, 2, size=(64,))
fake_refs = np.random.randint(0, 2, size=(64,))
metric.compute(predictions=fake_preds,
              references=fake_refs)

{'matthews_correlation': 0.12855839970025792}

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_cp)  # will pull only the tokenizer
tokenizer("this is 1st sentence", "followed by spidey sense of tropical tree top")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': [101, 2023, 2003, 3083, 6251, 102, 2628, 2011, 11867, 5178, 2100, 3168, 1997, 5133, 3392, 2327, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
task_to_keys = {
    glue_tasks[0]:("sentence", None),
    glue_tasks[1]:("premise", "hypothesis"),
    glue_tasks[2]:("premise", "hypothesis"),
    glue_tasks[3]:("sentence1", "sentence2"),
    glue_tasks[4]:("question", "sentence"),
    glue_tasks[5]:("question1", "question2"),
    glue_tasks[6]:("sentence1", "sentence2"),
    glue_tasks[7]:("sentence", None),
    glue_tasks[8]:("sentence1", "sentence2"),
    glue_tasks[9]:("sentence1", "sentence2"),
}

In [11]:
sentence1_key, sentence2_key = task_to_keys[task]
if sentence2_key is None:
    print(f"Sentence: {dataset['train'][0][sentence1_key]}")
    
else:
    print(f"Sentence2: {dataset['train'][0][sentence2_key]}")
    print(f"Sentence1: {dataset['train'][0][sentence2_key]}")

Sentence: Our friends won't buy this analysis, let alone the next one we propose.


In [12]:
def preprocess_function(examples):
    """Function that tokenizes based on the type of task, 
    and truncates the sentence that is longer than the model 
    can handle"""
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True)
    return tokenizer(examples[sentence1_key], truncation=True),  tokenizer(examples[sentence2_key], truncation=True) 

In [13]:
preprocess_function(dataset['train'][:5])

{'input_ids': [[101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 2028, 2062, 18404, 2236, 3989, 2030, 1045, 1005, 1049, 3228, 2039, 1012, 102], [101, 1996, 2062, 2057, 2817, 16025, 1010, 1996, 13675, 16103, 2121, 2027, 2131, 1012, 102], [101, 2154, 2011, 2154, 1996, 8866, 2024, 2893, 14163, 8024, 3771, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [14]:
tokenised_ds = dataset.map(preprocess_function, batched=True)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [15]:
from transformers import Trainer, TrainingArguments

num_labels = 3 if task.startswith('mnli') else 1 if task == 'sstb' else 2
# task is mnle then 3, if task is sstb then 1, rest of cases 2
num_labels  # expecting 2 as it is Cola task

2024-02-02 06:11:04.304029: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-02 06:11:04.304125: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-02 06:11:04.445127: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


2

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_cp,
                                                           num_labels=num_labels)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
metric_name = "pearson" if task == 'sstb' else 'matthews_correlation' \
            if task == 'cola' else "accuracy"
metric_name

'matthews_correlation'

In [32]:
args = TrainingArguments(
    f"{model_cp}-finetune-{task}",
    evaluation_strategy='epoch',
    num_train_epochs=1,
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric,
    push_to_hub=True,
    report_to="none",
    hub_model_id=f"kamaljp/{model_cp}-finetuned-{task}",
    skip_memory_metrics=True  # this is for avoiding the threadlock error
    # https://github.com/huggingface/transformers/issues/17696
) 

In [19]:
def compute_metric(eval_pred):
    pred, refs = eval_pred
    if task != 'sstb':
        predictions = np.argmax(pred, axis=1)
    else:
        predictions = pred[:, 0]
    
    return metric.compute(predictions=predictions,
                         references=refs)

In [20]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" \
                if task == "mnli" else "validation"
trainer = Trainer(
    model,
    args,
    train_dataset=tokenised_ds['train'],
    eval_dataset=tokenised_ds[validation_key],
    compute_metrics=compute_metric,
    tokenizer=tokenizer, # this is new, and it is required for padding,
)
# if no tokenizer is provided then length mismatch occurs, leading to error

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.608177,0.458747


TypeError: cannot pickle '_thread.lock' object

In [24]:
trainer.evaluate()
trainer.push_to_hub()

TypeError: cannot pickle '_thread.lock' object

In [35]:
!pip install optuna ray[tune] >> /dev/null

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [40]:
# for hyper param search, the trainer needs to access the model as function
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_cp,
                                                              num_labels=num_labels)

In [41]:
search_trainer = Trainer(
    model_init = model_init,
    args = args,
    train_dataset=tokenised_ds['train'],
    eval_dataset=tokenised_ds[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
best_run = search_trainer.hyperparameter_search(n_trials=5, direction="maximize")

[I 2024-02-02 06:23:47,746] A new study created in memory with name: no-name-3da06757-7793-46d3-ba77-5fb38db59ce5
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Matthews Correlation
1,No log,0.615929,0.0


[W 2024-02-02 06:24:08,185] Trial 0 failed with parameters: {'learning_rate': 1.0112086904170213e-06, 'num_train_epochs': 3, 'seed': 37, 'per_device_train_batch_size': 32} because of the following error: TypeError("cannot pickle '_thread.lock' object").
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/opt/conda/lib/python3.10/site-packages/transformers/integrations/integration_utils.py", line 199, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1530, in train
    return inner_training_loop(
  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.py", line 1944, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/opt/conda/lib/python3.10/site-packages/transformers/trainer.

TypeError: cannot pickle '_thread.lock' object