In [1]:
import os
import time
import argparse
import random
from functools import partial
from collections import namedtuple

In [5]:
from transformers import AutoTokenizer

In [2]:
from datasets import load_dataset, load_metric, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Tokenizer used by the label-length and input-length cells below: the
# "t5-base" checkpoint, requesting the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "t5-base",
    use_fast=True,
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [20]:
# Report how many token ids each GLUE label string occupies under `tokenizer`.
# Only the label strings are tokenized here, so no dataset needs to be loaded
# in this cell.
tasks = ['cola', 'sst2', 'mrpc', 'qqp', 'mnli', 'qnli', 'rte', 'wnli']

# Label names per task, ordered so that the list index equals the class index.
tasks_to_labels = {
    'cola': ['unacceptable', 'acceptable'],
    'sst2': ['negative', 'positive'],
    'mrpc': ['not_equivalent', 'equivalent'],
    'qqp': ['not_duplicate', 'duplicate'],
    # processed differently as this is a regression task
    'sts-b': [],
    'mnli': ['entailment', 'neutral', 'contradiction'],
    'qnli': ['entailment', 'not_entailment'],
    'rte': ['entailment', 'not_entailment'],
    'wnli': ['not_entailment', 'entailment']
}

# Name of the dataset collection passed to `load_dataset` by later cells.
dataset_name = "glue"

for task in tasks:
    print(f"The task is {task}")
    for label in tasks_to_labels[task]:
        print(label)
        input_ids = tokenizer(label)["input_ids"]
        print(input_ids)
        # The count covers every id returned by the tokenizer, including any
        # special ids it appends on its own.
        print(f"Tokenized length is {len(input_ids)}")

The task is cola
unacceptable
[29452, 1]
Tokenized length is 2
acceptable
[9961, 1]
Tokenized length is 2
The task is sst2
negative
[2841, 1]
Tokenized length is 2
positive
[1465, 1]
Tokenized length is 2
The task is mrpc
not_equivalent
[59, 834, 15, 1169, 15592, 1]
Tokenized length is 6
equivalent
[7072, 1]
Tokenized length is 2
The task is qqp
not_duplicate
[59, 834, 26, 413, 26221, 1]
Tokenized length is 6
duplicate
[19197, 1]
Tokenized length is 2
The task is mnli
entailment
[3, 35, 5756, 297, 1]
Tokenized length is 5
neutral
[7163, 1]
Tokenized length is 2
contradiction
[27252, 1]
Tokenized length is 2
The task is qnli
entailment
[3, 35, 5756, 297, 1]
Tokenized length is 5
not_entailment
[59, 834, 35, 5756, 297, 1]
Tokenized length is 6
The task is rte
entailment
[3, 35, 5756, 297, 1]
Tokenized length is 5
not_entailment
[59, 834, 35, 5756, 297, 1]
Tokenized length is 6
The task is wnli
not_entailment
[59, 834, 35, 5756, 297, 1]
Tokenized length is 6
entailment
[3, 35, 5756, 297, 

In [30]:
def glue(x, benchmark_name, label_names, feature_names=None, id_key='idx'):
    """Convert a single GLUE example into a text2text example.

    The feature names of the example are used to unpack it into a format
    amenable to a text2text problem. For example, consider the Quora Question
    Pairs (QQP) benchmark, which would suggest
    benchmark_name="qqp"
    label_names=['not_duplicate', 'duplicate']
    For QQP, a typical example might look like
    {
        "question1": "Why do I easily get bored of my friends?",
        "question2": "Why do I get bored of my friends so quickly?",
        "label": 1,
        "idx": 10,
    }

    This example would be transformed to
    {
        "inputs": (
            "qqp question1: Why do I easily get bored of my friends? question2: "
            "Why do I get bored of my friends so quickly?"
        ),
        "targets": "duplicate",
    }

    Args:
        x: an example to process (a mapping from column name to value).
        benchmark_name: the name of the GLUE benchmark for this dataset; it is
            placed at the start of the generated input string.
        label_names: a list of label names corresponding to class index.
        feature_names: an optional ordered list of feature names. If provided,
            features will be ordered in this way in the output. If not
            provided, all features except 'label', 'idx' and `id_key` will be
            used, sorted by name.
        id_key: str or None, name of the id column in the example. The id is
            never copied to the output; the key is only used to keep the id
            column out of the automatically detected features.

    Returns:
        A dict with exactly two keys: 'inputs' (the benchmark name followed by
        "<feature>: <value>" pairs, joined by single spaces) and 'targets'
        (the label name, or '<unk>' when the label is -1).
    """
    if feature_names:
        feature_keys = feature_names
    else:
        # 'idx' stays excluded unconditionally to preserve the historical
        # behavior; a custom id column is excluded as well so it cannot end up
        # inside the input text.
        excluded = {'label', 'idx'}
        if id_key is not None:
            excluded.add(id_key)
        feature_keys = sorted(set(x.keys()) - excluded)

    strs_to_join = [benchmark_name]
    for key in feature_keys:
        strs_to_join.append('{}:'.format(key))
        # str() is a no-op for text features and keeps the join below from
        # raising on non-string feature values.
        strs_to_join.append(str(x[key]))

    # A label of -1 is mapped to a placeholder target instead of being used as
    # an index (which would silently select the last label name).
    label_name = '<unk>' if x['label'] == -1 else label_names[x['label']]

    return {'inputs': ' '.join(strs_to_join), 'targets': label_name}


In [37]:
def preprocess_function_decoder(examples):
    """Return the tokenized length of one decoder-style example.

    The example's 'inputs' and 'targets' strings are concatenated with a ":"
    separator and tokenized with the module-level `tokenizer`.

    Args:
        examples: a single example (not a batch) holding string values under
            the 'inputs' and 'targets' keys.

    Returns:
        A dict with one key, 'input_ids'. Despite the name, the value is the
        NUMBER of token ids produced for the concatenated text, not the ids
        themselves; the key is kept because the cell that computes the
        per-task maximum reads this column.
    """
    # add a separator between inputs and targets for the model to learn when to predict the targets
    text = examples['inputs'] + ":" + examples['targets']
    # Only the length is needed, so the plain id list is measured directly
    # instead of materializing a tensor first.
    token_ids = tokenizer(text)['input_ids']

    return {'input_ids': len(token_ids)}

In [63]:
# For every GLUE task: convert the examples to text2text form with `glue`,
# replace each example by its tokenized length, and report the longest
# training example.
for task in tasks:
    print(f"The task is {task}")
    dataset = load_dataset(dataset_name, task)
    glue_partial = partial(glue, benchmark_name=task, label_names=tasks_to_labels[task])
    # Drop the raw columns so only the keys returned by `glue` remain.
    column_names = dataset['train'].column_names
    dataset = dataset.map(glue_partial, remove_columns=column_names)
    # Likewise drop the text columns once their length has been measured.
    old_columns = dataset['train'].column_names
    tokenized_dataset = dataset.map(preprocess_function_decoder, remove_columns=old_columns)
    # The "input_ids" column holds one token count per example (see
    # `preprocess_function_decoder`). Reading the column once avoids indexing
    # the dataset row by row.
    len_ids = max(tokenized_dataset["train"]["input_ids"])
    print(f"The maximum length inputs for {task} is: {len_ids}")


The task is cola
The maximum length inputs for cola is: 54
The task is sst2


Map: 100%|██████████| 67349/67349 [00:13<00:00, 4930.74 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 3944.52 examples/s]
Map: 100%|██████████| 1821/1821 [00:00<00:00, 4261.65 examples/s]


The maximum length inputs for sst2 is: 93
The task is mrpc


Map: 100%|██████████| 3668/3668 [00:01<00:00, 2895.26 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2861.30 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2879.94 examples/s]


The maximum length inputs for mrpc is: 147
The task is qqp


Map: 100%|██████████| 363846/363846 [00:20<00:00, 17531.13 examples/s]
Map: 100%|██████████| 40430/40430 [00:02<00:00, 18143.90 examples/s]
Map: 100%|██████████| 390965/390965 [00:21<00:00, 18260.71 examples/s]
Map: 100%|██████████| 363846/363846 [01:36<00:00, 3758.73 examples/s]
Map: 100%|██████████| 40430/40430 [00:10<00:00, 3816.95 examples/s]
Map: 100%|██████████| 390965/390965 [01:41<00:00, 3849.78 examples/s]


The maximum length inputs for qqp is: 361
The task is mnli


Map: 100%|██████████| 392702/392702 [00:21<00:00, 18026.81 examples/s]
Map: 100%|██████████| 9815/9815 [00:00<00:00, 16253.02 examples/s]
Map: 100%|██████████| 9832/9832 [00:00<00:00, 17553.48 examples/s]
Map: 100%|██████████| 9796/9796 [00:00<00:00, 17195.07 examples/s]
Map: 100%|██████████| 9847/9847 [00:00<00:00, 17857.93 examples/s]
Map:  43%|████▎     | 170046/392702 [00:51<01:05, 3411.37 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 392702/392702 [01:57<00:00, 3332.28 examples/s]
Map: 100%|██████████| 9815/9815 [00:02<00:00, 3439.13 examples/s]
Map: 100%|██████████| 9832/9832 [00:02<00:00, 3357.31 examples/s]
Map: 100%|██████████| 9796/9796 [00:02<00:00, 3477.49 examples/s]
Map: 100%|██████████| 9847/9847 [00:02<00:00, 3416.10 examples/s]


The maximum length inputs for mnli is: 542
The task is qnli


Map: 100%|██████████| 104743/104743 [00:05<00:00, 17736.19 examples/s]
Map: 100%|██████████| 5463/5463 [00:00<00:00, 16099.65 examples/s]
Map: 100%|██████████| 5463/5463 [00:00<00:00, 17406.87 examples/s]
Map: 100%|██████████| 104743/104743 [00:34<00:00, 3025.53 examples/s]
Map: 100%|██████████| 5463/5463 [00:01<00:00, 2948.05 examples/s]
Map: 100%|██████████| 5463/5463 [00:01<00:00, 2992.26 examples/s]


The maximum length inputs for qnli is: 669
The task is rte


Map: 100%|██████████| 2490/2490 [00:00<00:00, 14698.77 examples/s]
Map: 100%|██████████| 277/277 [00:00<00:00, 10543.33 examples/s]
Map: 100%|██████████| 3000/3000 [00:00<00:00, 16932.07 examples/s]
Map: 100%|██████████| 2490/2490 [00:01<00:00, 2380.24 examples/s]
Map: 100%|██████████| 277/277 [00:00<00:00, 2422.97 examples/s]
Map: 100%|██████████| 3000/3000 [00:01<00:00, 2596.41 examples/s]


The maximum length inputs for rte is: 323
The task is wnli


Map: 100%|██████████| 635/635 [00:00<00:00, 9761.63 examples/s]
Map: 100%|██████████| 71/71 [00:00<00:00, 6362.20 examples/s]
Map: 100%|██████████| 146/146 [00:00<00:00, 11268.99 examples/s]
Map: 100%|██████████| 635/635 [00:00<00:00, 3259.52 examples/s]
Map: 100%|██████████| 71/71 [00:00<00:00, 1845.73 examples/s]
Map: 100%|██████████| 146/146 [00:00<00:00, 2352.63 examples/s]

The maximum length inputs for wnli is: 132



