In [1]:
import os
import time
import argparse
import random
from functools import partial
from collections import namedtuple

In [5]:
from transformers import AutoTokenizer

In [2]:
from datasets import load_dataset, load_metric, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [64]:
# Load the pretrained "t5-small" tokenizer, asking for the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "t5-small",
    use_fast=True,
)

In [10]:
# GLUE tasks iterated over by the cells below
# ('sts-b' has an entry in the label table but is not part of this list).
tasks = ['cola', 'sst2', 'mrpc', 'qqp', 'mnli', 'qnli', 'rte', 'wnli']

# Label strings per task; glue() picks a name with label_names[x['label']],
# so the position of a name in its list is the integer class label it stands for.
tasks_to_labels = dict([
    ('cola', ['unacceptable', 'acceptable']),
    ('sst2', ['negative', 'positive']),
    ('mrpc', ['not_equivalent', 'equivalent']),
    ('qqp', ['not_duplicate', 'duplicate']),
    # processed differently as this is a regression task, hence no label names
    ('sts-b', []),
    ('mnli', ['entailment', 'neutral', 'contradiction']),
    ('qnli', ['entailment', 'not_entailment']),
    ('rte', ['entailment', 'not_entailment']),
    ('wnli', ['not_entailment', 'entailment']),
])

In [65]:
# Print, for every task, each label string together with its token ids and
# the number of token ids produced by `tokenizer`.
dataset_name = "glue"
for task in tasks:
    print(f"The task is {task}")
    # The label strings come from tasks_to_labels, so the dataset itself is not
    # needed here; loading it once per task only added download/IO time.
    labels = tasks_to_labels[task]
    for label in labels:
        print(label)
        label_tokens = tokenizer(label)
        input_ids = label_tokens["input_ids"]
        print(input_ids)
        print(f"Tokenized length is {len(input_ids)}")

The task is cola
unacceptable
[29452, 1]
Tokenized length is 2
acceptable
[9961, 1]
Tokenized length is 2
The task is sst2
negative
[2841, 1]
Tokenized length is 2
positive
[1465, 1]
Tokenized length is 2
The task is mrpc
not_equivalent
[59, 834, 15, 1169, 15592, 1]
Tokenized length is 6
equivalent
[7072, 1]
Tokenized length is 2
The task is qqp
not_duplicate
[59, 834, 26, 413, 26221, 1]
Tokenized length is 6
duplicate
[19197, 1]
Tokenized length is 2
The task is mnli
entailment
[3, 35, 5756, 297, 1]
Tokenized length is 5
neutral
[7163, 1]
Tokenized length is 2
contradiction
[27252, 1]
Tokenized length is 2
The task is qnli
entailment
[3, 35, 5756, 297, 1]
Tokenized length is 5
not_entailment
[59, 834, 35, 5756, 297, 1]
Tokenized length is 6
The task is rte
entailment
[3, 35, 5756, 297, 1]
Tokenized length is 5
not_entailment
[59, 834, 35, 5756, 297, 1]
Tokenized length is 6
The task is wnli
not_entailment
[59, 834, 35, 5756, 297, 1]
Tokenized length is 6
entailment
[3, 35, 5756, 297, 

In [66]:
def glue(x, benchmark_name, label_names, feature_names=None, id_key='idx'):
    """Convert a dataset from glue to text2text examples.

    This function uses the feature names from the dataset to unpack examples into
    a format amenable for a text2text problem. For example, consider the Quora
    Question Pairs (QQP) benchmark, which would suggest
    benchmark_name="qqp"
    label_names=['not_duplicate', 'duplicate']
    For QQP, a typical example might look like
    {
        "question1": "Why do I easily get bored of my friends?",
        "question2": "Why do I get bored of my friends so quickly?",
        "label": 1,
        "idx": 10,
    }

    This example would be transformed to
    {
        "inputs": (
            "qqp question1: Why do I easily get bored of my friends? question2: "
            "Why do I get bored of my friends so quickly?"
        ),
        "targets": "duplicate",
    }

    Args:
        x: an example to process (a mapping that contains a 'label' entry).
        benchmark_name: the name of the GLUE benchmark for this dataset; it is
            used as the first word of the generated input text.
        label_names: a list of label names corresponding to class index.
        feature_names: an optional ordered list of feature names. If provided,
            features will be ordered in this way in the output. If not provided
            (or empty), all features except 'label', 'idx' and the `id_key`
            column will be used, sorted by name.
        id_key: str or None, name of the id column in the dataset. The column is
            kept out of the automatically selected features; its value is not
            copied into the returned example.

    Returns:
        A dict with two entries: 'inputs' (benchmark name followed by
        "<feature>: <value>" pairs, joined by single spaces) and 'targets'
        (the label name, or '<unk>' when the label is -1).

    Raises:
        KeyError: if 'label' or a requested feature is missing from `x`.
        IndexError: if the label is outside the range of `label_names`.
        TypeError: if a selected feature value is not a string.
    """
    if feature_names:
        feature_keys = feature_names
    else:
        # 'idx' stays excluded for backward compatibility even when another
        # id column name is given.
        non_feature_keys = {'label', 'idx'}
        if id_key is not None:
            non_feature_keys.add(id_key)
        feature_keys = sorted(set(x.keys()) - non_feature_keys)

    strs_to_join = [benchmark_name]
    for key in feature_keys:
        strs_to_join.append('{}:'.format(key))
        strs_to_join.append(x[key])

    # A label of -1 marks an unlabeled example.
    label_name = '<unk>' if x['label'] == -1 else label_names[x['label']]

    return {
        'inputs': ' '.join(strs_to_join),
        'targets': label_name,
    }


In [67]:
def preprocess_function_decoder(examples):
    """Measure the tokenized length of one "inputs:targets" example.

    The function concatenates the 'inputs' and 'targets' strings of a single
    example and tokenizes the result with the notebook-level `tokenizer`.

    Args:
        examples: a mapping with string entries 'inputs' and 'targets'.

    Returns:
        A dict with one entry, 'input_ids', holding the NUMBER of token ids
        (an int) of the concatenated text, not the ids themselves. The key
        name is kept because the max-length cell reads this column.
    """
    # add a separator between inputs and targets for the model to learn when to predict the targets
    text = examples['inputs'] + ":" + examples['targets']
    # Plain Python lists are enough for counting; no tensor has to be built.
    token_ids = tokenizer(text)['input_ids']

    return {'input_ids': len(token_ids)}

In [68]:
# For every task: convert the GLUE splits to text2text examples, replace each
# example by its tokenized length, and report the longest training example.
for task in tasks:
    print(f"The task is {task}")
    dataset = load_dataset(dataset_name, task)
    glue_partial = partial(glue, benchmark_name=task, label_names=tasks_to_labels[task])
    column_names = dataset['train'].column_names
    dataset = dataset.map(glue_partial, remove_columns=column_names)
    old_columns = dataset['train'].column_names
    tokenized_dataset = dataset.map(preprocess_function_decoder, remove_columns=old_columns)
    # preprocess_function_decoder stores a length under 'input_ids'; read the
    # whole column in one go instead of indexing the split row by row.
    len_ids = max(tokenized_dataset["train"]["input_ids"])
    print(f"The maximum length inputs for {task} is: {len_ids}")


The task is cola


Map: 100%|██████████| 8551/8551 [00:01<00:00, 4907.11 examples/s]
Map: 100%|██████████| 1043/1043 [00:00<00:00, 4577.90 examples/s]
Map: 100%|██████████| 1063/1063 [00:00<00:00, 4579.87 examples/s]


The maximum length inputs for cola is: 54
The task is sst2


Map: 100%|██████████| 67349/67349 [00:13<00:00, 4867.59 examples/s]
Map: 100%|██████████| 872/872 [00:00<00:00, 4103.17 examples/s]
Map: 100%|██████████| 1821/1821 [00:00<00:00, 4154.31 examples/s]


The maximum length inputs for sst2 is: 93
The task is mrpc


Map: 100%|██████████| 3668/3668 [00:01<00:00, 2923.72 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2859.91 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2821.97 examples/s]


The maximum length inputs for mrpc is: 147
The task is qqp


Map: 100%|██████████| 363846/363846 [01:37<00:00, 3732.94 examples/s]
Map: 100%|██████████| 40430/40430 [00:11<00:00, 3526.55 examples/s]
Map: 100%|██████████| 390965/390965 [01:42<00:00, 3815.68 examples/s]


The maximum length inputs for qqp is: 361
The task is mnli


Map:  43%|████▎     | 170095/392702 [00:50<01:05, 3401.53 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 392702/392702 [01:56<00:00, 3367.38 examples/s]
Map: 100%|██████████| 9815/9815 [00:03<00:00, 3132.22 examples/s]
Map: 100%|██████████| 9832/9832 [00:02<00:00, 3340.66 examples/s]
Map: 100%|██████████| 9796/9796 [00:02<00:00, 3455.32 examples/s]
Map: 100%|██████████| 9847/9847 [00:02<00:00, 3390.39 examples/s]


The maximum length inputs for mnli is: 542
The task is qnli


Map: 100%|██████████| 104743/104743 [00:34<00:00, 3013.47 examples/s]
Map: 100%|██████████| 5463/5463 [00:02<00:00, 2460.94 examples/s]
Map: 100%|██████████| 5463/5463 [00:01<00:00, 2805.53 examples/s]


The maximum length inputs for qnli is: 669
The task is rte


Map: 100%|██████████| 2490/2490 [00:01<00:00, 2457.76 examples/s]
Map: 100%|██████████| 277/277 [00:00<00:00, 2251.95 examples/s]
Map: 100%|██████████| 3000/3000 [00:01<00:00, 2622.58 examples/s]


The maximum length inputs for rte is: 323
The task is wnli


Map: 100%|██████████| 635/635 [00:00<00:00, 2996.65 examples/s]
Map: 100%|██████████| 71/71 [00:00<00:00, 2806.14 examples/s]
Map: 100%|██████████| 146/146 [00:00<00:00, 2849.03 examples/s]

The maximum length inputs for wnli is: 132





In [74]:
# Toy check: recover the class index of the gold label from a raw generated string.
output_text = "prediction</s><pad><pad><pad>not_entailment<pad><pad>"
decoded_labels = "not_entailment"

generated_strings = output_text.strip().lower().split(" ")
print(f"Generated string is {generated_strings}")

# Replace the special-token markers with spaces so the generated text can be
# compared word by word.
cleaned_text = output_text
for special_token in ("</s>", "<pad>", "<unk>"):
    cleaned_text = cleaned_text.replace(special_token, " ")
generated_words = cleaned_text.split()

#decoded_labels = tokenizer.decode(labels[idx], skip_special_tokens=True)
label_index = -1
# Compare whole words: a substring test would also find "entailment" inside
# "not_entailment" and report the wrong label as present.
if decoded_labels in generated_words:
    label_index = tasks_to_labels["rte"].index(decoded_labels)
    print(f"Label index is {label_index}")

Generated string is ['prediction</s><pad><pad><pad>not_entailment<pad><pad>']
Label index is 1


In [4]:
# get prompts from flan templates 
# https://github.com/google-research/FLAN/blob/main/flan/templates.py
import templates
from preprocessors import glue, stsb, string_to_float

In [9]:
# Keys into templates.PATTERNS for the GLUE tasks.
# NOTE: this rebinds `tasks`; here mrpc/qqp are spelled 'glue_mrpc'/'glue_qqp'.
tasks = ['cola', 'sst2', 'glue_mrpc', 'glue_qqp', 'mnli', 'qnli', 'rte', 'wnli']
for task in tasks:
    task_patterns = templates.PATTERNS[task]
    # only the first pattern of each task is shown
    prompt = task_patterns[0]
    print(f"The prompt for task {task} is {prompt}")

The prompt for task cola is ('Sentence: "{sentence}"\nWould a linguist rate this sentence to be acceptable linguistically?\n\n{options_}', '{answer}')
The prompt for task sst2 is ('Review:\n{sentence}\nIs this movie review sentence negative or positive?\n{options_}', '{answer}')
The prompt for task glue_mrpc is ('Here are two sentences:\n{sentence1}\n{sentence2}\nDo they have the same meaning?\n{options_}', '{answer}')
The prompt for task glue_qqp is ('{question1}\n{question2}\nWould you say that these questions are the same?\n{options_}', '{answer}')
The prompt for task mnli is ('Premise: {premise}\n\nHypothesis: {hypothesis}\n\nDoes the premise entail the hypothesis?\n\n{options_}', '{answer}')
The prompt for task qnli is ('Does the sentence "{sentence}" answer the question "{question}"\n\n{options_}', '{answer}')
The prompt for task rte is ('{premise}\n\nBased on the paragraph above can we conclude that "{hypothesis}"?\n\n{options_}', '{answer}')
The prompt for task wnli is ('If "{s

In [44]:
# Text column names per task as a (first, second) pair; the second slot is
# None for tasks that list a single text column.
task_to_keys = dict([
    ('cola', ('sentence', None)),
    ('mnli', ('premise', 'hypothesis')),
    ('mrpc', ('sentence1', 'sentence2')),
    ('qnli', ('question', 'sentence')),
    ('qqp', ('question1', 'question2')),
    ('rte', ('sentence1', 'sentence2')),
    ('sst2', ('sentence', None)),
    ('sts-b', ('sentence1', 'sentence2')),
    ('wnli', ('sentence1', 'sentence2')),
])

In [62]:
from datasets import load_dataset, load_metric, DatasetDict
# Work on a small shuffled sample of every split of one GLUE task.
benchmark_name = 'rte'
subset_size = 10  # rows kept per split

dataset = load_dataset('glue', benchmark_name)
smaller_dataset = DatasetDict()
for split, split_rows in dataset.items():
    # fixed seed so the same rows are picked on every run
    shuffled_rows = split_rows.shuffle(seed=42)
    smaller_dataset[split] = shuffled_rows.select(range(subset_size))
dataset = smaller_dataset

# one training example used by the template-filling cell
data = dataset['train'][2]
print(f'{data}')

{'sentence1': 'Nokia, Texas Instruments and other leading makers of mobile phones have formally complained to Brussels that Qualcomm, the US mobile chipmaker, has unfairly used its patents on 3G technologies.', 'sentence2': 'Nokia produces mobile chips.', 'label': 1, 'idx': 1455}


In [66]:

# Fill the first FLAN prompt pattern of `benchmark_name` with the example `data`.
label_names = tasks_to_labels[benchmark_name]
print(label_names)
feature_keys = task_to_keys[benchmark_name]
print(feature_keys)

# Every placeholder name handled by this cell starts out empty; str.format
# ignores keyword arguments that a given template does not reference.
template_fields = dict.fromkeys(
    ('premise', 'hypothesis', 'sentence', 'sentence1', 'sentence2',
     'question', 'question1', 'question2'),
    '',
)
# Copy the task's text columns; single-sentence tasks carry None in the second
# slot, which must not be used as a column name.
template_fields.update({key: data[key] for key in feature_keys if key is not None})
if benchmark_name == 'rte':
    # The RTE pattern asks for premise/hypothesis while the example stores the
    # texts under sentence1/sentence2.
    premise = data['sentence1']
    hypothesis = data['sentence2']
    template_fields.update(premise=premise, hypothesis=hypothesis)

# find out how many labels are there
num_labels = len(label_names)
print(f"The number of labels are: {num_labels}")

# One option line per label ("A) ...", "B) ...", ...), formatted identically
# for any number of labels.
option_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'[:num_labels]
options_ = '\n'.join(
    f'{letter}) {name}' for letter, name in zip(option_letters, label_names)
)
# A label of -1 marks an unlabeled example; it gets the same '<unk>' marker that
# glue() uses instead of silently becoming one of the answer letters.
answer = '<unk>' if data['label'] == -1 else option_letters[data['label']]

# The pattern table spells some tasks with a 'glue_' prefix
# (e.g. 'glue_mrpc', 'glue_qqp'); fall back to that spelling when needed.
pattern_key = benchmark_name if benchmark_name in templates.PATTERNS else f'glue_{benchmark_name}'
prompt = templates.PATTERNS[pattern_key][0]  # first pattern: (input template, answer template)
prompt_template = prompt[0]
# Show the pattern that is actually being filled, not a variable left over from another cell.
print(f'Prompt is {prompt}')
filled_prompt = prompt_template.format(options_=options_, **template_fields)

template_filled = (filled_prompt, answer)


print(f'Template filled is {template_filled}')

['entailment', 'not_entailment']
('sentence1', 'sentence2')
The number of labels are: 2
Prompt is ('{premise}\n\nBased on the paragraph above can we conclude that "{hypothesis}"?\n\n{options_}', '{answer}')
Template filled is ('Nokia, Texas Instruments and other leading makers of mobile phones have formally complained to Brussels that Qualcomm, the US mobile chipmaker, has unfairly used its patents on 3G technologies.\n\nBased on the paragraph above can we conclude that "Nokia produces mobile chips."?\n\nA) entailment\nB) not_entailment', 'B')
