## Preprocessing

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from evaluate import load
import torch

## Importing a test model. Here I used flan-t5-base

model_name = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# For evaluation i used SuperGLUE. It is a industry standard metric to test the performance of LLMs
# This the Paper: https://w4ngatang.github.io/static/papers/superglue.pdf
# For this Notebook i tried boolq which simple True or flase questions
dataset = load_dataset('super_glue', 'boolq', trust_remote_code=True)



In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 9427
    })
    validation: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3270
    })
    test: Dataset({
        features: ['question', 'passage', 'idx', 'label'],
        num_rows: 3245
    })
})

In [3]:
import regex

def add_instruction(examples):
    """
    This adds additional promts to the text
    """
    examples['question'] = ["The following question is simple Yes or no question:\n"+q + " \nReply 1 for true and 0 for false." for q in examples['question']]
    return examples

def tokenize_fn(examples):
    """
    Tokenizes the input
    """
    return tokenizer(examples['question'], examples['passage'], truncation=True, padding=True)


def get_predictions(example):
    """ 
    Genrates predictions
    """
    inputs = tokenizer(example['question'], example['passage'], return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        output = model.generate(**inputs)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def convert_predictions_to_label(prediction):
    """ 
    Uses generated output and convertes it to 1 and 0 for inference.
    
    """
    prediction = regex.sub(r'\W+', '', prediction)
    prediction=str(prediction)
    return 1 if prediction.lower() in ['true','yes',"1"] else 0


In [4]:

dataset = dataset.map(add_instruction, batched=True)


tokenized_dataset = dataset['validation'].map(tokenize_fn, batched=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Map:   0%|          | 0/3245 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [5]:
dataset['validation'][0]

{'question': 'The following question is simple Yes or no question:\ndoes ethanol take more energy make that produces \nReply 1 for true and 0 for false.',
 'passage': "Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, th

In [6]:
tokenized_dataset

Dataset({
    features: ['question', 'passage', 'idx', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3270
})

In [7]:

tokenized_dataset = tokenized_dataset.map(lambda x: {'prediction': get_predictions(x)})


Map:   0%|          | 0/3270 [00:00<?, ? examples/s]



In [8]:
tokenized_dataset = tokenized_dataset.map(lambda x: {"prediction": convert_predictions_to_label(x['prediction'])})


Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset['prediction']

[0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [10]:
accuracy_metric = load('f1')
predictions = tokenized_dataset['prediction']
references = tokenized_dataset['label']

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [12]:
results = accuracy_metric.compute(predictions=predictions, references=references)
print("Accuracy:", results)

Accuracy: {'f1': 0.5084745762711864}


In [13]:
from evaluate import load
super_glue_metric=load('super_glue', 'boolq')

super_glue_metric.compute(predictions=predictions, references=references)

{'accuracy': 0.5654434250764526}

In [13]:
tokenized_dataset.save_to_disk("tokenized_dataset.hf")

Saving the dataset (0/1 shards):   0%|          | 0/3270 [00:00<?, ? examples/s]