# Finetuning T5 with Question-Answer Dataset, Squad

I leveraged this notebook to evaluate if T5 model can be used for generating SAT style questions, especially "word-in-context" type of questions.

I used an A100 single GPU instance for this finetuning.

In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install torch

Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [4]:
!pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable


In [5]:
!pip install accelerate

Defaulting to user installation because normal site-packages is not writeable


In [6]:
!pip install tf-keras

Defaulting to user installation because normal site-packages is not writeable


## Load and process data

In [7]:
import torch
import transformers
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
model_name='t5-small'

In [9]:
from datasets import load_dataset
datasets = load_dataset('squad')
print(type(datasets))
print(datasets)

<class 'datasets.dataset_dict.DatasetDict'>
DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


In [10]:
# load train and validation split of squad
train_dataset  = datasets['train']
valid_dataset = datasets['validation']
len(train_dataset), len(valid_dataset)

(87599, 10570)

In [11]:
import sentencepiece
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_name,legacy=False)

In [12]:
# process the examples in input and target text format and the eos token at the end
def add_eos_to_examples(example):
    example['input_text'] = 'question: %s  context: %s </s>' % (example['question'], example['context'])
    example['target_text'] = '%s </s>' % example['answers']['text'][0]
    return example

# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(example_batch['input_text'], pad_to_max_length=True, max_length=512)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], pad_to_max_length=True, max_length=16)

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [13]:
import os.path
if not os.path.isfile('train_data.pt') and not os.path.isfile('valid_data.pt'):
    # map add_eos_to_examples function to the dataset example wise
    train_dataset = train_dataset.map(add_eos_to_examples)
    # map convert_to_features batch wise
    train_dataset = train_dataset.map(convert_to_features, batched=True)

    valid_dataset = valid_dataset.map(add_eos_to_examples, load_from_cache_file=False)
    valid_dataset = valid_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)


    # set the tensor type and the columns which the dataset should return
    columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
    train_dataset.set_format(type='torch', columns=columns)
    valid_dataset.set_format(type='torch', columns=columns)
    # cache the dataset, so we can load it directly for training
    torch.save(train_dataset, 'train_data.pt')
    torch.save(valid_dataset, 'valid_data.pt')

    len(train_dataset), len(valid_dataset)

In [14]:
import numpy as np
np.object = np.object_

In [15]:
args_dict = {
  "model_name_or_path": model_name,
  "max_len": 512 ,
  "target_max_len": 16,
  "output_dir": './fine-tuned/',
  "overwrite_output_dir": True,
  "per_device_train_batch_size": 32,
  "per_device_eval_batch_size": 32,
  "gradient_accumulation_steps": 4,
  "learning_rate": 1e-4,
  "num_train_epochs": 4,
  "do_train": True,
  "remove_unused_columns": False
}

import json
with open('args.json', 'w') as f:
  json.dump(args_dict, f)

## Write training script

In [16]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import numpy as np
import torch

from transformers import T5ForConditionalGeneration, T5Tokenizer, EvalPrediction
from transformers import (
    HfArgumentParser,
    DataCollator,
    Trainer,
    TrainingArguments,
    set_seed,
)


logger = logging.getLogger(__name__)

# prepares lm_labels from target_ids, returns examples with keys as expected by the forward method
# this is necessacry because the trainer directly passes this dict as arguments to the model
# so make sure the keys match the parameter names of the forward method
@dataclass
class DataCollator:
    def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
            A dictionary of tensors
        """
        input_ids = torch.stack([example['input_ids'] for example in batch])
        lm_labels = torch.stack([example['target_ids'] for example in batch])
        lm_labels[lm_labels[:, :] == 0] = -100
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        decoder_attention_mask = torch.stack([example['target_attention_mask'] for example in batch])


        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': lm_labels,
            'decoder_attention_mask': decoder_attention_mask
        }


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """
    train_file_path: Optional[str] = field(
        default='train_data.pt',
        metadata={"help": "Path for cached train dataset"},
    )
    valid_file_path: Optional[str] = field(
        default='valid_data.pt',
        metadata={"help": "Path for cached valid dataset"},
    )
    max_len: Optional[int] = field(
        default=512,
        metadata={"help": "Max input length for the source text"},
    )
    target_max_len: Optional[int] = field(
        default=32,
        metadata={"help": "Max input length for the target text"},
    )


def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    # we will load the arguments from a json file,
    #make sure you save the arguments in at ./args.json
    model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath('args.json'))

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    tokenizer = T5Tokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = T5ForConditionalGeneration.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    #model = T5ForConditionalGeneration.from_pretrained("./fine-tuned/checkpoint-9500")

    # Get datasets
    train_dataset  = torch.load(data_args.train_file_path)
    valid_dataset = torch.load(data_args.valid_file_path)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=DataCollator()
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results


def _mp_fn(index):
    main()

2025-01-14 04:30:38.288060: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736829038.305355    2962 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736829038.310738    2962 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Train

Start training!

In [17]:
_mp_fn(1)

01/14/2025 04:30:40 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_

Step,Training Loss
500,0.4544
1000,0.4075
1500,0.3865
2000,0.3677
2500,0.3529


## Eval

In [18]:
## SQuAD evaluation script. Modifed slightly for this notebook

from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
      total += 1
      exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
      f1 += metric_max_over_ground_truths(
          f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

In [19]:
from tqdm.auto import tqdm

In [22]:
model = T5ForConditionalGeneration.from_pretrained('./fine-tuned')
tokenizer = T5Tokenizer.from_pretrained('./fine-tuned')

In [23]:
valid_dataset = torch.load('valid_data.pt')
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=32)

  valid_dataset = torch.load('valid_data.pt')


In [None]:
answers = []
for batch in tqdm(dataloader):
  outs = model.generate(input_ids=batch['input_ids'], 
                        attention_mask=batch['attention_mask'],
                        max_length=16,
                        early_stopping=True)
  outs = [tokenizer.decode(ids) for ids in outs]
  answers.extend(outs)

  1%|          | 2/331 [01:05<3:01:11, 33.05s/it]

In [25]:
predictions = []
references = []
for ref, pred in zip(valid_dataset, answers):
  predictions.append(pred)
  references.append(ref['answers']['text'])

NameError: name 'predictions' is not defined

In [None]:
evaluate(references, predictions)

# Extras - Custom Dataset

In [None]:
raw_dataset = {
  "title": "SAT Practice Questions",
  "data": [
    {
      "id": "111111",
      "context": "The mitochondrion is often called the powerhouse of the cell. It generates energy in the form of ATP through a process called cellular respiration. This energy is used to power various cellular processes necessary for the survival and function of the cell.",
      "question": "What is the primary function of the mitochondrion?",
      "answers": {
          "answer_start": [1],
          "text":["To generate energy in the form of ATP through cellular respiration"]
          }
    },
    {
      "id": "111112",
      "context": "In William Shakespeare's play 'Hamlet,' the titular character struggles with the morality of avenging his father’s murder. This internal conflict is a central theme in the play and is most evident in the soliloquy, 'To be, or not to be.'",
      "question": "What is a central theme of Shakespeare's play 'Hamlet'?",
      "answers": {
          "answer_start": [1],
          "text":["The internal conflict of morality in avenging his father’s murder"]
          }
    },
    {
      "id": "111113",
      "context": "The Pythagorean Theorem states that in a right triangle, the square of the hypotenuse is equal to the sum of the squares of the other two sides. This relationship is fundamental in Euclidean geometry.",
      "question": "What does the Pythagorean Theorem state about the sides of a right triangle?",
      "answers": {
          "answer_start": [1],
          "text":["The square of the hypotenuse is equal to the sum of the squares of the other two sides"]
          }
    },
    {
      "id": "111114",
      "context": "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll. During this process, carbon dioxide and water are converted into glucose and oxygen.",
      "question": "What are the end products of photosynthesis?",
      "answers": {
          "answer_start": [1],
          "text":["Glucose and oxygen"]
          }
    },
    {
      "id": "111115",
      "context": "The Industrial Revolution was a period of major industrialization and technological innovation that began in the late 18th century. It significantly altered agriculture, manufacturing, and transportation, impacting societal structures.",
      "question": "What were three areas significantly altered by the Industrial Revolution?",
      "answers":{
          "answer_start": [1],
          "text": ["Agriculture, manufacturing, and transportation"]
        }
    },
    {
      "id": "111116",
      "context": "In mathematics, a prime number is a natural number greater than 1 that has no positive divisors other than 1 and itself. For example, 2, 3, and 5 are prime numbers.",
      "question": "What defines a prime number in mathematics?",
      "answers": {
          "answer_start": [1],
          "text":["A natural number greater than 1 with no positive divisors other than 1 and itself"]
        }
    },
    {
      "id": "111117",
      "context": "George Washington was the first President of the United States, serving from 1789 to 1797. He set many precedents for the national government and the presidency in particular, such as the tradition of a peaceful transfer of power.",
      "question": "What is one precedent set by George Washington during his presidency?",
      "answers":{
          "answer_start": [1],
          "text": ["The tradition of a peaceful transfer of power"]
        }
    },
    {
      "id": "111118",
      "context": "The water cycle involves processes such as evaporation, condensation, precipitation, and collection. This cycle ensures the continuous movement of water on, above, and below the surface of the Earth.",
      "question": "What are the main processes involved in the water cycle?",
      "answers":{
          "answer_start": [1],
          "text": ["Evaporation, condensation, precipitation, and collection"]
        }
    },
    {
      "id": "111119",
      "context": "Albert Einstein's theory of general relativity explains how gravity affects the fabric of space-time. It provided a new understanding of gravitational forces, replacing Newton's law of universal gravitation.",
      "question": "What does Einstein's theory of general relativity explain?",
      "answers":{
          "answer_start": [1],
          "text": ["How gravity affects the fabric of space-time"]
        }
    },
    {
      "id": "111120",
      "context": "The Great Wall of China, built over several dynasties, was constructed to protect against invasions and raids from nomadic groups. It stretches over 13,000 miles and is considered a marvel of ancient engineering.",
      "question": "What was the primary purpose of the Great Wall of China?",
      "answers":{
          "answer_start": [1],
          "text":  ["To protect against invasions and raids from nomadic groups"]
        }
    }
  ]
}


In [None]:
data_list = []

def dataset_add(question,context,answers):
    dataset = {
    "question":question,
    "context":context,
    "answers": answers
}

    return dataset

for i in range(len(raw_dataset['data'])):
  question =raw_dataset['data'][i]['question']
  context =raw_dataset['data'][i]['context']
  answers = raw_dataset['data'][i]['answers']
  #answers ={'answer_start':1,'text':[raw_dataset['data'][i]['answers'][0],raw_dataset['data'][i]['answers'][0],raw_dataset['data'][i]['answers'][0],raw_dataset['data'][i]['answers'][0]]}
  data_unit = dataset_add(question,context,answers)
  data_list.append(data_unit)

print(data_list[1]['answers']['text'][0])