<a href="https://colab.research.google.com/github/RishabhMaheshwary/flan-t5-small/blob/main/Experiments_with_google_flan_t5_small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install Dependencies

!pip install transformers
!pip install evaluate
!pip install datasets
!pip install nltk
!pip install SentencePiece
!pip install accelerate -U
!pip install tqdm

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.3 MB/s[0m eta [36m0:00:0

In [7]:
#Imports

import copy
import numpy as np
import nltk

from nltk.tokenize import sent_tokenize
nltk.download("punkt")

import torch
from torch import nn
from torch.utils.data import DataLoader
import evaluate
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    default_data_collator,
    Seq2SeqTrainingArguments,
    GenerationConfig,
    AdamW,
    get_scheduler
)

from transformers.models.t5.modeling_t5 import (
    T5Stack,
    T5Block,
    T5LayerNorm,
    T5Config
)

from datasets import load_dataset
from tqdm.auto import tqdm
seed = 43
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Zeroshot Evalution of flan-t5-small.

1. Loading  ```google/flan-t5-small``` model from huggingface.

In [9]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

2. Evaluating the ```google/flan-t5-small``` on summarization task.
The cell below loads a summarization dataset ```cnn_dailymail``` and then selects a sample at random from the validation set. It uses a prompt 'Summarize the following: \<passage\>' and runs the model to generate the summary in a zero-shot setting.



In [18]:
def evaluate_summarization():

    dataset = load_dataset("cnn_dailymail", "3.0.0", split='validation')

    prompt = 'Summarize the following:\n[PASSAGE]\n.'
    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]

    context, summary = example['article'], example['highlights']

    prompt = prompt.replace("[PASSAGE]", context)
    input_ids = tokenizer(prompt, max_length=1024, truncation=True, return_tensors="pt").input_ids.to(device)

    generation_config = GenerationConfig(early_stopping=False, length_penalty= 0.0, early_stop=False, max_length=50)

    outputs = model.generate(input_ids, generation_config=generation_config)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    print("Input:\n", prompt)
    print("Predicted Summary:", result)
    print("Reference Summary:", summary)
    print("\n")
evaluate_summarization()

Input:
 Summarize the following:
Cristiano Ronaldo made his commute to work look easy when he set off for Real Madrid training in his £330,000 white Rolls-Royce Ghost on Wednesday morning. The three-time Ballon d'Or winner left for the club's training ground in style as Carlo Ancelotti prepares his men for their La Liga match against Athletic Bilbao on Saturday evening. Ronaldo scored his 30th league goal in Real's last game against Villarreal but could not prevent them from dropping two points in the title race with a 1-1 draw at the Bernabeu. Cristiano Ronaldo made his commute to work look easy as he left in a white Rolls-Royce Ghost . Price: £225,000-£330,000 . Engine: 6.6-litre V12 . Top speed: 155mph . Bhp: 453 . 0-62mph: 4.9sec . With 12 games left Ronaldo is now just one goal from his tally for last season, and the former Manchester United star has become the first player in history from the big five leagues to score 30 goals per season for a fifth consecutive year. Yet Real rem

3. Evaluating the ```google/flan-t5-small``` on translation task.
The cell below loads an english to french dataset ```MuST-C-fr``` and then selects a sample at random from the validation set. It uses a prompt 'Translate the sentence from French to English: \<sentence\>' and runs the model to generate the translation in a zero-shot setting.

In [14]:
def evaluate_translation():

    dataset = dataset = load_dataset("enimai/MuST-C-fr", split='validation')

    prompt = 'Translate this sentence from French to English:\n[SENTENCE]\n.'
    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]

    english, french = example['en'], example['fr']

    prompt = prompt.replace("[SENTENCE]", french)
    input_ids = tokenizer(prompt, max_length=512, truncation=True, return_tensors="pt").input_ids.to(device)

    outputs = model.generate(input_ids)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    print("Input:\n", prompt)
    print("Predicted Translation:", result)
    print("Reference Translation:", english)
    print("\n")
evaluate_translation()

Input:
 Translate this sentence from French to English:
Mais quand j'utilise le mot miracle, je ne veux pas dire quelque chose d'impossible.
.
Predicted Translation: ['But when I use the miracle, I do not want to make any choice of it.']
Reference Translation: Now, when I use the term "miracle," I don't mean something that's impossible.




4. Evaluating the ```google/flan-t5-small``` on QA task.
The cell below loads a QA dataset ```squad``` and then selects a sample at random from the validation set. It uses a prompt 'Given the following: \<passage\>. Answer the following: \<question\>.' and runs the model to generate the answer in a zero-shot setting.

In [21]:
def evaluate_squad(model):

    dataset = load_dataset("squad", split='validation')

    prompt = 'Given the following:\n[PASSAGE].\nAnswer the following:\n[QUESTION].\n'
    example = dataset[int(np.random.randint(len(dataset), size=1)[0])]

    context, question, actual_answer = example['context'],  example['question'], example["answers"]

    prompt = prompt.replace("[PASSAGE]", context)
    prompt = prompt.replace("[QUESTION]", question)
    input_ids = tokenizer(prompt, max_length=512, truncation=True, return_tensors="pt").input_ids.to(device)

    outputs = model.generate(input_ids)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    print("Input:\n", prompt)
    print("Predicted Answer:", result)
    print("Reference Answers:", actual_answer)
    print("\n")
evaluate_squad(model)

Input:
 Given the following:
In the United States, the industry in 2014 has around $960 billion in annual revenue according to statistics tracked by the Census Bureau, of which $680 billion is private (split evenly between residential and nonresidential) and the remainder is government. As of 2005, there were about 667,000 firms employing 1 million contractors (200,000 general contractors, 38,000 heavy, and 432,000 specialty); the average contractor employed fewer than 10 employees. As a whole, the industry employed an estimated 5.8 million as of April 2013, with a 13.2% unemployment rate. In the United States, approximately 828,000 women were employed in the construction industry as of 2011..
Answer the following:
How much revenue is private?.

Predicted Answer: ['$680 billion']
Reference Answers: {'text': ['$680 billion', '$680 billion', '$680 billion'], 'answer_start': [148, 148, 148]}




## Parameters of the flan-t5-small.

5. The cell below prints all the layers and their dimension size in the ```google/flan-t5-small```.

In [22]:
for name, param in model.named_parameters():
    if 'weight' in name:  # Print only for weight parameters
        print(f"Layer: {name} - Dimensions: {param.size()}")

Layer: shared.weight - Dimensions: torch.Size([32128, 512])
Layer: encoder.block.0.layer.0.SelfAttention.q.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.k.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.v.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.o.weight - Dimensions: torch.Size([512, 384])
Layer: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight - Dimensions: torch.Size([32, 6])
Layer: encoder.block.0.layer.0.layer_norm.weight - Dimensions: torch.Size([512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_0.weight - Dimensions: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_1.weight - Dimensions: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wo.weight - Dimensions: torch.Size([512, 1024])
Layer: encoder.block.0.layer.1.layer_norm.weight - Dimensions: torch.Size([512])
Layer: encoder.blo

6. The cell below prints the total number of parameters in ```google/flan-t5-small```.

In [25]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 76961152


7. The cell below sets the weights of the final LayerNorm to zero.

In [24]:
model.decoder.final_layer_norm.weight.data.fill_(0)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

7. With the final layer LayerNorm set to 0, the cell below evaluated the model for QA task. The model generates an empty string instead of the answer "" (generated earlier) for the same question, context pair.

In [26]:
evaluate_squad(model)
evaluate_squad(model)

Input:
 Given the following:
With Rivera having been a linebacker with the Chicago Bears in Super Bowl XX, and Kubiak replacing Elway at the end of the Broncos' defeats in Super Bowls XXI and XXIV, this will be the first Super Bowl in which both head coaches played in the game themselves..
Answer the following:
What team did Rivera play for in Super Bowl XX?.

Predicted Answer: ['']
Reference Answers: {'text': ['Chicago Bears', 'the Chicago Bears', 'Bears'], 'answer_start': [46, 42, 54]}


Input:
 Given the following:
The name Rijn, from here on, is used only for smaller streams farther to the north, which together formed the main river Rhine in Roman times. Though they retained the name, these streams no longer carry water from the Rhine, but are used for draining the surrounding land and polders. From Wijk bij Duurstede, the old north branch of the Rhine is called Kromme Rijn ("Bent Rhine") past Utrecht, first Leidse Rijn ("Rhine of Leiden") and then, Oude Rijn ("Old Rhine"). The lat

8. The Cell below replaces the final layer of dimension 512 with a dimension of size 128.

    a. The class ```CustomLayerNorm``` adds a linear layer to map the input from 512 to 128 dimension. It then adds a LayerNorm of size 128.

    b. The class ```CustomDecoder``` extends ```T5Stack``` and adds the CustomLayerNorm at the top. As the output from the ```CustomLayerNorm``` is of size 128, it adds another linear to map the input from 128 to the ```vocab_size``` of the model.

    c. The class ```CustomT5``` extends ```T5ForConditionalGeneration``` and combines the encoder with the ```CustomDecoder```.

    d. The class ```CustomDecoder``` only overrides the final layer norm and class```CustomT5``` only overrides the decoder. The rest of the functions of ```google/flan-t5-small``` remains same.

In [27]:
class CustomLayerNorm(nn.Module):
    def __init__(self, hidden_size, input_dim=512, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.projection = nn.Linear(input_dim, hidden_size)
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        hidden_states = self.projection(hidden_states)

        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states

class CustomDecoder(T5Stack):
    def __init__(self, config, embed_tokens=None, old_dim=512, new_dim=128, vocab_size=31522):
        super().__init__(config, embed_tokens=embed_tokens)

        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder
        self.new_dim = new_dim

        self.block = nn.ModuleList(
            [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
        )
        self.final_layer_norm = CustomLayerNorm(new_dim, input_dim=config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        # Initialize weights and apply final processing
        self.post_init()
        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False
        # self.output_layer = nn.Linear(new_dim, vocab_size)

class CustomT5(T5ForConditionalGeneration):
    def __init__(self, config: T5Config):
        super().__init__(config)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = CustomDecoder(decoder_config, self.shared)

        if hasattr(self.decoder, 'new_dim'):
            new_dim = self.decoder.new_dim
        else:
            new_dim = decoder_config.d_model

        self.lm_head = nn.Linear(new_dim, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

8. This cell prints the layer and its dimension size of the ```CustomT5``` model above. The output shows the projection layer added, changes in the final layer norm and the last layer.

In [28]:
custom_model = CustomT5(model.config)
for name, param in custom_model.named_parameters():
    if 'weight' in name:  # Print only for weight parameters
        print(f"Layer: {name} - Dimensions: {param.size()}")

Layer: shared.weight - Dimensions: torch.Size([32128, 512])
Layer: encoder.block.0.layer.0.SelfAttention.q.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.k.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.v.weight - Dimensions: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.o.weight - Dimensions: torch.Size([512, 384])
Layer: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight - Dimensions: torch.Size([32, 6])
Layer: encoder.block.0.layer.0.layer_norm.weight - Dimensions: torch.Size([512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_0.weight - Dimensions: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_1.weight - Dimensions: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wo.weight - Dimensions: torch.Size([512, 1024])
Layer: encoder.block.0.layer.1.layer_norm.weight - Dimensions: torch.Size([512])
Layer: encoder.blo

8. Evaluating ```CustomT5``` for QA task model. As the final layer norm, and last layers are initialized with random weights, the output is random.

In [29]:
custom_model.to(device)
evaluate_squad(custom_model)

Input:
 Given the following:
Parliamentary time is also set aside for question periods in the debating chamber. A "General Question Time" takes place on a Thursday between 11:40 a.m. and 12 p.m. where members can direct questions to any member of the Scottish Government. At 2.30pm, a 40-minute long themed "Question Time" takes place, where members can ask questions of ministers in departments that are selected for questioning that sitting day, such as health and justice or education and transport. Between 12 p.m. and 12:30 p.m. on Thursdays, when Parliament is sitting, First Minister's Question Time takes place. This gives members an opportunity to question the First Minister directly on issues under their jurisdiction. Opposition leaders ask a general question of the First Minister and then supplementary questions. Such a practice enables a "lead-in" to the questioner, who then uses their supplementary question to ask the First Minister any issue. The four general questions available 

## Finetuning For SQUAD.

In [18]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

Loading SQUAD dataset

In [19]:
dataset = load_dataset("squad")
print("Context: ", dataset["train"][0]["context"])
print("Question: ", dataset["train"][0]["question"])
print("Answer: ", dataset["train"][0]["answers"])

Context:  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Answer:  {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


*   Writing a preprocess function to tokenize the input, construct ```input_ids``` and ```labels```. The pad tokens are set to -100 so that they are automatically ignored by the pytorch loss function.
*   The prompt used is: ```SQUAD Reading Comprehension Task\n\nGiven the following passage:\n{context}\nAnswer the question:\n{question}\n```.

* As ```google/flan-t5-small``` is finetuned with instruction tuning, the above prompt is like an instruction - describing the task that the model is going to be finetuned on. This enables the model to understand the task and follow the instructions.

In [20]:
def preprocess_function(examples, padding="max_length", max_input_length=512, max_target_length=32):

    answers = [example['text'][0] for example in examples["answers"]]
    questions = [q.strip() for q in examples["question"]]
    contexts = [context.strip() for context in examples["context"]]

    assert len(questions) == len(contexts)
    prompts = [f'SQUAD Reading Comprehension Task\n\nGiven the following passage:\n{context}\nAnswer the question:\n{question}\n'
               for context, question in list(zip(contexts, questions))]

    inputs = tokenizer(
        prompts,
        max_length=max_input_length,
        truncation=True,
        padding=True,
    )
    labels = tokenizer(answers, max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    inputs["labels"] = labels["input_ids"]
    return inputs

Using the evaluate huggingface library, the code below de-tokenize the predicted answers and computes the f1 and exact match scores.

In [21]:

# Metric
metric = evaluate.load("squad")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metrics(preds, labels, valiation_set):
    final_preds = []

    for pred_list in preds:
        cur_pred = []
        for token in pred_list:
            if token == tokenizer.eos_token_id:
                break
            cur_pred.append(token)
        final_preds.append(cur_pred)
    decoded_preds = tokenizer.batch_decode(final_preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    for label_list in labels:
        for i in range(len(label_list)):
            if label_list[i] == -100:
                label_list[i] = tokenizer.pad_token_id
    # labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    assert len(decoded_preds) == len(valiation_set)
    references = []
    final_preds = []

    for i, pred in enumerate(decoded_preds):
        final_preds.append({
            "id": validation_set[i]['id'],
            "prediction_text": pred
        })
        references.append({
            "id": validation_set[i]['id'],
            "answers": validation_set[i]['answers']
        })

    result = metric.compute(predictions=final_preds, references=references)
    print(result)
    return result

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Tokenize the dataset

In [22]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Collate function to be used by the dataloader for constructing the batches.

In [23]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [24]:
validation_set = [example for example in tokenized_dataset["validation"]]

Removing the fields that re not required.

In [25]:
tokenized_dataset['train'] = tokenized_dataset['train'].remove_columns(["title","id", "context", "question", "answers"])
tokenized_dataset["validation"] = tokenized_dataset["validation"].remove_columns(["title", "id", "context", "question", "answers"])

Building the train and validation dataloader.

In [26]:
train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"], batch_size=8, collate_fn=data_collator
)

Using the AdamW optimizer with a lr of 5e-5 that linearly decays for ~10K steps.

In [27]:
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

10950




Evaluation function used to evaluate after every 1000 steps.

In [28]:
def evaluate_model():
    model.eval()
    predictions = []
    labels = []
    print("Running Evaluation")
    for batch in tqdm(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        val_loss = outputs.loss
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.tolist())
        labels.extend(batch['labels'].tolist())
    print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{num_training_steps}], Val_loss: {val_loss.item():.4f}")
    compute_metrics(predictions, labels, validation_set)
    model.train()

Training

In [None]:
progress_bar = tqdm(range(num_training_steps))
step = 0
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        if step%100 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}/{num_training_steps}], lr: {optimizer.param_groups[0]['lr']}, Loss: {loss.item()}")
        if step > 0 and step%1000 == 0:
            evaluate_model()
        step+=1

    model.save_pretrained(f"checkpoint_{epoch}")


  0%|          | 0/10950 [00:00<?, ?it/s]

Epoch [1/1], Step [1/10950], lr: 4.999543378995434e-05, Loss: 0.36001577973365784
Epoch [1/1], Step [101/10950], lr: 4.9538812785388126e-05, Loss: 0.592479944229126
Epoch [1/1], Step [201/10950], lr: 4.908219178082192e-05, Loss: 0.5683993101119995
Epoch [1/1], Step [301/10950], lr: 4.862557077625571e-05, Loss: 0.5309613347053528
Epoch [1/1], Step [401/10950], lr: 4.8168949771689495e-05, Loss: 0.6972126960754395
Epoch [1/1], Step [501/10950], lr: 4.771232876712329e-05, Loss: 0.9026136994361877
Epoch [1/1], Step [601/10950], lr: 4.725570776255708e-05, Loss: 0.5434806942939758
Epoch [1/1], Step [701/10950], lr: 4.679908675799087e-05, Loss: 0.7079184055328369
Epoch [1/1], Step [801/10950], lr: 4.6342465753424656e-05, Loss: 0.6725529432296753
Epoch [1/1], Step [901/10950], lr: 4.588584474885845e-05, Loss: 0.38622960448265076
Epoch [1/1], Step [1001/10950], lr: 4.542922374429224e-05, Loss: 0.3367079198360443
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [1001/10950], Val_loss: 1.2293
{'exact_match': 61.40964995269631, 'f1': 82.92882159646418}
Epoch [1/1], Step [1101/10950], lr: 4.4972602739726025e-05, Loss: 0.7071427702903748
Epoch [1/1], Step [1201/10950], lr: 4.451598173515982e-05, Loss: 0.22845439612865448
Epoch [1/1], Step [1301/10950], lr: 4.405936073059361e-05, Loss: 0.30110278725624084
Epoch [1/1], Step [1401/10950], lr: 4.3602739726027394e-05, Loss: 0.3922312259674072
Epoch [1/1], Step [1501/10950], lr: 4.3146118721461186e-05, Loss: 0.35512199997901917
Epoch [1/1], Step [1601/10950], lr: 4.268949771689498e-05, Loss: 0.23062436282634735
Epoch [1/1], Step [1701/10950], lr: 4.223287671232877e-05, Loss: 0.3516896069049835
Epoch [1/1], Step [1801/10950], lr: 4.1776255707762555e-05, Loss: 0.5046722888946533
Epoch [1/1], Step [1901/10950], lr: 4.131963470319635e-05, Loss: 0.7041322588920593
Epoch [1/1], Step [2001/10950], lr: 4.086301369863014e-05, Loss: 0.18999803066253662
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [2001/10950], Val_loss: 0.8357
{'exact_match': 61.53263954588458, 'f1': 82.64066106983107}
Epoch [1/1], Step [2101/10950], lr: 4.0406392694063925e-05, Loss: 0.29641860723495483
Epoch [1/1], Step [2201/10950], lr: 3.9949771689497717e-05, Loss: 0.4672330319881439
Epoch [1/1], Step [2301/10950], lr: 3.949315068493151e-05, Loss: 1.1251286268234253
Epoch [1/1], Step [2401/10950], lr: 3.9036529680365294e-05, Loss: 0.4827827513217926
Epoch [1/1], Step [2501/10950], lr: 3.8579908675799086e-05, Loss: 0.2608330249786377
Epoch [1/1], Step [2601/10950], lr: 3.812328767123288e-05, Loss: 0.5180926322937012
Epoch [1/1], Step [2701/10950], lr: 3.766666666666667e-05, Loss: 0.3661644458770752
Epoch [1/1], Step [2801/10950], lr: 3.7210045662100455e-05, Loss: 0.3226299583911896
Epoch [1/1], Step [2901/10950], lr: 3.675342465753425e-05, Loss: 0.47245466709136963
Epoch [1/1], Step [3001/10950], lr: 3.629680365296804e-05, Loss: 0.15409287810325623
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [3001/10950], Val_loss: 0.9471
{'exact_match': 62.75307473982971, 'f1': 83.20114365304343}
Epoch [1/1], Step [3101/10950], lr: 3.5840182648401824e-05, Loss: 0.540664553642273
Epoch [1/1], Step [3201/10950], lr: 3.5383561643835616e-05, Loss: 0.9144776463508606
Epoch [1/1], Step [3301/10950], lr: 3.492694063926941e-05, Loss: 0.8321917057037354
Epoch [1/1], Step [3401/10950], lr: 3.447031963470319e-05, Loss: 0.40719369053840637
Epoch [1/1], Step [3501/10950], lr: 3.4013698630136985e-05, Loss: 0.41773664951324463
Epoch [1/1], Step [3601/10950], lr: 3.355707762557078e-05, Loss: 0.5033920407295227
Epoch [1/1], Step [3701/10950], lr: 3.310045662100457e-05, Loss: 0.425541490316391
Epoch [1/1], Step [3801/10950], lr: 3.2643835616438354e-05, Loss: 0.500240683555603
Epoch [1/1], Step [3901/10950], lr: 3.2187214611872146e-05, Loss: 0.5143626928329468
Epoch [1/1], Step [4001/10950], lr: 3.173059360730594e-05, Loss: 0.34943756461143494
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [4001/10950], Val_loss: 0.6057
{'exact_match': 62.95175023651845, 'f1': 83.3034523743453}
Epoch [1/1], Step [4101/10950], lr: 3.1273972602739723e-05, Loss: 0.3333094120025635
Epoch [1/1], Step [4201/10950], lr: 3.0817351598173515e-05, Loss: 0.5442306995391846
Epoch [1/1], Step [4301/10950], lr: 3.0360730593607307e-05, Loss: 0.4355980455875397
Epoch [1/1], Step [4401/10950], lr: 2.9904109589041096e-05, Loss: 0.2929490804672241
Epoch [1/1], Step [4501/10950], lr: 2.9447488584474885e-05, Loss: 0.31496724486351013
Epoch [1/1], Step [4601/10950], lr: 2.8990867579908677e-05, Loss: 0.4169906675815582
Epoch [1/1], Step [4701/10950], lr: 2.8534246575342465e-05, Loss: 0.35080334544181824
Epoch [1/1], Step [4801/10950], lr: 2.8077625570776257e-05, Loss: 0.45403894782066345
Epoch [1/1], Step [4901/10950], lr: 2.7621004566210046e-05, Loss: 0.3172782361507416
Epoch [1/1], Step [5001/10950], lr: 2.7164383561643834e-05, Loss: 0.4038524329662323
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [5001/10950], Val_loss: 0.7871
{'exact_match': 63.519394512771996, 'f1': 83.75062849247668}
Epoch [1/1], Step [5101/10950], lr: 2.6707762557077626e-05, Loss: 0.6300233602523804
Epoch [1/1], Step [5201/10950], lr: 2.6251141552511415e-05, Loss: 0.25217923521995544
Epoch [1/1], Step [5301/10950], lr: 2.5794520547945207e-05, Loss: 0.40833520889282227
Epoch [1/1], Step [5401/10950], lr: 2.5337899543378995e-05, Loss: 0.7967028617858887
Epoch [1/1], Step [5501/10950], lr: 2.4881278538812784e-05, Loss: 0.46889549493789673
Epoch [1/1], Step [5601/10950], lr: 2.4424657534246576e-05, Loss: 0.5759041905403137
Epoch [1/1], Step [5701/10950], lr: 2.3968036529680365e-05, Loss: 0.5227031707763672
Epoch [1/1], Step [5801/10950], lr: 2.3511415525114157e-05, Loss: 0.3783872127532959
Epoch [1/1], Step [5901/10950], lr: 2.3054794520547945e-05, Loss: 0.6361090540885925
Epoch [1/1], Step [6001/10950], lr: 2.2598173515981734e-05, Loss: 0.3001135289669037
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [6001/10950], Val_loss: 1.0501
{'exact_match': 63.065279091769156, 'f1': 83.52099859449268}
Epoch [1/1], Step [6101/10950], lr: 2.2141552511415526e-05, Loss: 0.8243734240531921
Epoch [1/1], Step [6201/10950], lr: 2.1684931506849314e-05, Loss: 0.5458135008811951
Epoch [1/1], Step [6301/10950], lr: 2.1228310502283106e-05, Loss: 0.3901432752609253
Epoch [1/1], Step [6401/10950], lr: 2.0771689497716895e-05, Loss: 0.44270068407058716
Epoch [1/1], Step [6501/10950], lr: 2.0315068493150687e-05, Loss: 0.500744640827179
Epoch [1/1], Step [6601/10950], lr: 1.9858447488584475e-05, Loss: 0.2918775975704193
Epoch [1/1], Step [6701/10950], lr: 1.9401826484018264e-05, Loss: 0.420117050409317
Epoch [1/1], Step [6801/10950], lr: 1.8945205479452056e-05, Loss: 0.5423960089683533
Epoch [1/1], Step [6901/10950], lr: 1.8488584474885845e-05, Loss: 0.47634434700012207
Epoch [1/1], Step [7001/10950], lr: 1.8031963470319637e-05, Loss: 0.4501594305038452
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

Epoch [1/1], Step [7001/10950], Val_loss: 0.9227
{'exact_match': 63.4720908230842, 'f1': 83.71084679853041}
Epoch [1/1], Step [7101/10950], lr: 1.7575342465753425e-05, Loss: 0.7024349570274353
Epoch [1/1], Step [7201/10950], lr: 1.7118721461187214e-05, Loss: 0.35848650336265564
Epoch [1/1], Step [7301/10950], lr: 1.6662100456621006e-05, Loss: 0.5253626108169556
Epoch [1/1], Step [7401/10950], lr: 1.6205479452054794e-05, Loss: 0.5773834586143494
Epoch [1/1], Step [7501/10950], lr: 1.5748858447488586e-05, Loss: 0.2960149943828583
Epoch [1/1], Step [7601/10950], lr: 1.5292237442922375e-05, Loss: 0.38494160771369934
Epoch [1/1], Step [7701/10950], lr: 1.4835616438356165e-05, Loss: 1.1811506748199463
Epoch [1/1], Step [7801/10950], lr: 1.4378995433789955e-05, Loss: 0.2531850039958954
Epoch [1/1], Step [7901/10950], lr: 1.3922374429223744e-05, Loss: 0.21467162668704987
Epoch [1/1], Step [8001/10950], lr: 1.3465753424657534e-05, Loss: 0.3838340640068054
Running Evaluation


  0%|          | 0/1322 [00:00<?, ?it/s]

In [None]:
evaluate_squad(model)

In [None]:
# Training using HuggingFace Trainer class.
model_id="google/flan-t5-small"
dataset_id="squad"

# Hugging Face repository id
repository_id = f"{model_id.split('/')[1]}-{dataset_id}"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=repository_id,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=5e-5,
    num_train_epochs=1,
    max_steps=5000,
    # logging & evaluation strategies
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=5,
    evaluation_strategy="no",
    save_strategy="no",
    save_total_limit=2,
    load_best_model_at_end=True,
    # report_to="wandb",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"]
)
trainer.train()