<a href="https://colab.research.google.com/github/Servat0r/HLT-Project-2023/blob/master/LMQG_Squad_6000_examples_MT5_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

An example of an mT5 model (`google/mt5-base`) fine-tuned for Question Generation (QG) on a variant of the SQuAD v1 dataset.

### 1. Preliminaries

#### Mounting and Installing

In [None]:
# Mount the user's Google Drive into the Colab VM at /content/drive
# (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the Drive folder; the relative paths used later
# (`utils.ipynb`, saved checkpoints, result files) are resolved against it.
%cd "/content/drive/MyDrive/Colab Notebooks"

/content/drive/MyDrive/Colab Notebooks


In [None]:
!pip install "transformers[sentencepiece]"
!pip install "transformers[torch]"
!pip install datasets
!pip install evaluate

Collecting transformers[sentencepiece]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/7.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:03[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/7.4 MB[0m [31m5.3 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m3.9/7.4 MB[0m [31m37.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.4/7.4 MB[0m [31m58.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[sentencepiece])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [9

In [None]:
# Maximum number of fine-tuning epochs; passed to both the training configuration
# and the training loop below.
NUM_EPOCHS = 12

#### Imports

In [None]:
from transformers import AutoModel, AutoTokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer, AdamW, DataCollatorWithPadding
from datasets import load_dataset, Dataset, load_from_disk, load_metric
import numpy as np
import evaluate
import torch
import os

In [None]:
# Hugging Face Hub identifier of the pretrained checkpoint used for both the
# tokenizer and the model.
model_checkpoint='google/mt5-base'

In [None]:
# Load the tokenizer that matches `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [None]:
# Execute `utils.ipynb` in this notebook's namespace. The helpers called below that are
# not defined in this notebook (e.g. `load_and_preprocess_lmqg_squad_dataset_highlighting`,
# `get_training_configuration`, `main_training_loop`, `evaluation_loop`,
# `select_best_output`, `bertscore_f1based_score`) presumably come from it —
# confirm against utils.ipynb.
%run utils.ipynb

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=82604dbbec8bb2ea060ec5e5de2d6c419804cea632fb1af2a9d6ac0e9c187215
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Updated!


NOTE: when `labels` are provided, `T5`/`mT5` models compute a token-level cross-entropy (`Negative Log Likelihood`) loss by default.

### 2. Dataset Loading and Preprocessing

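We will use the LMQG version of SQuAD ([`lmqg/qg_squad`](https://huggingface.co/datasets/lmqg/qg_squad)), a variant of the `SQuAD` dataset adapted to `Question Generation` tasks. To keep training time manageable, only a subset of the examples is selected (see `train_select` and `eval_select` below).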

In [None]:
# Load the raw and the tokenized splits in one call.
# NOTE(review): `train_select` / `eval_select` presumably cap the number of examples
# kept per split after shuffling with `shuffle_seed` — confirm in utils.ipynb.
raw_splits, tokenized_splits = load_and_preprocess_lmqg_squad_dataset_highlighting(
    shuffle_seed=42,
    train_select=6000,
    eval_select=4000,
    use_extra_ids=True,
)
train_dataset, validation_dataset, test_dataset = raw_splits
tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset = tokenized_splits

### 3. Model Loading and configuration

#### Loading

In [None]:
# Load the pretrained mT5 weights (with the conditional-generation head) from `model_checkpoint`.
model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

#### Configuration

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU,
# then move the model there and display the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
# Build the optimizer, the train/eval dataloaders, the LR scheduler and the total
# number of optimisation steps for the fine-tuning run.
# NOTE(review): the helper is not given the model or the datasets, so it presumably
# reads them from the notebook namespace — confirm in utils.ipynb.
(
    optimizer,
    train_dataloader,
    eval_dataloader,
    lr_scheduler,
    num_training_steps,
) = get_training_configuration(
    train_batch_size=2,
    eval_batch_size=4,
    tokenizer=tokenizer,
    learning_rate=1e-4,
    num_epochs=NUM_EPOCHS,
)

36000




### 4. Fine-tuning

#### Execution

In [None]:
# Run the fine-tuning loop with early stopping enabled (patience of 4) and no
# extra metrics during training (`metrics=None`).
# NOTE(review): `model_save_path` is presumably used as the prefix of the
# per-epoch checkpoints — confirm in utils.ipynb.
training_results_dict = main_training_loop(
    model,
    device,
    optimizer,
    train_dataloader,
    eval_dataloader,
    lr_scheduler,
    num_training_steps,
    num_epochs=NUM_EPOCHS,
    metrics=None,
    eval_strategy='epoch',
    eval_every=2000,
    model_save_path='lmqg_squad_highlighting_extra_ids_reduced_mt5base_test',
    early_stopping=True,
    early_stopping_patience=4,
)

# Pull the per-epoch histories out of the result dictionary and print one per line.
epoch_train_losses = training_results_dict['epoch_train_losses']
epoch_eval_losses = training_results_dict['epoch_eval_losses']
epoch_eval_metrics = training_results_dict['epoch_eval_metrics']
print(
    epoch_train_losses,
    epoch_eval_losses,
    epoch_eval_metrics,
    sep='\n',
)

  0%|          | 0/36000 [00:00<?, ?it/s]

  0%|          | 0/12000 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: Train Loss = 3.395646572113037, Eval Loss = 1.9384691715240479
Epoch 1: Train Loss = 1.2586390972137451, Eval Loss = 2.017392158508301
Epoch 2: Train Loss = 1.8892892599105835, Eval Loss = 1.9290187358856201
Epoch 3: Train Loss = 1.620028018951416, Eval Loss = 1.861751914024353
Epoch 4: Train Loss = 0.9800779223442078, Eval Loss = 1.853191614151001
Epoch 5: Train Loss = 2.182864189147949, Eval Loss = 1.9483317136764526


In [None]:
# Persist the current in-memory model (weights and config) to a local directory,
# relative to the working directory.
# NOTE(review): this directory name ends in `_example`, while the training loop was
# given `model_save_path='..._test'` — confirm which one is meant to be reloaded later.
model.save_pretrained('lmqg_squad_highlighting_extra_ids_reduced_mt5base_example')

### 5. Analysis of the Results

#### Calculating BLEU, NIST and ROUGE scores

In [None]:
# Metric configurations used by the test evaluation, keyed by metric name.
# Each configuration is built by a helper that receives the tokenizer.
metrics = dict(
    bleu=get_bleu_config(tokenizer),
    nist_m=get_nist_config(tokenizer),
    rouge=get_rouge_config(tokenizer),
)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Flag guarding the next three cells: when True, a previously saved checkpoint is
# loaded from disk and the training configuration and device placement are rebuilt
# before running the test evaluation.
P = True

In [None]:
if P:
  # Replace the in-memory model with a checkpoint stored on disk; `local_files_only`
  # prevents any attempt to resolve the path on the Hugging Face Hub.
  pretrained_checkpoint_dir = (
      'HLTProject/MT5 Base LMQG Squad Reduced Highlighting Extra IDS/'
      'lmqg_squad_highlighting_extra_ids_reduced_mt5base_test_epoch3'
  )
  model = MT5ForConditionalGeneration.from_pretrained(pretrained_checkpoint_dir, local_files_only=True)

In [None]:
if P:
  # Rebuild optimizer / dataloaders / scheduler for the reloaded model. The optimizer
  # and scheduler are passed to `evaluation_loop` in the test cell below.
  # NOTE(review): batch size, learning rate and epoch count differ from the training
  # run above — presumably irrelevant for evaluation; confirm.
  (optimizer, train_dataloader, eval_dataloader,
   lr_scheduler, num_training_steps) = get_training_configuration(
      train_batch_size=4,
      eval_batch_size=4,
      tokenizer=tokenizer,
      learning_rate=1e-3,
      num_epochs=2,
  )

3000




In [None]:
if P:
  # Put the reloaded model on the GPU when available, otherwise on the CPU.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
  print()




In [None]:
# Shuffle the tokenized and the raw test split with the same seed.
# NOTE(review): the cells below index both datasets with the same row ranges, so they
# presumably rely on the two shuffles producing the same permutation — confirm.
# Re-running this cell shuffles the already shuffled datasets again.
tokenized_test_dataset = tokenized_test_dataset.shuffle(seed=42)#.select(range(1000))
test_dataset = test_dataset.shuffle(seed=42)#.select(range(1000))

In [None]:
#tokenized_test_dataset = tokenized_test_dataset.remove_columns(['question'])

In [None]:
from tqdm.auto import tqdm

# Evaluate the model on the whole test split.
# - `torch.utils.data.DataLoader` is referenced through the `torch` import at the top
#   of the notebook instead of relying on a bare `DataLoader` name leaking in from
#   `%run utils.ipynb`.
# - The test split is iterated in a fixed order (`shuffle=False`) so that the
#   per-batch losses/metrics collected in the trackers are reproducible across runs.
test_dataloader = torch.utils.data.DataLoader(
    tokenized_test_dataset,
    shuffle=False,
    batch_size=8,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),  # pads each batch dynamically
)

# Trackers are filled in place by `evaluation_loop`; the progress bar has one tick per batch.
test_loss_tracker, test_metrics_tracker, num_test_steps = [], [], len(test_dataloader)
test_progress_bar = tqdm(range(num_test_steps))

# Beam search with 4 beams and 4 candidates per input.
# NOTE(review): how the candidates are reduced to one prediction is decided inside
# `evaluation_loop` — confirm in utils.ipynb.
test_loss = evaluation_loop(
    model, device, optimizer, test_dataloader, lr_scheduler, test_loss_tracker, test_metrics_tracker, metrics, test_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/1485 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  labels_batch = torch.tensor(batch['labels'])


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics = {'bleu': {'bleu': 0.19967646183684398, 'precisions': [0.5229632123646216, 0.26113267060053513, 0.16329501232807533, 0.10581066523027202], 'brevity_penalty': 0.905979131019544, 'length_ratio': 0.9101342477746972, 'translation_length': 124743, 'reference_length': 137060}, 'nist_m': {'nist_mt': 5.5543170363436944}, 'rouge': {'rouge1': 0.48816805565936566, 'rouge2': 0.27165873081875247, 'rougeL': 0.45393792787236975, 'rougeLsum': 0.45390865648441286}}


In [None]:
import json
# Write the per-step test losses and metrics collected by the test evaluation cell to a JSON file.
# NOTE(review): the file name says `epoch11`, while the checkpoint reloaded above is
# `..._test_epoch3` — confirm which checkpoint these results belong to.
# NOTE(review): the recorded output of this cell is a NameError, presumably because it
# was executed before the trackers existed in the kernel — re-run after the test cell.
with open('lmqg_squad_highlighting_extra_ids_reduced_mt5_base_epoch11_test_results.json', 'w') as out_file:
  json.dump({'loss': test_loss_tracker, 'metrics': test_metrics_tracker}, out_file)

NameError: ignored

In [None]:
# Reference (ground-truth) questions for rows 18–29 of the shuffled test split.
test_dataset[18:30]['question']

["How many times was the release date for Kanye's first album pushed back?",
 'How much money did American Idol generate from ads in its seventh season?',
 'What do many characterize the new perspective as being more informed with?',
 'How many nocturnes did Chopin compose?',
 'According to Dibben, what are the benefits of applying process metaphysics to examining management and business administration as a component of social science?',
 'With which of his friends did Schwarzenegger start a bricklaying company?',
 'How many people watched the season 14 finale?',
 'Which magazine did Beyonce pose on the cover for in August of 2015?',
 'What kind of operation did this massing of aircraft produce?',
 'What stadium do the New York Jets call home?',
 'Where did early Iranian people establish societies?',
 'With the right tools, what area could get free BBC broadcasts from Astra 2D?']

In [None]:
# Generate 4 beam-search candidates per input for rows 18–29 and keep, for each input,
# the candidate chosen by `select_best_output` with the BERTScore-F1 based score.
# NOTE(review): the reference questions are passed to the selector, so the choice
# presumably uses the ground truth (oracle selection) — confirm in utils.ipynb.
example_rows = slice(18, 30)
final_predictions = select_best_output(
    model,
    tokenizer,
    tokenized_test_dataset['input_ids'][example_rows].to(device),
    test_dataset[example_rows]['question'],
    score_function=bertscore_f1based_score,
    max_length=64,
    num_beams=4,
    top_k=None,
    top_p=None,
    num_candidates=4,
    verbose=False,
    tokenize_output=False,
)

In [None]:
# Display the generated questions currently stored in `final_predictions`.
final_predictions

['How many times did The College Dropout have its release postponed?',
 'How much income did American Idol earn in 2004?',
 'What is the new perspective of environmental anthropology more informed with?',
 'How many Chopin nocturnes are more structured than Field?',
 'What is the purpose of applied process thought for Dibben?',
 'Who started a bricklaying business?',
 'How many viewers attended the fourteenth season finale?',
 'What magazine released the cover of the September issue?',
 "What type of operation was the largest in People's Liberation Army history?",
 'What is the name of the stadium where the New York Jets play their home games?',
 'Where did the various Iranian tribes settled?',
 'Where did BBC channels "free-to-air" be broadcasted?']

In [None]:
# Load the BLEU metric through the `evaluate` package imported at the top of the
# notebook, instead of relying on a bare `load` name that is not imported here.
bleu = evaluate.load('bleu')

In [None]:
# BLEU of the current `final_predictions` against the reference questions of rows 18–29.
bleu.compute(predictions=final_predictions, references=test_dataset['question'][18:30])

{'bleu': 0.08648524256791038,
 'precisions': [0.484375,
  0.1724137931034483,
  0.0673076923076923,
  0.021739130434782608],
 'brevity_penalty': 0.8225775623986646,
 'length_ratio': 0.8366013071895425,
 'translation_length': 128,
 'reference_length': 153}

In [None]:
# Load the BERTScore metric through the `evaluate` package imported at the top of the
# notebook, instead of relying on a bare `load` name that is not imported here.
bert_score = evaluate.load('bertscore')

In [None]:
# Per-example BERTScore precision / recall / F1 of the current `final_predictions`
# against the reference questions of rows 18–29, using the English (`lang='en'`) setting.
bert_score.compute(predictions=final_predictions, references=test_dataset['question'][18:30], lang='en')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.9258518218994141,
  0.9532108306884766,
  0.9330285787582397,
  0.9159680604934692,
  0.9182608127593994,
  0.9515888690948486,
  0.952564001083374,
  0.9283753037452698,
  0.9060604572296143,
  0.9104849100112915,
  0.9456143975257874,
  0.8807752132415771],
 'recall': [0.9313098192214966,
  0.9263368844985962,
  0.925735592842102,
  0.9452849626541138,
  0.8769401907920837,
  0.9000390768051147,
  0.9603040814399719,
  0.9009914994239807,
  0.8934392929077148,
  0.9344304800033569,
  0.9393361210823059,
  0.8587230443954468],
 'f1': [0.9285728335380554,
  0.9395817518234253,
  0.9293677806854248,
  0.9303956031799316,
  0.8971249461174011,
  0.9250963926315308,
  0.9564184546470642,
  0.9144784808158875,
  0.8997055888175964,
  0.9223023056983948,
  0.9424647688865662,
  0.8696093559265137],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.31.0)'}

In [None]:
# Reference (ground-truth) questions for rows 108–119 of the shuffled test split.
test_dataset[108:120]['question']

['When was slavery completely outlawed in the state of New York?',
 'The idea that people are unchanging and stay the same even through changes is considered what?',
 'What can be worn in cold winter weather?',
 'Where did Chopin debut after completing his studies?',
 "Who was Alfonso III's third son and what area did he receive?",
 'What river was adjacent to HMNB Davenport?',
 'In what city was the 1966 NABBA Mr. Universe competition held?',
 "How much did Chopin's funeral cost?",
 'Afonso heard Jesus promising what?',
 'What was the last year that a republican candidate won all four boroughs of NYC?',
 'Who used the the Ordos region as a place to stage raids?',
 'How large was the number of injured in Beichuan County?']

In [None]:
# Generate 4 beam-search candidates per input for rows 108–119 and keep the candidate
# chosen by `select_best_output` with the BERTScore-F1 based score.
# A single slice object is used for both the inputs and the reference questions so that
# they always refer to the same rows (the references were previously taken from rows
# 18–29, i.e. from unrelated examples).
example_rows = slice(108, 120)
final_predictions = select_best_output(
    model,
    tokenizer,
    tokenized_test_dataset['input_ids'][example_rows].to(device),
    test_dataset[example_rows]['question'],
    score_function=bertscore_f1based_score,
    max_length=64,
    num_beams=4,
    top_k=None,
    top_p=None,
    num_candidates=4,
    verbose=False,
    tokenize_output=False,
)

In [None]:
# Display the generated questions currently stored in `final_predictions`.
final_predictions

['When was slavery completely abolished in the state?',
 'What is a thing or person often seen as having a core identity?',
 'What are authorized for winter wear in cold climates?',
 'Where did Chopin make his debut?',
 'What was the name of the third king?',
 'Where was the first dockyard opened?',
 'Where did Schwarzenegger attend the NABBA Mr. Universe competition?',
 'What was the amount of the funeral and monument in Warsaw?',
 'What did Jesus promise to the Portuguese?',
 'When did President Calvin Coolidge win the five boroughs?',
 'How did the Ordos region become a rallying base to stage raids into Ming China?',
 'How many people were injured in Beichuan?']

In [None]:
# Per-example BERTScore F1 of the current `final_predictions` against the reference
# questions of rows 108–119.
bert_score.compute(predictions=final_predictions, references=test_dataset['question'][108:120], lang='en')['f1']

[0.9682797789573669,
 0.8558161854743958,
 0.9397054314613342,
 0.9497520923614502,
 0.8970755934715271,
 0.8778442740440369,
 0.9385701417922974,
 0.8913483619689941,
 0.8766393661499023,
 0.8915377855300903,
 0.9244576096534729,
 0.9494876861572266]

In [None]:
# Per-example BERTScore precision for the same predictions and references
# (the metric is recomputed from scratch by this call).
bert_score.compute(predictions=final_predictions, references=test_dataset['question'][108:120], lang='en')['precision']

[0.9807339310646057,
 0.8681104183197021,
 0.9352701902389526,
 0.9642594456672668,
 0.9130776524543762,
 0.8948466181755066,
 0.9455004930496216,
 0.8970117568969727,
 0.890372633934021,
 0.9002717733383179,
 0.9142327308654785,
 0.956504225730896]

In [None]:
# Exploratory: list the attributes and methods available on the tokenizer object.
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_bos_token',
 '_call_one',
 '_cls_token',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_extra_ids',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_mask_token

In [None]:
# Vocabulary size reported by the tokenizer before any tokens are added below.
tokenizer.vocab_size

250100

In [None]:
# Exploratory: show the built-in documentation of `add_special_tokens`.
help(tokenizer.add_special_tokens)

Help on method add_special_tokens in module transformers.tokenization_utils_base:

add_special_tokens(special_tokens_dict: Dict[str, Union[str, tokenizers.AddedToken]], replace_additional_special_tokens=True) -> int method of transformers.models.t5.tokenization_t5_fast.T5TokenizerFast instance
    Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
    special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
    current vocabulary).
    
    Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding
    matrix of the model so that its embedding matrix matches the tokenizer.
    
    In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
    
    Using `add_special_tokens` will ensure your special tokens can be used in several ways:
    
    - Special tokens are carefully handled by the tokenizer (

In [None]:
# Additional special tokens currently registered on the tokenizer.
tokenizer.additional_special_tokens

[]

In [None]:
# Register three marker tokens as additional special tokens; the displayed value is
# the integer returned by the call.
# NOTE(review): this runs after the test inputs were already tokenized, and the model's
# embedding matrix is not resized here — presumably exploratory only; confirm before
# reusing this tokenizer with the model.
tokenizer.add_special_tokens({'additional_special_tokens': ['<answer>', '<context>', '<hl>']})

3

In [None]:
# Additional special tokens currently registered on the tokenizer.
tokenizer.additional_special_tokens

['<answer>', '<context>', '<hl>']

In [None]:
# Total number of tokens known to the tokenizer, including any added tokens.
len(tokenizer)

250103

In [None]:
# Reference (ground-truth) questions for rows 990–999 of the shuffled test split.
test_dataset[990:1000]['question']

['When was Northern Rock taken into public hands?',
 'How many people work in the New York publishing industry?',
 'Until when did the Portuguese government resist decolonization of their overseas territories?',
 'Who gave Chopin a loan in September for an apartment?',
 'What was the U.S. unemployment rate in October 2009?',
 "Which team did Notre Dame's football team find inspiration from?",
 'What countries used comprehensive schools extensively?',
 'Who was the mayor of Nagano?',
 'Who did Ü-Tsang king have an alliance with?',
 'At what age did Kanye West relocate to China?']

In [None]:
# Plain beam-search generation (10 beams, at most 64 tokens) for rows 990–999,
# with the model in evaluation mode and gradient tracking disabled.
model.eval()
sample_input_ids = tokenized_test_dataset['input_ids'][990:1000].to(device)
with torch.no_grad():
  predictions = model.generate(sample_input_ids, max_length=64, num_beams=10)

In [None]:
# Turn the generated token ids back into text, dropping special tokens.
tokenizer.batch_decode(predictions, skip_special_tokens=True)

['When was Northern Rock taken into public hands?',
 'How many people employ the publishing industry?',
 'When did the CARNATION Revolution end?',
 "Who supported Chopin's loan?",
 'What was the U.S. unemployment rate in 2009?',
 'What team brought football to Notre Dame in 1887?',
 'Where is the term comprehensive school commonly used?',
 'Who was the Mayor of Nagano?',
 'Where was the Ü-Tsang king allied with?',
 'When did West move to Nanjing?']

In [None]:
# Exploratory: load the pretrained `bert-base-uncased` encoder.
from transformers import BertModel
bert_model = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Tokenizer matching the `bert-base-uncased` checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenize a single word into PyTorch tensors and display the resulting encoding.
tokenized_input = bert_tokenizer('hello', return_tensors='pt')
tokenized_input

{'input_ids': tensor([[ 101, 7592,  102]]), 'token_type_ids': tensor([[0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1]])}

In [None]:
# Run the encoding through BERT and show the shape of the last hidden state.
bert_model(**tokenized_input).last_hidden_state.shape

torch.Size([1, 3, 768])