<a href="https://colab.research.google.com/github/Servat0r/HLT-Project-2023/blob/master/QuestionAugmentedSquaD_2000examples_FlatDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Test Settings:

1. **Model**: MT5-base;
2. **Dataset**: LMQG/QG-Squad;
3. **Examples**: $5000$;
4. **Other**: $2000$ examples from the training set, expanded to $10000$ through question augmentation (one training example per $\langle$ answer, context, question $\rangle$ triple)

In [None]:
# Mount Google Drive at /content/drive (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the Drive folder; relative paths used below resolve against it.
%cd "/content/drive/MyDrive/Colab Notebooks"

/content/drive/MyDrive/Colab Notebooks


In [None]:
# Execute the companion notebook inside this session.
# NOTE(review): helpers used later but not defined in this file (e.g. get_dataset,
# build_train_feature, main_training_loop, evaluation_loop) presumably come from it — confirm.
%run AveragedCrossEntropy.ipynb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks
Collecting transformers[sentencepiece]
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers[sentencepiece])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[sentencepiece])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Updated!


In [None]:
from transformers import AutoModel, AutoTokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer, AdamW, DataCollatorWithPadding
from datasets import load_dataset, Dataset, load_from_disk, load_metric
import numpy as np
import evaluate
import torch
import os
from dotenv import load_dotenv

In [None]:
# Checkpoint identifier shared by the tokenizer and the model loaded below.
model_checkpoint='google/mt5-base'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Register the separator token and the three marker tokens; the displayed value is the number of
# tokens added, which is why the model embeddings are resized further down.
tokenizer.add_special_tokens({'sep_token': '<sep>', 'additional_special_tokens': ['<answer>', '<context>', '<hl>']})

4

In [None]:
# Default token budgets for model inputs and for target questions.
MAX_INPUTS_LENGTH = 512
MAX_LABELS_LENGTH = 64
def tokenizer_function(
    samples, max_inputs_length=MAX_INPUTS_LENGTH, max_labels_length=MAX_LABELS_LENGTH,
    input_ids_padding=True, train_dataset=None, ignore_index_id=0, question_column='question',
):
  """Tokenize the inputs and target questions of one example or one batch.

  Args:
    samples: mapping holding the model input text under 'answer_context' and the target
      text under `question_column` (a single string or a list of strings).
    max_inputs_length: truncation length for the inputs.
    max_labels_length: length the labels are padded / truncated to. Ignored when
      `train_dataset` is given.
    input_ids_padding: padding strategy forwarded to the tokenizer for the inputs.
    train_dataset: when truthy, the label length is taken from
      `get_maximum_labels_length(train_dataset)` instead of `max_labels_length`.
    ignore_index_id: value written over the padded label positions. The default 0 leaves
      the labels exactly as produced by the tokenizer.
    question_column: name of the column that holds the target question.

  Returns:
    The tokenizer output for the inputs, extended with a 'labels' entry that holds one
    label tensor per example.
  """
  if train_dataset:
    max_labels_length = get_maximum_labels_length(train_dataset)
  input_tokenized = tokenizer(samples['answer_context'], padding=input_ids_padding, max_length=max_inputs_length, truncation=True, return_tensors='pt')
  labels_tokenized = tokenizer(samples[question_column], padding="max_length", max_length=max_labels_length, truncation=True, return_tensors='pt')
  labels, masks = labels_tokenized['input_ids'], labels_tokenized['attention_mask']
  # Previously this masking lived inside a string literal, so `ignore_index_id` was silently
  # ignored. Positions whose attention mask is 0 are padding; overwrite them when requested.
  if ignore_index_id != 0:
    labels = labels.masked_fill(masks == 0, ignore_index_id)
  # Store one tensor per example rather than a single 2-D tensor.
  input_tokenized['labels'] = list(labels)
  return input_tokenized

In [None]:
def get_maximum_inputs_length(dataset):
  """Return the length, in token ids, of the longest 'answer_context' entry in `dataset`.

  Lengths are measured with `tokenizer.encode`, which includes the special tokens that the
  tokenizer appends when it builds model inputs. Measuring with `tokenizer.tokenize` leaves
  those tokens out, and using that smaller number as `max_length` with truncation enabled
  cuts the longest sample short.

  Args:
    dataset: iterable of mappings that expose the input text under 'answer_context'.

  Returns:
    The maximum encoded length found.

  Raises:
    ValueError: if `dataset` contains no samples.
  """
  longest = max((len(tokenizer.encode(sample['answer_context'])) for sample in dataset), default=None)
  if longest is None:
    raise ValueError("Cannot compute the maximum input length of an empty dataset.")
  return longest

In [None]:
def load_and_preprocess_question_augmented_squad(
  dataset_name='lmqg/qg_squad', train_dataset_name='lmqg_squad_augmented_train',
  eval_dataset_name='lmqg_squad_augmented_eval', test_dataset_name='lmqg_squad_augmented_test',
  train_select=None, eval_select=None, use_extra_ids=False, num_questions_per_example=5,
):
  """Load the question-augmented dataset, flatten the training split and tokenize all splits.

  The training split is flattened so that every kept entry of an example's 'questions' list
  becomes its own row; the original 'question' is repeated on each of those rows.

  Args:
    dataset_name: dataset identifier passed to `get_dataset` and, if nothing is available
      locally, to `load_dataset`.
    train_dataset_name, eval_dataset_name, test_dataset_name: names passed to `get_dataset`
      and used as `save_to_disk` targets after a download.
    train_select: if truthy, keep only the first `train_select` training rows.
    eval_select: if truthy, keep only the first `eval_select` validation rows.
    use_extra_ids: forwarded to `build_train_feature`.
    num_questions_per_example: upper bound on how many questions are kept per training
      example; it is additionally capped by the number of questions of the first example.

  Returns:
    `(train, validation, test), (tokenized_train, tokenized_validation, tokenized_test)`.
    The tokenized splits are set to the "torch" format.

  Raises:
    KeyError: if the training split has no 'questions' column.
  """
  dataset_loading_result = get_dataset(dataset_name, train_dataset_name, eval_dataset_name, test_dataset_name)
  if dataset_loading_result['local']:
    train_dataset = dataset_loading_result['train']
    validation_dataset = dataset_loading_result['eval']
    test_dataset = dataset_loading_result['test']
  else:
    datasets = load_dataset(dataset_name)
    # The 'question' column must be kept: it is read while flattening the training split, used
    # as the label column for validation / test, and removed by name after tokenization.
    # Dropping it here made this branch fail further down.
    train_dataset = datasets['train']
    validation_dataset = datasets['validation']
    test_dataset = datasets['test']
    print(f"Train dataset has {len(train_dataset)} items. Validation dataset has {len(validation_dataset)} items.")

    train_dataset.save_to_disk(train_dataset_name)
    validation_dataset.save_to_disk(eval_dataset_name)
    test_dataset.save_to_disk(test_dataset_name)

  if train_select:
    train_dataset = train_dataset.select(range(train_select))
  if eval_select:
    validation_dataset = validation_dataset.select(range(eval_select))

  print(train_dataset)
  if 'questions' not in train_dataset.column_names:
    raise KeyError("The training split has no 'questions' column; the question-augmented dataset is required.")

  answers, contexts, questions, question_repeated = [], [], [], []
  # Read only the first row instead of materialising the whole 'questions' column.
  num_questions_to_take = min(len(train_dataset[0]['questions']), num_questions_per_example)
  for item in train_dataset:
    for i, question in enumerate(item['questions']):
      if i >= num_questions_to_take:
        break
      question_repeated.append(item['question'])
      answers.append(item['answer'])
      contexts.append(item['context'])
      questions.append(question)
  print(len(answers), len(questions), len(contexts))
  train_dataset = Dataset.from_dict({'answer': answers, 'questions': questions, 'context': contexts, 'question': question_repeated})

  build_train_feature_local = lambda sample: build_train_feature(sample, use_extra_ids=use_extra_ids, context_label='context')
  train_dataset = train_dataset.map(build_train_feature_local).remove_columns(['answer', 'context'])
  validation_dataset = validation_dataset.map(build_train_feature_local).remove_columns(['answer', 'context'])
  test_dataset = test_dataset.map(build_train_feature_local).remove_columns(['answer', 'context'])

  # Training rows are padded to the longest input / label observed in the training split.
  max_train_inputs_length = get_maximum_inputs_length(train_dataset)
  max_train_labels_length = get_maximum_labels_length(train_dataset)
  print(max_train_inputs_length, max_train_labels_length)
  train_tokenizer = lambda samples: tokenizer_function(samples, question_column='questions', input_ids_padding='max_length', max_inputs_length=max_train_inputs_length, max_labels_length=max_train_labels_length)
  tokenized_train_dataset = train_dataset.map(train_tokenizer, batched=False).remove_columns(['answer_context', 'question', 'questions'])
  tokenized_validation_dataset = validation_dataset.map(tokenizer_function, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_test_dataset = test_dataset.map(lambda samples: tokenizer_function(samples, input_ids_padding="max_length", max_labels_length=32), batched=True).remove_columns(['answer_context', 'question'])

  # The training split was tokenized one row at a time, so every field carries a leading
  # dimension of size one; strip it.
  tokenized_train_dataset = tokenized_train_dataset.map(lambda sample: {'input_ids': sample['input_ids'][0], 'attention_mask': sample['attention_mask'][0], 'labels': sample['labels'][0]})
  tokenized_train_dataset.set_format("torch")
  tokenized_validation_dataset.set_format("torch")
  tokenized_test_dataset.set_format("torch")

  return (train_dataset, validation_dataset, test_dataset), (tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset)

In [None]:
# Build the raw and tokenized train / validation / test splits: at most five augmented questions
# are kept per training example and validation is cut to its first 2000 rows.
(train_dataset, validation_dataset, test_dataset), (tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset) = \
  load_and_preprocess_question_augmented_squad(train_dataset_name='lmqg_squad_augmented_train_0_2000', num_questions_per_example=5, eval_select=2000)

Dataset({
    features: ['answer', 'question', 'context', 'questions'],
    num_rows: 2000
})
10000 10000 10000


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

683 44


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

### 3. Model Loading and configuration

#### Loading

In [None]:
# Load the pretrained weights of the checkpoint into a conditional-generation model.
model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Resize the embedding matrix to the tokenizer's current size, which includes the added tokens.
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 250104. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(250104, 768)

In [None]:
# Display the module tree of the model.
model

MT5ForConditionalGeneration(
  (shared): Embedding(250104, 768)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250104, 768)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
         

#### Configuration

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU, then move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
# Number of epochs handed to both the training configuration and the training loop.
NUM_EPOCHS = 10

In [None]:
# Create the optimizer, the train / eval dataloaders, the LR scheduler and the total step count.
# NOTE(review): get_training_configuration is not defined in this file; it presumably picks up
# the tokenized datasets from the notebook namespace — confirm.
optimizer, train_dataloader, eval_dataloader, lr_scheduler, \
  num_training_steps = get_training_configuration(train_batch_size=2, eval_batch_size=2, tokenizer=tokenizer, learning_rate=1e-4, num_epochs=NUM_EPOCHS)

50000




### 4. Fine-tuning

#### Execution

In [None]:
# Sanity check: shape of the training labels (number of rows x padded label length).
tokenized_train_dataset['labels'].shape

torch.Size([10000, 44])

In [None]:
# Run the fine-tuning loop and keep the dictionary it returns.
# NOTE(review): main_training_loop is not defined in this file; judging by the recorded output it
# asks after every epoch whether to save the model and whether to continue — confirm.
# NOTE(review): model_save_path mentions "20epochs" while NUM_EPOCHS is 10 — confirm the name.
training_results_dict = main_training_loop(
    model, device, optimizer, train_dataloader, eval_dataloader,
    lr_scheduler, num_training_steps, num_epochs=NUM_EPOCHS, metrics=None,
    eval_strategy='epoch', eval_every=2000, model_save_path='lmqg_squad_augmented_mt5base_20epochs_2000examples',
    early_stopping=True, early_stopping_patience=10, tokenizer=tokenizer,
  )

# Per-epoch histories read back from the result dictionary and printed one list per line.
epoch_train_losses = training_results_dict['epoch_train_losses']
epoch_eval_losses = training_results_dict['epoch_eval_losses']
epoch_eval_metrics = training_results_dict['epoch_eval_metrics']
print(epoch_train_losses, epoch_eval_losses, epoch_eval_metrics, sep='\n')

  0%|          | 0/50000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  labels_batch = torch.tensor(batch['labels'])


Epoch 0: Train Loss = 2.764678716659546, Eval Loss = 2.371234893798828
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 1: Train Loss = 1.7972885370254517, Eval Loss = 1.9653733968734741
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 2: Train Loss = 1.0632888078689575, Eval Loss = 1.5438541173934937
Save this model (y/n)?> y
Continue training (y/n)?> y


  labels_batch = torch.tensor(batch['labels'])


Epoch 3: Train Loss = 0.5043489933013916, Eval Loss = 0.34864509105682373
Save this model (y/n)?> y
Continue training (y/n)?> y


  labels_batch = torch.tensor(batch['labels'])


Epoch 4: Train Loss = 0.178693488240242, Eval Loss = 0.2782707214355469
Save this model (y/n)?> y
Continue training (y/n)?> y


  labels_batch = torch.tensor(batch['labels'])


Epoch 5: Train Loss = 0.31684792041778564, Eval Loss = 0.29162293672561646
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 6: Train Loss = 0.3581695854663849, Eval Loss = 0.3061820864677429
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 7: Train Loss = 0.40888357162475586, Eval Loss = 0.31296879053115845
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 8: Train Loss = 0.38944295048713684, Eval Loss = 0.31036922335624695
Save this model (y/n)?> n
Continue training (y/n)?> n
[2.764678716659546, 1.7972885370254517, 1.0632888078689575, 0.5043489933013916, 0.178693488240242, 0.31684792041778564, 0.3581695854663849, 0.40888357162475586, 0.38944295048713684]
[2.371234893798828, 1.9653733968734741, 1.5438541173934937, 0.34864509105682373, 0.2782707214355469, 0.29162293672561646, 0.3061820864677429, 0.31296879053115845, 0.31036922335624695]
[]


In [None]:
# Save the current model weights; the directory name embeds training_results_dict['epoch'].
model.save_pretrained(f"squad_qg_augmented_allitems_mt5base_20epochsmax_2000examples_(epoch {training_results_dict['epoch']})")

In [None]:
# Pause for 180 seconds before continuing.
# NOTE(review): presumably this gives Drive time to finish writing the checkpoint — confirm.
from time import sleep
sleep(180)

### 5. Analysis of the Results

### Calculating BLEU, NIST and ROUGE scores

In [None]:
# Empty trackers plus the metric configurations (BLEU, NIST, ROUGE), each built with the tokenizer.
loss_tracker, metrics_tracker = [], []
metrics = dict(
    bleu=get_bleu_config(tokenizer),
    nist=get_nist_config(tokenizer),
    rouge=get_rouge_config(tokenizer),
)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Switch for the next cells: when True they reload the saved model, rebuild the training
# configuration and move the model to the device.
P = True

In [None]:
# Reload fine-tuned weights from disk.
# NOTE(review): the directory name is hard-coded to "(epoch 9)"; it has to match the name written
# by the save_pretrained cell, which is derived from training_results_dict['epoch'] — confirm.
if P:
  model = MT5ForConditionalGeneration.from_pretrained('squad_qg_augmented_allitems_mt5base_20epochsmax_2000examples_(epoch 9)')

In [None]:
# Rebuild optimizer, dataloaders and scheduler, this time with batch size 4 and two epochs.
# NOTE(review): of these, only optimizer and lr_scheduler are used again in this notebook (they
# are passed to evaluation_loop) — confirm the remaining objects are needed.
if P:
  optimizer, train_dataloader, eval_dataloader, lr_scheduler, \
    num_training_steps = get_training_configuration(train_batch_size=4, eval_batch_size=4, tokenizer=tokenizer, learning_rate=1e-4, num_epochs=2)

5000




In [None]:
# Move the reloaded model to the GPU when available (CPU otherwise); print() emits a blank line.
if P:
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
  print()




In [None]:
# Shuffle the raw and the tokenized test split with the same seed.
# NOTE(review): this presumably keeps the two views row-aligned (same length, same seed) — confirm.
# Re-running this cell shuffles the already shuffled datasets again.
test_dataset, tokenized_test_dataset = test_dataset.shuffle(seed=42), tokenized_test_dataset.shuffle(seed=42)

In [None]:
# Keep the first 200 rows of each shuffled test view as a small subset.
tokenized_test_dataset_reduced = tokenized_test_dataset.select(range(200))
test_dataset_reduced = test_dataset.select(range(200))

In [None]:
# Reference questions of the first four test rows, for comparison with the generations below.
test_dataset_reduced['question'][0:4]

['What did Herr Gott, dich loben wir become known as ?',
 "What was the name of the Doctor Who play from the 1980's?",
 'What agreement was made for trade with natives and British?',
 'Who was Count of Melfi']

In [None]:
# Generate questions for the first four test inputs with beam search.
# Only the needed rows are fetched (slicing the dataset rather than materialising the whole
# input_ids column), and the attention mask is passed explicitly so that the handling of the
# padded positions does not depend on generate() inferring a mask from the pad token.
model.eval()
sample_batch = tokenized_test_dataset[0:4]
with torch.no_grad():
  predictions = model.generate(
      sample_batch['input_ids'].to(device),
      attention_mask=sample_batch['attention_mask'].to(device),
      max_length=64, num_beams=4,
  )

In [None]:
# Turn the generated ids back into text, dropping special tokens.
tokenizer.batch_decode(predictions, skip_special_tokens=True)

['What was the name of the German Te Deum?',
 'What was the name of the play that Jon Pertwee and Colin Baker played in 1980?',
 'When did Christopher Gist conclude the Treaty of Logstown?',
 'Who was the elected leader of the Drengot family?']

In [None]:
from tqdm.auto import tqdm
# DataLoader is not imported anywhere else in this notebook; importing it here removes the
# dependency on the name already being present in the session.
from torch.utils.data import DataLoader

# Evaluate on the full tokenized test split in batches of 8; batches are drawn in shuffled order.
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=True, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
test_loss_tracker, test_metrics_tracker, num_test_steps = [], [], len(test_dataloader)
test_progress_bar = tqdm(range(num_test_steps))
# NOTE(review): evaluation_loop is defined outside this file; the trackers passed in are
# presumably filled in place — confirm.
test_loss = evaluation_loop(
    model, device, optimizer, test_dataloader, lr_scheduler, test_loss_tracker, test_metrics_tracker, metrics, test_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/1322 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  labels_batch = torch.tensor(batch['labels'])


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics = {'bleu': {'bleu': 0.19702508079891198, 'precisions': [0.4884865313365153, 0.23393938303480663, 0.14342807718691283, 0.09193818945310331], 'brevity_penalty': 1.0, 'length_ratio': 1.0159494558322761, 'translation_length': 121727, 'reference_length': 119816}, 'nist': {'nist_mt': 5.213706552330381}, 'rouge': {'rouge1': 0.4842570182006559, 'rouge2': 0.2600564617600638, 'rougeL': 0.4479914385162102, 'rougeLsum': 0.4479831513431553}}
