<a href="https://colab.research.google.com/github/Servat0r/HLT-Project-2023/blob/master/LMQG_Squad_10000_examples_MT5_base_with_Teacher_Forcing%2C_Support_Classifier_Top_1_Accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

An example of QG finetuned T5 model (`t5-base`) over a variant of SquaD V1 dataset for Question Generation.

### 1. Preliminaries

#### Mounting and Installing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "/content/drive/MyDrive/Colab Notebooks"

/content/drive/MyDrive/Colab Notebooks


In [3]:
%run LMQGSquadSentenceTypeUtils.ipynb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K 

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [4]:
NUM_EPOCHS = 10

#### Imports

In [5]:
from transformers import AutoModel, AutoTokenizer, MT5ForConditionalGeneration, MT5ForSequenceClassification, AdamW, DataCollatorWithPadding
from datasets import load_dataset, Dataset, load_from_disk, load_metric
import numpy as np
import evaluate
import torch
import os

In [6]:
model_checkpoint='google/mt5-base'

In [7]:
auxiliary_model_path = 'lmqg_squad_reduced_mt5base_sentence_classification_(epoch 3)'

In [8]:
train_dataset_path = 'lmqg_squad_top1accuracy_classification_10k_examples_train'
validation_dataset_path = 'lmqg_squad_top1accuracy_classification_10k_examples_train'
test_dataset_path = 'lmqg_squad_top1accuracy_classification_10k_examples_train'

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
tokenizer.add_special_tokens({'sep_token': '<sep>', 'additional_special_tokens': ['<answer>', '<context>']})

3

In [11]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [12]:
def get_inputs(dataset, device, start, end):
  batch = dataset[start:end] if start + 1 < end else [dataset[start]]
  return {key: torch.stack([v.to(device) for v in values]) for key, values in batch.items()}

### 2. Dataset Loading and Preprocessing

We will use [`squad_it`](https://huggingface.co/datasets/squad_it), a variant of the `SquaD` dataset adapted to `Question Generation` tasks.

In [13]:
def prompt_conversion_function_eval_test(samples, model, tokenizer, input_column='sentence_answer', prompt_column='answer_context', input_prompt = 'generate questions'):
  sentences = samples[input_column]
  tokenized_sentences = tokenizer(samples[input_column], padding=True, return_tensors='pt')
  tokenized_sentences = {k: v.to(device) for k, v in tokenized_sentences.items()}
  model.eval()
  with torch.no_grad():
    outputs = model(**tokenized_sentences)
  label_ids = torch.argmax(torch.softmax(outputs.logits, dim=-1), dim=-1)
  labels = [_CLASSES[index.item()] for index in label_ids]
  hints = []
  for index, label in zip(label_ids, labels):
    if label.startswith('<prep'):
      hint = f' that start with a preposition and {_BASE_CONJ[index]}'
    elif label in _BASE_CONJ:
      hint = f' that start with {label[0].upper()}{label[1:]}'
    elif label == '<how-adv>':
      hint = f' that start with "How" and an adverb or an adjective'
    elif label == 'how':
      hint = f' that start with "How"'
    elif label == '<verb>':
      hint = f' that start with a verb'
    else:
      hint = ''
    hints.append(hint)
  return {prompt_column: [sample.replace(input_prompt, f"{input_prompt}{hint}") for sample in samples[prompt_column]]}

In [14]:
def prompt_conversion_function_train(samples, tokenizer, input_column='sentence_answer', prompt_column='answer_context', input_prompt = 'generate questions'):
  sentences = samples[input_column]
  hints = []
  for sample in samples:
    _, index = classify_example(sample)
    label = _CLASSES[index]
    if label.startswith('<prep'):
      hint = f' that start with a preposition and {_BASE_CONJ[index]}'
    elif label in _BASE_CONJ:
      hint = f' that start with {label[0].upper()}{label[1:]}'
    elif label == '<how-adv>':
      hint = f' that start with "How" and an adverb or an adjective'
    elif label == 'how':
      hint = f' that start with "How"'
    elif label == '<verb>':
      hint = f' that start with a verb'
    else:
      hint = ''
    hints.append(hint)
  return {prompt_column: [sample.replace(input_prompt, f"{input_prompt}{hint}") for sample in samples[prompt_column]]}

In [15]:
def load_and_preprocess_lmqg_squad_auxiliary_classifier_top1accuracy(
    auxiliary_model, tokenizer, dataset_name='lmqg/qg_squad', train_dataset_name='lmqg_squad_train_with_starting_preposition',
    eval_dataset_name='lmqg_squad_eval_with_starting_preposition',
    test_dataset_name='lmqg_squad_test_with_starting_preposition',
    train_select=None, eval_select=None
):
  dataset_loading_result = get_dataset(dataset_name, train_dataset_name, eval_dataset_name, test_dataset_name)
  local = dataset_loading_result['local']
  if local:
    train_dataset = dataset_loading_result['train']
    validation_dataset = dataset_loading_result['eval']
    test_dataset = dataset_loading_result['test']
  if not local:
    datasets = load_dataset(dataset_name)
    print(datasets['test'])
    train_dataset = datasets['train'].remove_columns(['paragraph_question', 'sentence', 'paragraph', 'paragraph_sentence'])
    validation_dataset = datasets['validation'].remove_columns(['paragraph_question', 'sentence', 'paragraph', 'paragraph_sentence'])
    test_dataset = datasets['test'].remove_columns(['paragraph_question', 'sentence', 'paragraph', 'paragraph_sentence'])
    print(f"Train dataset has {len(train_dataset)} items. Validation dataset has {len(validation_dataset)} items.")

    train_dataset.save_to_disk(train_dataset_name)
    validation_dataset.save_to_disk(eval_dataset_name)
    test_dataset.save_to_disk(test_dataset_name)

  if train_select:
    train_dataset = train_dataset.shuffle(seed=0).select(range(train_select))
  if eval_select:
    validation_dataset = validation_dataset.shuffle(seed=0).select(range(eval_select))

  build_train_feature = lambda sample: build_train_feature_lmqg_squad_highlighting(sample, use_extra_ids=True)
  train_dataset = train_dataset.map(build_train_feature).remove_columns(['answer', 'paragraph_answer'])
  validation_dataset = validation_dataset.map(build_train_feature).remove_columns(['answer', 'paragraph_answer'])
  test_dataset = test_dataset.map(build_train_feature).remove_columns(['answer', 'paragraph_answer'])

  train_prompt_converter = lambda samples: prompt_conversion_function_train(samples, auxiliary_model, tokenizer, input_prompt = 'generate questions')
  eval_test_prompt_converter = lambda samples: prompt_conversion_function_eval_test(samples, auxiliary_model, tokenizer, input_prompt = 'generate questions')
  train_dataset = train_dataset.map(prompt_converter, batched=True, batch_size=64).remove_columns(['sentence_answer'])
  validation_dataset = validation_dataset.map(prompt_converter, batched=True, batch_size=64).remove_columns(['sentence_answer'])
  test_dataset = test_dataset.map(prompt_converter, batched=True, batch_size=64).remove_columns(['sentence_answer'])

  tokenizer_function_lambda = lambda sample: tokenizer_function(sample, train_dataset=train_dataset)
  tokenized_train_dataset = train_dataset.map(tokenizer_function_lambda, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_validation_dataset = validation_dataset.map(tokenizer_function_lambda, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_test_dataset = test_dataset.map(lambda samples: tokenizer_function(samples, input_ids_padding="max_length"), batched=True).remove_columns(['answer_context', 'question'])

  tokenized_train_dataset.set_format("torch")
  tokenized_validation_dataset.set_format("torch")
  tokenized_test_dataset.set_format("torch")

  return (train_dataset, validation_dataset, test_dataset), (tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset)

In [16]:
USE_CACHE = os.path.exists(train_dataset_path)

In [17]:
if not USE_CACHE:
  auxiliary_model = MT5ForSequenceClassification.from_pretrained(auxiliary_model_path)
  auxiliary_model.to(device)
  (train_dataset, validation_dataset, test_dataset), (tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset) = \
    load_and_preprocess_lmqg_squad_auxiliary_classifier_top1accuracy(auxiliary_model, tokenizer, train_select=10000, eval_select=5000)
  del auxiliary_model

In [18]:
if USE_CACHE:
  train_dataset = load_from_disk('lmqg_squad_top1accuracy_classification_10k_examples_train').shuffle(seed=0).select(range(10000))
  validation_dataset = load_from_disk('lmqg_squad_top1accuracy_classification_10k_examples_eval').shuffle(seed=0).select(range(5000))
  test_dataset = load_from_disk('lmqg_squad_top1accuracy_classification_10k_examples_test').shuffle(seed=0)

  tokenizer_function_lambda = lambda sample: tokenizer_function(sample, train_dataset=train_dataset)
  tokenized_train_dataset = train_dataset.map(tokenizer_function_lambda, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_validation_dataset = validation_dataset.map(tokenizer_function_lambda, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_test_dataset = test_dataset.map(lambda samples: tokenizer_function(samples, input_ids_padding="max_length"), batched=True).remove_columns(['answer_context', 'question'])

  tokenized_train_dataset.set_format("torch")
  tokenized_validation_dataset.set_format("torch")
  tokenized_test_dataset.set_format("torch")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11877 [00:00<?, ? examples/s]

In [19]:
if not USE_CACHE:
  train_dataset.save_to_disk('lmqg_squad_top1accuracy_classification_10k_examples_train')
  validation_dataset.save_to_disk('lmqg_squad_top1accuracy_classification_10k_examples_eval')
  test_dataset.save_to_disk('lmqg_squad_top1accuracy_classification_10k_examples_test')

### 3. Model Loading and configuration

#### Loading

In [None]:
model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

#### Configuration

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [None]:
optimizer, train_dataloader, eval_dataloader, lr_scheduler, \
  num_training_steps = get_training_configuration(train_batch_size=4, eval_batch_size=4, tokenizer=tokenizer, learning_rate=1e-4, num_epochs=NUM_EPOCHS)

25000




### 4. Fine-tuning

#### Execution

In [None]:
training_results_dict = main_training_loop(
    model, device, optimizer, train_dataloader, eval_dataloader,
    lr_scheduler, num_training_steps, num_epochs=NUM_EPOCHS, metrics=None,
    eval_strategy='epoch', eval_every=2000, model_save_path='lmqg_squad_reduced_mt5base_top1accuracy_classifier_teacher_forcing_10k_examples_test',
    early_stopping=False, early_stopping_patience=4, tokenizer=tokenizer,
    num_beams=4, tokenize_predictions_output=False, start_epoch=0
  )

epoch_train_losses = training_results_dict['epoch_train_losses']
epoch_eval_losses = training_results_dict['epoch_eval_losses']
epoch_eval_metrics = training_results_dict['epoch_eval_metrics']
print(epoch_train_losses, epoch_eval_losses, epoch_eval_metrics, sep='\n')

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: Train Loss = 1.708046317100525, Eval Loss = 1.8394631147384644
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 1: Train Loss = 1.9004101753234863, Eval Loss = 1.699291706085205
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 2: Train Loss = 1.17681086063385, Eval Loss = 1.6811448335647583
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 3: Train Loss = 1.7901214361190796, Eval Loss = 1.6933484077453613
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 4: Train Loss = 1.368942379951477, Eval Loss = 1.66969633102417
Save this model (y/n)?> y
Continue training (y/n)?> y
Epoch 5: Train Loss = 1.7442035675048828, Eval Loss = 1.6491166353225708
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 6: Train Loss = 1.2553006410598755, Eval Loss = 1.7896671295166016
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 7: Train Loss = 0.897524356842041, Eval Loss = 1.6877700090408325
Save this model (y/n)?> y
Continue training (y

In [None]:
model.save_pretrained('lmqg_squad_reduced_mt5base_top1accuracy_classifier_teacher_forcing_10k_examples_test_final')

In [None]:
from time import sleep
sleep(300)

### 5. Analysis of the Results

### Calculating BLEU and ROUGE score

In [None]:
metrics = {
    'bleu': get_bleu_config(tokenizer),
    'nist_m': get_nist_config(tokenizer),
    'rouge': get_rouge_config(tokenizer),
}

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [20]:
P = True

In [22]:
if P:
  model = MT5ForConditionalGeneration.from_pretrained('lmqg_squad_reduced_mt5base_top1accuracy_classifier_teacher_forcing_10k_examples_test_final', local_files_only=True)

In [23]:
if P:
  optimizer, train_dataloader, eval_dataloader, lr_scheduler, \
    num_training_steps = get_training_configuration(train_batch_size=4, eval_batch_size=4, tokenizer=tokenizer, learning_rate=1e-3, num_epochs=2)

5000




In [24]:
if P:
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
  model.to(device)
  print()




In [25]:
tokenized_test_dataset = tokenized_test_dataset.shuffle(seed=42)#.select(range(1000))
test_dataset = test_dataset.shuffle(seed=42)#.select(range(1000))

In [26]:
def evaluation_loop(
    model, device, optimizer, eval_dataloader, lr_scheduler,
    loss_tracker, metrics_tracker=None, metrics=None, progress_bar=None,
    tokenizer=None, num_beams=1, top_k=None, top_p=None, num_candidates=4,
    score_function=bertscore_f1based_score, tokenize_predictions_output=True,
):
    model.eval()
    current_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        if 'question' in batch:
          text_references = batch['question']
        else:
          labels_batch = torch.tensor(batch['labels'])
          labels_batch[labels_batch == -100] = tokenizer.pad_token_id
          text_references = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
        with torch.no_grad():
            outputs = model(**batch)
        current_loss = outputs.loss.item()
        if metrics:
          predictions = select_best_output(
              model, tokenizer, batch['input_ids'], text_references, score_function, max_length=100, num_beams=num_beams,
              top_k=top_k, top_p=top_p, num_candidates=num_candidates, verbose=False, tokenize_output=tokenize_predictions_output,
          )
          for metric_name, (metric, conversion_function_predictions, conversion_function_references) in metrics.items():
            conversion_function_predictions = std_conversion_predictions if conversion_function_predictions == 'text' else conversion_function_predictions
            converted_predictions = conversion_function_predictions(predictions) if tokenize_predictions_output else predictions
            references = text_references if conversion_function_references == 'text' else conversion_function_references(batch["labels"])
            metric.add_batch(predictions=converted_predictions, references=references)
        if progress_bar:
          progress_bar.update(1)
    loss_tracker.append(current_loss)
    if metrics:
      metrics_tracker.append({
        metric_name: metric.compute() for metric_name, (metric, _, _) in metrics.items()
      })
      print(f"Metrics = {metrics_tracker[-1]}")
    return current_loss


In [None]:
from tqdm.auto import tqdm
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=True, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
test_loss_tracker, test_metrics_tracker, num_test_steps = [], [], len(test_dataloader)
test_progress_bar = tqdm(range(num_test_steps))
test_loss = evaluation_loop(
    model, device, optimizer, test_dataloader, lr_scheduler, test_loss_tracker, test_metrics_tracker, metrics, test_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/1485 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  labels_batch = torch.tensor(batch['labels'])


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics = {'bleu': {'bleu': 0.22260171595492143, 'precisions': [0.5207011698559434, 0.2675960973699667, 0.16969116677355542, 0.11219265405241323], 'brevity_penalty': 0.9808565830299795, 'length_ratio': 0.9810375018240187, 'translation_length': 134461, 'reference_length': 137060}, 'nist_m': {'nist_mt': 5.821020082085196}, 'rouge': {'rouge1': 0.510751513568936, 'rouge2': 0.29226060323889014, 'rougeL': 0.4733808878684328, 'rougeLsum': 0.47347359498897035}}


In [None]:
test_loss

1.556106686592102

In [None]:
from tqdm.auto import tqdm
eval_dataloader = DataLoader(tokenized_validation_dataset, shuffle=True, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
eval_loss_tracker, eval_metrics_tracker, num_eval_steps = [], [], len(eval_dataloader)
eval_progress_bar = tqdm(range(num_eval_steps))
eval_loss = evaluation_loop(
    model, device, optimizer, eval_dataloader, lr_scheduler, eval_loss_tracker, eval_metrics_tracker, metrics, eval_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/625 [00:00<?, ?it/s]

  labels_batch = torch.tensor(batch['labels'])


Metrics = {'bleu': {'bleu': 0.22312958262257623, 'precisions': [0.5152730478676508, 0.2644310060472787, 0.16682404784387786, 0.10904861486407105], 'brevity_penalty': 1.0, 'length_ratio': 1.0014022089118166, 'translation_length': 57847, 'reference_length': 57766}, 'nist_m': {'nist_mt': 5.487622642593566}, 'rouge': {'rouge1': 0.5143277280496752, 'rouge2': 0.29686533147456196, 'rougeL': 0.48064527790189104, 'rougeLsum': 0.4804728768751554}}


In [None]:
test_dataset[18:30]['question']

['What are Unglazed transpired collectors?',
 'What kind of genetic material can be produced from retrotransposons?',
 'What did European regulators introduce to increase the oversight of banks?',
 'What has the ASA identified as being ethically dangerous?',
 'What type of working dog may have been the reason so many humans were able to get into North America 12,000 tears ago?',
 'What was the name of the junior fashions launched in 2009 by Beyoncé and her mother?',
 'When did she receive the Legend Award?',
 'What was the lifespan of Theodore Hesburgh?',
 "What event caused Beyonce's depression?",
 'How many warships does the Royal Canadian Navy have?',
 'Where are more moist areas found in southern Europe?',
 'Who provided funds to encourage lending and restore faith in commercial banks in the aftermath of the financial crisis of 2007?']

In [None]:
final_predictions = select_best_output(
    model, tokenizer, tokenized_test_dataset['input_ids'][18:30].to(device), test_dataset[18:30]['question'], score_function=bertscore_f1based_score,
    max_length=64, num_beams=4, top_k=None, top_p=None, num_candidates=4, verbose=False, tokenize_output=False
)

In [None]:
final_predictions

['What are unglazed transpired collectors?',
 'Where can retrotransposons be transcribed?',
 'What regulations did European regulators introduced for bankers?',
 'What does the Association of Social Anthropologists of the UK and Commonwealth call ethically dangerous?',
 'What animal did one writer suggest that could have been critical to the success of the waves that entered North America roughly 12,000 years ago?',
 'What was the name of the new apparel label that was launched in July 2009?',
 'When did Beyoncé receive the Legend Award?',
 'When did he serve as president?',
 'Why did Beyoncé experience depression?',
 'How many warships does the Royal Canadian Navy have?',
 'Where is the wetter Atlantic climate?',
 'Who provided funds to encourage lending and restore faith in the commercial paper market?']

In [None]:
bleu = load('bleu')

In [None]:
bleu.compute(predictions=final_predictions, references=test_dataset['question'][18:30])

{'bleu': 0.28317125088786016,
 'precisions': [0.5681818181818182,
  0.3416666666666667,
  0.24074074074074073,
  0.19791666666666666],
 'brevity_penalty': 0.9131007162822622,
 'length_ratio': 0.9166666666666666,
 'translation_length': 132,
 'reference_length': 144}

In [None]:
bert_score = load('bertscore')

In [None]:
bert_score.compute(predictions=final_predictions, references=test_dataset['question'][18:30], model_type='xlm-roberta-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

{'precision': [0.9389117956161499,
  0.9027999639511108,
  0.9371086955070496,
  0.845855176448822,
  0.8647290468215942,
  0.9317687749862671,
  0.9204198122024536,
  0.8625965714454651,
  0.857804536819458,
  1.0,
  0.8927799463272095,
  0.9730311632156372],
 'recall': [0.9359325170516968,
  0.8740953207015991,
  0.9125221967697144,
  0.8940292000770569,
  0.8745402097702026,
  0.9026311635971069,
  0.9661250710487366,
  0.8288090825080872,
  0.8518611192703247,
  1.0,
  0.8739440441131592,
  0.9319801330566406],
 'f1': [0.9374197721481323,
  0.8882158398628235,
  0.9246520400047302,
  0.8692752718925476,
  0.8696069121360779,
  0.916968584060669,
  0.9427188038825989,
  0.8453653454780579,
  0.8548225164413452,
  1.0,
  0.8832615613937378,
  0.9520633220672607],
 'hashcode': 'xlm-roberta-base_L9_no-idf_version=0.3.12(hug_trans=4.32.1)'}

In [None]:
bert_score.compute(predictions=final_predictions, references=test_dataset['question'][18:30], lang='en')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.9408156871795654,
  0.9304813742637634,
  0.9558658003807068,
  0.8963238000869751,
  0.8985199928283691,
  0.9470832347869873,
  0.9732059240341187,
  0.8882074356079102,
  0.9408583641052246,
  1.0,
  0.9183763265609741,
  0.9671722054481506],
 'recall': [0.939441442489624,
  0.9051532745361328,
  0.9427969455718994,
  0.9273107647895813,
  0.905845582485199,
  0.9173965454101562,
  0.9837484955787659,
  0.8887611627578735,
  0.9279925227165222,
  1.0,
  0.9094449877738953,
  0.9446665048599243],
 'f1': [0.9401280283927917,
  0.9176425337791443,
  0.9492863416671753,
  0.9115540385246277,
  0.9021679162979126,
  0.9320035576820374,
  0.9784488081932068,
  0.8884842395782471,
  0.9343811273574829,
  1.0,
  0.9138888716697693,
  0.9557868242263794],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.32.1)'}

In [27]:
bertscore = compute_bert_score(
    test_dataset, tokenized_test_dataset, model, device, tokenizer, batch_size=16,
    lang='en', model_type=None, max_length=200, num_beams=4, num_candidates=1
)

  0%|          | 0/743 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
bertscore

{'precision': (0.9146499627782472, 0.009984129520092875),
 'recall': (0.9144216322200938, 0.009724759898288944),
 'f1': (0.9143411772849582, 0.009319757676384117)}

In [29]:
np.mean([v[0] for v in bertscore.values()])

0.9144709240944331