<a href="https://colab.research.google.com/github/Servat0r/HLT-Project-2023/blob/master/LMQG_Squad_10000_examples_MT5_base_with_Teacher_Forcing%2C_Support_Classifier_Top_2_Accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

An example of QG finetuned T5 model (`t5-base`) over a variant of SquaD V1 dataset for Question Generation.

### 1. Preliminaries

#### Mounting and Installing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "/content/drive/MyDrive/Colab Notebooks"

/content/drive/MyDrive/Colab Notebooks


In [3]:
%run LMQGSquadSentenceTypeUtils.ipynb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [4]:
NUM_EPOCHS = 10

#### Imports

In [5]:
from transformers import AutoModel, AutoTokenizer, MT5ForConditionalGeneration, MT5ForSequenceClassification, AdamW, DataCollatorWithPadding
from datasets import load_dataset, Dataset, load_from_disk, load_metric
import numpy as np
import evaluate
import torch
import os

In [6]:
model_checkpoint='google/mt5-base'

In [7]:
auxiliary_model_path = 'lmqg_squad_reduced_mt5base_sentence_classification_(epoch 3)'

In [8]:
train_dataset_path = 'lmqg_squad_top2accuracy_classification_10k_examples_train'
validation_dataset_path = 'lmqg_squad_top2accuracy_classification_10k_examples_eval'
test_dataset_path = 'lmqg_squad_top2accuracy_classification_10k_examples_test'

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
tokenizer.add_special_tokens({'sep_token': '<sep>', 'additional_special_tokens': ['<hl>']})

2

In [11]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [12]:
def get_inputs(dataset, device, start, end):
  batch = dataset[start:end] if start + 1 < end else [dataset[start]]
  return {key: torch.stack([v.to(device) for v in values]) for key, values in batch.items()}

### 2. Dataset Loading and Preprocessing

In [13]:
def prompt_conversion_function_eval_test(
    samples, model, tokenizer, input_column='sentence_answer', prompt_column='answer_context', input_prompt = 'generate questions', threshold=0.5
):
  sentences = samples[input_column]
  tokenized_sentences = tokenizer(samples[input_column], padding=True, return_tensors='pt')
  tokenized_sentences = {k: v.to(device) for k, v in tokenized_sentences.items()}
  model.eval()
  with torch.no_grad():
    outputs = model(**tokenized_sentences)
  softmaxed = torch.softmax(outputs.logits, dim=-1)
  first_label_ids = torch.argmax(softmaxed, dim=-1)
  first_labels = [_CLASSES[index.item()] for index in first_label_ids]
  first_probs = [softmaxed[i][label] for i, label in enumerate(first_labels_ids)]
  for i in range(len(softmaxed)):
    softmaxed[i][first_label_ids[i].item()] = 0
  second_label_ids = torch.argmax(softmaxed, dim=-1)
  second_labels = [_CLASSES[index.item()] for index in second_label_ids]
  second_probs = [softmaxed[i][label] for i, label in enumerate(second_labels_ids)]
  hints = []
  for first_index, first_label, first_prob, second_index, second_label, second_prob \
    in zip(first_label_ids, first_labels, first_probs, second_label_ids, second_labels, second_probs):
    no_suggestions = (first_index == second_index == len(_CLASSES)-1)
    both_suggestions = first_label != len(_CLASSES)-1 and second_label != len(_CLASSES)-1 and second_prob/first_prob >= threshold
    if no_suggestions:
      hints.append('')
      continue
    hint = 'that start either with ' if both_suggestions else 'that start with '
    for index, label in zip([first_index, second_index], [first_label, second_label]):
      if label.startswith('<prep'):
        hint = hint + f'a preposition and {_BASE_CONJ[index]}'
      elif label in _BASE_CONJ:
        hint = hint + f'{label[0].upper()}{label[1:]}'
      elif label == '<how-adv>':
        hint = hint + f'"How" and an adverb or an adjective'
      elif label == 'how':
        hint = hint + f'"How"'
      elif label == '<verb>':
        hint = hint + f'a verb'
      if index == first_index and both_suggestions:
        hint = hint + " or with "
    hints.append(hint)
  return {prompt_column: [sample.replace(input_prompt, f"{input_prompt}{hint}") for sample in samples[prompt_column]]}

In [14]:
def prompt_conversion_function_train(samples, tokenizer, input_column='sentence_answer', prompt_column='answer_context', input_prompt = 'generate questions'):
  hints = []
  for sample, question in zip(samples[input_column], samples['question']):
    _, index = classify_example(sample, question_text=question)
    label = _CLASSES[index]
    if label.startswith('<prep'):
      hint = f' that start with a preposition and {_BASE_CONJ[index]}'
    elif label in _BASE_CONJ:
      hint = f' that start with {label[0].upper()}{label[1:]}'
    elif label == '<how-adv>':
      hint = f' that start with "How" and an adverb or an adjective'
    elif label == 'how':
      hint = f' that start with "How"'
    elif label == '<verb>':
      hint = f' that start with a verb'
    else:
      hint = ''
    hints.append(hint)
  return {prompt_column: [sample.replace(input_prompt, f"{input_prompt}{hint}") for sample in samples[prompt_column]]}

In [15]:
def load_and_preprocess_lmqg_squad_auxiliary_classifier_top1accuracy(
    auxiliary_model, tokenizer, dataset_name='lmqg/qg_squad', train_dataset_name='lmqg_squad_train_with_starting_preposition',
    eval_dataset_name='lmqg_squad_eval_with_starting_preposition',
    test_dataset_name='lmqg_squad_test_with_starting_preposition',
    train_select=None, eval_select=None, threshold=0.5,
):
  dataset_loading_result = get_dataset(dataset_name, train_dataset_name, eval_dataset_name, test_dataset_name)
  local = dataset_loading_result['local']
  if local:
    train_dataset = dataset_loading_result['train']
    validation_dataset = dataset_loading_result['eval']
    test_dataset = dataset_loading_result['test']
  if not local:
    datasets = load_dataset(dataset_name)
    print(datasets['test'])
    train_dataset = datasets['train'].remove_columns(['paragraph_question', 'sentence', 'paragraph', 'paragraph_sentence'])
    validation_dataset = datasets['validation'].remove_columns(['paragraph_question', 'sentence', 'paragraph', 'paragraph_sentence'])
    test_dataset = datasets['test'].remove_columns(['paragraph_question', 'sentence', 'paragraph', 'paragraph_sentence'])
    print(f"Train dataset has {len(train_dataset)} items. Validation dataset has {len(validation_dataset)} items.")

    train_dataset.save_to_disk(train_dataset_name)
    validation_dataset.save_to_disk(eval_dataset_name)
    test_dataset.save_to_disk(test_dataset_name)

  if train_select:
    train_dataset = train_dataset.shuffle(seed=0).select(range(train_select))
  if eval_select:
    validation_dataset = validation_dataset.shuffle(seed=0).select(range(eval_select))

  build_train_feature = lambda sample: build_train_feature_lmqg_squad_highlighting(sample, use_extra_ids=True)
  train_dataset = train_dataset.map(build_train_feature).remove_columns(['answer', 'paragraph_answer'])
  validation_dataset = validation_dataset.map(build_train_feature).remove_columns(['answer', 'paragraph_answer'])
  test_dataset = test_dataset.map(build_train_feature).remove_columns(['answer', 'paragraph_answer'])

  train_prompt_converter = lambda samples: prompt_conversion_function_train(samples, tokenizer, input_prompt = 'generate questions')
  eval_test_prompt_converter = lambda samples: prompt_conversion_function_eval_test(samples, auxiliary_model, tokenizer, input_prompt = 'generate questions', threshold=threshold)

  train_dataset = train_dataset.map(train_prompt_converter, batched=True, batch_size=64).remove_columns(['sentence_answer'])
  validation_dataset = validation_dataset.map(eval_test_prompt_converter, batched=True, batch_size=64).remove_columns(['sentence_answer'])
  test_dataset = test_dataset.map(eval_test_prompt_converter, batched=True, batch_size=64).remove_columns(['sentence_answer'])

  tokenizer_function_lambda = lambda sample: tokenizer_function(sample, train_dataset=train_dataset)
  tokenized_train_dataset = train_dataset.map(tokenizer_function_lambda, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_validation_dataset = validation_dataset.map(tokenizer_function_lambda, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_test_dataset = test_dataset.map(lambda samples: tokenizer_function(samples, input_ids_padding="max_length"), batched=True).remove_columns(['answer_context', 'question'])

  tokenized_train_dataset.set_format("torch")
  tokenized_validation_dataset.set_format("torch")
  tokenized_test_dataset.set_format("torch")

  return (train_dataset, validation_dataset, test_dataset), (tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset)

In [16]:
USE_CACHE = os.path.exists(train_dataset_path)

In [17]:
USE_CACHE

True

In [18]:
if not USE_CACHE:
  auxiliary_model = MT5ForSequenceClassification.from_pretrained(auxiliary_model_path)
  print(auxiliary_model.config)
  auxiliary_model.to(device)
  (train_dataset, validation_dataset, test_dataset), (tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset) = \
    load_and_preprocess_lmqg_squad_auxiliary_classifier_top1accuracy(auxiliary_model, tokenizer, train_select=10000)
  del auxiliary_model

In [19]:
if USE_CACHE:
  train_dataset = load_from_disk(train_dataset_path).shuffle(seed=0).select(range(10000))
  validation_dataset = load_from_disk(validation_dataset_path).shuffle(seed=0).select(range(5000))
  test_dataset = load_from_disk(test_dataset_path).shuffle(seed=0)

  tokenizer_function_lambda = lambda sample: tokenizer_function(sample, train_dataset=train_dataset)
  tokenized_train_dataset = train_dataset.map(tokenizer_function_lambda, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_validation_dataset = validation_dataset.map(tokenizer_function_lambda, batched=True).remove_columns(['answer_context', 'question'])
  tokenized_test_dataset = test_dataset.map(lambda samples: tokenizer_function(samples, input_ids_padding="max_length"), batched=True).remove_columns(['answer_context', 'question'])

  tokenized_train_dataset.set_format("torch")
  tokenized_validation_dataset.set_format("torch")
  tokenized_test_dataset.set_format("torch")

In [20]:
if not USE_CACHE:
  train_dataset.save_to_disk(train_dataset_path)
  validation_dataset.save_to_disk(validation_dataset_path)
  test_dataset.save_to_disk(test_dataset_path)

### 3. Model Loading and configuration

#### Loading

In [None]:
model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)

In [None]:
tokenizer.add_special_tokens({'additional_special_tokens': ['<hl>', '<answer>', '<context>']})

2

In [None]:
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 250104. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(250104, 768)

#### Configuration

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [None]:
optimizer, train_dataloader, eval_dataloader, lr_scheduler, \
  num_training_steps = get_training_configuration(train_batch_size=4, eval_batch_size=4, tokenizer=tokenizer, learning_rate=1e-4, num_epochs=NUM_EPOCHS)

25000




### 4. Fine-tuning

#### Execution

In [None]:
training_results_dict = main_training_loop(
    model, device, optimizer, train_dataloader, eval_dataloader,
    lr_scheduler, num_training_steps, num_epochs=NUM_EPOCHS, metrics=None,
    eval_strategy='epoch', eval_every=2000, model_save_path='lmqg_squad_reduced_mt5base_top2accuracy_classifier_thresholding_teacher_forcing_10k_examples_test',
    early_stopping=False, early_stopping_patience=4, tokenizer=tokenizer,
    num_beams=4, tokenize_predictions_output=False, start_epoch=0
  )

epoch_train_losses = training_results_dict['epoch_train_losses']
epoch_eval_losses = training_results_dict['epoch_eval_losses']
epoch_eval_metrics = training_results_dict['epoch_eval_metrics']
print(epoch_train_losses, epoch_eval_losses, epoch_eval_metrics, sep='\n')

  0%|          | 0/25000 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: Train Loss = 3.4392521381378174, Eval Loss = 2.3437116146087646
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 1: Train Loss = 1.7151696681976318, Eval Loss = 1.964227557182312
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 2: Train Loss = 1.3623937368392944, Eval Loss = 1.8675459623336792
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 3: Train Loss = 1.606215476989746, Eval Loss = 1.753062129020691
Save this model (y/n)?> n
Continue training (y/n)?> y
Epoch 4: Train Loss = 1.2041617631912231, Eval Loss = 1.8156511783599854
Save this model (y/n)?> y
Continue training (y/n)?> y
Epoch 5: Train Loss = 1.5937663316726685, Eval Loss = 1.7860091924667358
Save this model (y/n)?> y
Continue training (y/n)?> y
Epoch 6: Train Loss = 1.9100970029830933, Eval Loss = 1.8281059265136719
Save this model (y/n)?> n
Continue training (y/n)?> n
[3.4392521381378174, 1.7151696681976318, 1.3623937368392944, 1.606215476989746, 1.2041617631912231, 1.593766331672

In [None]:
model.save_pretrained('lmqg_squad_reduced_mt5base_top2accuracy_classifier_thresholding_teacher_forcing_10k_examples_test_final')

In [None]:
save_checkpoint(
    'lmqg_squad_reduced_mt5base_top2accuracy_classifier_teacher_forcing_10k_examples_test_epoch2_checkpoint.pth',
    'lmqg_squad_reduced_mt5base_top2accuracy_classifier_teacher_forcing_10k_examples_test_final',
    optimizer, lr_scheduler, num_training_steps, save_model=False,
)

### 5. Analysis of the Results

### Calculating BLEU and ROUGE score

In [None]:
metrics = {
    'bleu': get_bleu_config(tokenizer),
    'nist_m': get_nist_config(tokenizer),
    'rouge': get_rouge_config(tokenizer),
}

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [25]:
P = True

In [26]:
if P:
  model = MT5ForConditionalGeneration.from_pretrained('lmqg_squad_reduced_mt5base_top2accuracy_classifier_thresholding_teacher_forcing_10k_examples_test_epoch5', local_files_only=True)

In [27]:
if P:
  optimizer, train_dataloader, eval_dataloader, lr_scheduler, \
    num_training_steps = get_training_configuration(train_batch_size=4, eval_batch_size=4, tokenizer=tokenizer, learning_rate=1e-3, num_epochs=2)

5000




In [28]:
if P:
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
  model.to(device)
  print()




In [29]:
tokenized_test_dataset = tokenized_test_dataset.shuffle(seed=42)#.select(range(1000))
test_dataset = test_dataset.shuffle(seed=42)#.select(range(1000))

In [30]:
def evaluation_loop(
    model, device, optimizer, eval_dataloader, lr_scheduler,
    loss_tracker, metrics_tracker=None, metrics=None, progress_bar=None,
    tokenizer=None, num_beams=1, top_k=None, top_p=None, num_candidates=4,
    score_function=bertscore_f1based_score, tokenize_predictions_output=True,
):
    model.eval()
    current_loss = 0
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        if 'question' in batch:
          text_references = batch['question']
        else:
          labels_batch = torch.tensor(batch['labels'])
          labels_batch[labels_batch == -100] = tokenizer.pad_token_id
          text_references = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
        with torch.no_grad():
            outputs = model(**batch)
        current_loss = outputs.loss.item()
        if metrics:
          predictions = select_best_output(
              model, tokenizer, batch['input_ids'], text_references, score_function, max_length=100, num_beams=num_beams,
              top_k=top_k, top_p=top_p, num_candidates=num_candidates, verbose=False, tokenize_output=tokenize_predictions_output,
          )
          for metric_name, (metric, conversion_function_predictions, conversion_function_references) in metrics.items():
            conversion_function_predictions = std_conversion_predictions if conversion_function_predictions == 'text' else conversion_function_predictions
            converted_predictions = conversion_function_predictions(predictions) if tokenize_predictions_output else predictions
            references = text_references if conversion_function_references == 'text' else conversion_function_references(batch["labels"])
            metric.add_batch(predictions=converted_predictions, references=references)
        if progress_bar:
          progress_bar.update(1)
    loss_tracker.append(current_loss)
    if metrics:
      metrics_tracker.append({
        metric_name: metric.compute() for metric_name, (metric, _, _) in metrics.items()
      })
      print(f"Metrics = {metrics_tracker[-1]}")
    return current_loss


In [31]:
tokenized_test_dataset = tokenized_test_dataset.shuffle(seed=42)#.select(range(1000))
test_dataset = test_dataset.shuffle(seed=42)#.select(range(1000))

In [None]:
from tqdm.auto import tqdm
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=True, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
test_loss_tracker, test_metrics_tracker, num_test_steps = [], [], len(test_dataloader)
test_progress_bar = tqdm(range(num_test_steps))
test_loss = evaluation_loop(
    model, device, optimizer, test_dataloader, lr_scheduler, test_loss_tracker, test_metrics_tracker, metrics, test_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/1485 [00:00<?, ?it/s]

  labels_batch = torch.tensor(batch['labels'])


Metrics = {'bleu': {'bleu': 0.22264626302377383, 'precisions': [0.5287656864011918, 0.2732232675500894, 0.17419695943752378, 0.11540466163532502], 'brevity_penalty': 0.9590785837642166, 'length_ratio': 0.9598934773092076, 'translation_length': 131563, 'reference_length': 137060}, 'nist_m': {'nist_mt': 5.905227753341725}, 'rouge': {'rouge1': 0.5124838958214604, 'rouge2': 0.2942298697873853, 'rougeL': 0.4755778882569241, 'rougeLsum': 0.4756180022336445}}


In [None]:
test_loss

2.096113681793213

In [None]:
from tqdm.auto import tqdm
eval_dataloader = DataLoader(tokenized_validation_dataset, shuffle=True, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
eval_loss_tracker, eval_metrics_tracker, num_eval_steps = [], [], len(eval_dataloader)
eval_progress_bar = tqdm(range(num_eval_steps))
eval_loss = evaluation_loop(
    model, device, optimizer, eval_dataloader, lr_scheduler, eval_loss_tracker, eval_metrics_tracker, metrics, eval_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/625 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  labels_batch = torch.tensor(batch['labels'])


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics = {'bleu': {'bleu': 0.24785819751397403, 'precisions': [0.5458453757225433, 0.29447974583002384, 0.19466490299823633, 0.13404360753221012], 'brevity_penalty': 0.9739543003939478, 'length_ratio': 0.9742876753313036, 'translation_length': 55360, 'reference_length': 56821}, 'nist_m': {'nist_mt': 6.056363368190112}, 'rouge': {'rouge1': 0.5361539209323202, 'rouge2': 0.3207442541752707, 'rougeL': 0.49991345758257655, 'rougeLsum': 0.5002407895333322}}


In [None]:
eval_loss

1.0809094905853271

In [None]:
test_dataset[18:30]['question']

['Most breeds share a genetic likeness to what animal?',
 'What did Margaret Thatcher reconfigure following each general election?',
 'Where did the creator of Mr Benn and King Rollo study?',
 'Chopin was able to bring about a new sense of nationalism with his music because of his mazurkas and what?',
 'What country does Akiko Komoto come from?',
 'What are the eight factors of the Noble Eightfold Path?',
 'Who did Ü-Tsang king have an alliance with?',
 'The City University of New York system consists of how many institutions?',
 'What show helped launched the career of Kelly Clarkson?',
 'What contains material ofen described as systematic expositions of the Gautama Buddha teachings?',
 'In addition to cyberctm.com, what other website was shut down for two days?',
 'How were the Canadian Forces upgraded in 2008?']

In [None]:
final_predictions = select_best_output(
    model, tokenizer, tokenized_test_dataset['input_ids'][18:30].to(device), test_dataset[18:30]['question'], score_function=bertscore_f1based_score,
    max_length=64, num_beams=4, top_k=None, top_p=None, num_candidates=4, verbose=False, tokenize_output=False
)

In [None]:
final_predictions

['What is the genetic closeness to the breeds?',
 'What did Margaret Thatcher reshuffle after a general election?',
 'Where did David McKee train?',
 'What types of music has Chopin credited with introducing to music a new sense of nationalism?',
 'What country was Akiko Kōmoto a voice actress?',
 'What are the 8 interconnected factors of the Noble Eightfold Path?',
 'Who was the king of Lhasa along with?',
 'How many universities are in New York City?',
 'What is one of the most successful television shows?',
 "The Pāli Tipitaka contains material often described as systematic expositions of the Gautama Buddha's teachings?",
 'What internet forums were shut down from May 2 to 4?',
 'What did the government of Canada make efforts to modernize the Canadian Armed Forces in 2008?']

In [None]:
bleu = load('bleu')

In [None]:
bleu.compute(predictions=final_predictions, references=test_dataset['question'][18:30])

{'bleu': 0.21136550387922054,
 'precisions': [0.5367647058823529,
  0.2661290322580645,
  0.16071428571428573,
  0.11],
 'brevity_penalty': 0.9428731438548749,
 'length_ratio': 0.9444444444444444,
 'translation_length': 136,
 'reference_length': 144}

In [None]:
bert_score = load('bertscore')

In [None]:
bert_score.compute(predictions=final_predictions, references=test_dataset['question'][18:30], model_type='xlm-roberta-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

{'precision': [0.9108445048332214,
  0.9329830408096313,
  0.8902857303619385,
  0.9112851619720459,
  0.870680570602417,
  0.9556872844696045,
  0.8790223598480225,
  0.924476146697998,
  0.8759087324142456,
  0.9162879586219788,
  0.8859341144561768,
  0.9033517241477966],
 'recall': [0.8971648216247559,
  0.9369997978210449,
  0.8506810665130615,
  0.8850631713867188,
  0.8748648166656494,
  0.9833491444587708,
  0.8594237565994263,
  0.8937503099441528,
  0.8599473834037781,
  0.9462034106254578,
  0.861170768737793,
  0.9350996017456055],
 'f1': [0.9039528965950012,
  0.9349871277809143,
  0.8700329065322876,
  0.8979828357696533,
  0.8727676868438721,
  0.9693209528923035,
  0.8691125512123108,
  0.9088536500930786,
  0.867854654788971,
  0.9310054183006287,
  0.8733769655227661,
  0.9189515113830566],
 'hashcode': 'xlm-roberta-base_L9_no-idf_version=0.3.12(hug_trans=4.33.0)'}

In [32]:
bertscore = compute_bert_score(
    test_dataset, tokenized_test_dataset, model, device, tokenizer, batch_size=16,
    lang='en', model_type=None, max_length=200, num_beams=4, num_candidates=1
)

  0%|          | 0/743 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
bertscore

{'precision': (0.917358377158722, 0.010126201443917477),
 'recall': (0.9144110330070895, 0.010196995530901462),
 'f1': (0.9156928794110046, 0.009592311673358361)}

In [34]:
np.mean([v[0] for v in bertscore.values()])

0.9158207631922721