<a href="https://colab.research.google.com/github/Servat0r/HLT-Project-2023/blob/master/LMQGSquadFullTrainingMT5BaseSquadITTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

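An example of a Question Generation (QG) fine-tuned mT5 model (`google/mt5-base`): a checkpoint trained on a variant of the SQuAD v1 dataset is tested, and then further fine-tuned, on the Italian `squad_it` dataset.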

### 1. Preliminaries

#### Mounting and Installing

In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch to the Drive folder: `utils.ipynb` and the model directories below are given as relative paths.
%cd "/content/drive/MyDrive/Colab Notebooks"

/content/drive/MyDrive/Colab Notebooks


In [None]:
# Run the shared helper notebook inside this kernel.
# NOTE(review): presumably this is where the helpers called below without a local definition
# (e.g. `load_and_preprocess_squad_it_dataset`, `evaluation_loop`, `select_best_output`) come from,
# and what triggers the package installs seen in the output — confirm against utils.ipynb.
%run utils.ipynb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.3 MB/s[0m eta

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Number of fine-tuning epochs, shared by `get_training_configuration` and `main_training_loop`.
NUM_EPOCHS = 4

#### Imports

In [None]:
import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset, Dataset, load_from_disk, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer, AdamW, DataCollatorWithPadding

In [None]:
# Hub id of the base mT5 checkpoint; in this notebook it is only used to load the tokenizer.
model_checkpoint='google/mt5-base'

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Register `<sep>` as the separator plus three additional special tokens; the call returns
# the number of tokens that were added.
# NOTE(review): presumably these are the markers of the QG input format built in utils.ipynb — confirm.
tokenizer.add_special_tokens({'sep_token': '<sep>', 'additional_special_tokens': ['<hl>', '<answer>', '<context>']})

4

### 2. Dataset Loading and Preprocessing

We will use [`squad_it`](https://huggingface.co/datasets/squad_it), an Italian version of the `SQuAD` dataset, preprocessed here for the `Question Generation` task.

In [None]:
# Load squad_it once, then unpack the raw splits and their tokenized counterparts
# (train / validation / test, in that order).
raw_splits, tokenized_splits = load_and_preprocess_squad_it_dataset(shuffle_seed=42, use_extra_ids=True)
train_dataset, validation_dataset, test_dataset = raw_splits
tokenized_train_dataset, tokenized_validation_dataset, tokenized_test_dataset = tokenized_splits

Map:   0%|          | 0/43328 [00:00<?, ? examples/s]

Map:   0%|          | 0/10831 [00:00<?, ? examples/s]

Map:   0%|          | 0/10231 [00:00<?, ? examples/s]

### 3. Preliminary Test over Squad-IT

In [None]:
# Metric configurations handed to `evaluation_loop`, keyed by the name under which
# each result is reported.
metrics = dict(
    bleu=get_bleu_config(tokenizer),
    nist_m=get_nist_config(tokenizer),
    rouge=get_rouge_config(tokenizer),
)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.53k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Start from a locally saved checkpoint; `local_files_only=True` prevents any Hub download.
# NOTE(review): judging by the directory name this is the mT5-base model already fine-tuned on the
# (reduced) English QG SQuAD data — confirm.
model = MT5ForConditionalGeneration.from_pretrained('lmqg_squad_reduced_full_training_mt5base_example', local_files_only=True)

In [None]:
# Build optimizer, dataloaders and LR scheduler for the fine-tuning run in one call.
(
    optimizer,
    train_dataloader,
    eval_dataloader,
    lr_scheduler,
    num_training_steps,
) = get_training_configuration(
    train_batch_size=4,
    eval_batch_size=4,
    tokenizer=tokenizer,
    learning_rate=1e-4,
    num_epochs=NUM_EPOCHS,
)

43328




In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Ending the cell with an empty print keeps the model repr out of the cell output.
print()




In [None]:
# Drop the raw-text `question` column from the tokenized test split.
# The guard makes the cell idempotent: `remove_columns` raises if the column is already
# gone, which happens when this cell (or its twin further down) has run before.
if 'question' in tokenized_test_dataset.column_names:
  tokenized_test_dataset = tokenized_test_dataset.remove_columns(['question'])

In [None]:
# Shuffle the tokenized and the raw test split with the same seed.
# NOTE(review): this presumably keeps row i of both datasets pointing at the same example (the
# cells below slice them in parallel) — confirm both splits have identical length and order.
# The trailing `.select(range(1000))` is a disabled option for evaluating on a 1000-row subset.
tokenized_test_dataset = tokenized_test_dataset.shuffle(seed=42)#.select(range(1000))
test_dataset = test_dataset.shuffle(seed=42)#.select(range(1000))

In [None]:
# Evaluate the loaded model on the validation split before any Italian fine-tuning.
# This loader gets its own name so that the `eval_dataloader` returned by
# `get_training_configuration` (and later handed to `main_training_loop`) is not overwritten
# with a differently batched, shuffled loader.
validation_dataloader = DataLoader(tokenized_validation_dataset, shuffle=True, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
eval_loss_tracker, eval_metrics_tracker, num_eval_steps = [], [], len(validation_dataloader)
# One progress-bar tick per batch.
eval_progress_bar = tqdm(range(num_eval_steps))
eval_loss = evaluation_loop(
    model, device, optimizer, validation_dataloader, lr_scheduler, eval_loss_tracker, eval_metrics_tracker, metrics, eval_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/1354 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  labels_batch = torch.tensor(batch['labels'])


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics = {'bleu': {'bleu': 0.06829556408764514, 'precisions': [0.28060654017511566, 0.09945464029971073, 0.04885628514650543, 0.026130137395103434], 'brevity_penalty': 0.883987762049635, 'length_ratio': 0.8902245737081078, 'translation_length': 116266, 'reference_length': 130603}, 'nist_m': {'nist_mt': 2.809006198497302}, 'rouge': {'rouge1': 0.21991706250644621, 'rouge2': 0.09256039031414111, 'rougeL': 0.20482671817636755, 'rougeLsum': 0.2048730064929369}}


In [None]:
# Evaluate the loaded model on the test split (`tqdm` comes from the notebook's import cell).
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=True, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
test_loss_tracker, test_metrics_tracker, num_test_steps = [], [], len(test_dataloader)
# One progress-bar tick per batch.
test_progress_bar = tqdm(range(num_test_steps))
test_loss = evaluation_loop(
    model, device, optimizer, test_dataloader, lr_scheduler, test_loss_tracker, test_metrics_tracker, metrics, test_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/1279 [00:00<?, ?it/s]

  labels_batch = torch.tensor(batch['labels'])


Metrics = {'bleu': {'bleu': 0.06632307991740244, 'precisions': [0.26651424981238764, 0.09052669905238246, 0.04560423217472593, 0.024670758596031225], 'brevity_penalty': 0.9188476164482611, 'length_ratio': 0.9219691545250489, 'translation_length': 114598, 'reference_length': 124297}, 'nist_m': {'nist_mt': 2.6398671482837535}, 'rouge': {'rouge1': 0.20610073231663242, 'rouge2': 0.08495996375717055, 'rougeL': 0.19218881169468194, 'rougeLsum': 0.19224407685860487}}


In [None]:
# Reference questions of test rows 18..29, for comparison with the generated ones below.
test_dataset[18:30]['question']

["Qual è stato l' unico anno in cui una serie completa non è stata filmata dal 2005?",
 'Nastro rosso nel logo sono stati utilizzati per rappresentare quale divisione di ABC?',
 'Qual è il dialetto di Newcastle conosciuto come?',
 'Quante stagioni ha eseguito il Dottore originale Who?',
 "Quale percentuale del patrimonio globale nel 2000 era di proprietà di appena l' 1% degli adulti?",
 'A cosa cambia il nome del Nederrijn?',
 'Quali cambiamenti di condizioni possono rendere insostenibile la foresta pluviale amazzonica?',
 'Qual è la lingua parlata in Kenya?',
 'Il presidente Johnson ha emesso un ordine esecutivo per rinominare il Launch Operations Center dopo di che?',
 "Che cosa significa l' inno di Lutero Christ unser Herr zum jordan kam concern?",
 'Quanto sono grandi i ficobilisomi?',
 'Quale tipo di potere è stato mostrato alla fiera mondiale da Westinghouse e Tesla?']

In [None]:
# Generate candidates for test rows 18..29.
# NOTE(review): presumably `select_best_output` keeps, per row, the candidate that
# `score_function` rates best against the reference question — confirm in utils.ipynb.
final_predictions = select_best_output(
    model,
    tokenizer,
    tokenized_test_dataset['input_ids'][18:30].to(device),
    test_dataset[18:30]['question'],
    score_function=bertscore_f1based_score,
    max_length=64,
    num_beams=4,
    top_k=None,
    top_p=None,
    num_candidates=4,
    verbose=False,
    tokenize_output=False,
)

In [None]:
# Display the questions generated for test rows 18..29.
final_predictions

["When was 'Rose' filmed?",
 'In 2007, i nastri rossi were used for representing what division?',
 'What is the name of the dialetto di Newcastle?',
 'How many stagioni did Dottore have?',
 'How much of the patrimonio globale does the più ricco 1% degli adulti possess in 2000?',
 'What scorre più a ovest, per ricongiungersi al fiume Noord e raggiunger il Nieuwe Maas?',
 'What causes a foresta pluviale amazzonica to become insostenibile?',
 'Does each tribe speak their languages?',
 'Who did Johnson elect to rinominare the LOC e Cape Canaveral?',
 'What did Lutero\'s inno "Christ unser Herr zum Jordan kam" reflect la structure e la sostanza delle sue domande e risposte?',
 'How large are the ficobilisomi?',
 "Which system did George Westinghouse use to illumine l' Esposizione Colombiana?"]

In [None]:
# Load BLEU through the `evaluate` module imported at the top, rather than through a bare
# `load` name that this notebook never imports itself.
bleu = evaluate.load('bleu')

In [None]:
# BLEU of the generated questions against the references of the same rows (18..29).
bleu.compute(
    predictions=final_predictions,
    references=test_dataset[18:30]['question'],
)

In [None]:
# Load BERTScore through the `evaluate` module imported at the top, rather than through a bare
# `load` name that this notebook never imports itself.
bert_score = evaluate.load('bertscore')

In [None]:
# The reference questions are Italian, so request BERTScore's default model for Italian
# instead of the English-only model that `lang='en'` selects.
bert_score.compute(predictions=final_predictions, references=test_dataset['question'][18:30], lang='it')

In [None]:
# Reference questions of test rows 108..119, for comparison with the generated ones below.
test_dataset[108:120]['question']

In [None]:
# Generate candidates for test rows 108..119 and score them against the references of the
# SAME rows. (The references were previously taken from rows 18..29, so candidates were
# ranked against unrelated questions.)
final_predictions = select_best_output(
    model, tokenizer, tokenized_test_dataset['input_ids'][108:120].to(device), test_dataset[108:120]['question'], score_function=bertscore_f1based_score,
    max_length=64, num_beams=4, top_k=None, top_p=None, num_candidates=4, verbose=False, tokenize_output=False
)

In [None]:
# Display the questions generated for test rows 108..119.
final_predictions

In [None]:
# Block the kernel for one hour (3600 s).
# NOTE(review): this looks like a leftover pause / keep-alive; on "Run All" it delays the
# fine-tuning section below by an hour — confirm whether it is still needed.
from time import sleep
sleep(3600)

### 3. Model Loading and configuration

### 4. Fine-tuning

#### Execution

In [None]:
# Fine-tune on squad_it. No metric configs are passed (`metrics=None`); evaluation is
# requested per epoch via `eval_strategy='epoch'`.
training_results_dict = main_training_loop(
    model,
    device,
    optimizer,
    train_dataloader,
    eval_dataloader,
    lr_scheduler,
    num_training_steps,
    num_epochs=NUM_EPOCHS,
    metrics=None,
    eval_strategy='epoch',
    eval_every=2000,
    model_save_path='lmqg_squad_full_training_mt5base_on_italian_test',
    early_stopping=False,
    early_stopping_patience=4,
    tokenizer=tokenizer,
    num_beams=4,
    tokenize_predictions_output=False,
    start_epoch=0,
)

# Pull the per-epoch histories out of the result dict and print them one per line.
epoch_train_losses, epoch_eval_losses, epoch_eval_metrics = (
    training_results_dict[history_key]
    for history_key in ('epoch_train_losses', 'epoch_eval_losses', 'epoch_eval_metrics')
)
print(epoch_train_losses, epoch_eval_losses, epoch_eval_metrics, sep='\n')

  0%|          | 0/43328 [00:00<?, ?it/s]

  0%|          | 0/10832 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  labels_batch = torch.tensor(batch['labels'])


Epoch 0: Train Loss = 1.7495408058166504, Eval Loss = 1.8420732021331787
Save this model (y/n)?> y
Continue training (y/n)?> y


  labels_batch = torch.tensor(batch['labels'])


Epoch 1: Train Loss = 2.00000262260437, Eval Loss = 2.0364041328430176
Save this model (y/n)?> n
Continue training (y/n)?> n
[1.7495408058166504, 2.00000262260437]
[1.8420732021331787, 2.0364041328430176]
[]


In [None]:
# Persist the model as it stands after the training loop, under a separate `_final` directory.
model.save_pretrained('lmqg_squad_full_training_mt5base_on_italian_test_final')

In [None]:
def save_checkpoint(checkpoint_path, model_path, optimizer, lr_scheduler, num_training_steps, model=None, save_model=False):
  """Write a training checkpoint to `checkpoint_path` using `torch.save`.

  The saved dict contains:
    - 'model_path': `model_path`, stored exactly as given;
    - 'optimizer': `optimizer.state_dict()`;
    - 'lr_scheduler': `lr_scheduler.state_dict()`;
    - 'num_training_steps': `num_training_steps - _step_count + 1`, with `_step_count`
      read from the scheduler state;
    - 'model': the `model` object itself, only when `save_model` is True.

  Args:
    checkpoint_path: destination file for the checkpoint.
    model_path: location recorded in the checkpoint (presumably where the model weights
      were saved separately — this function does not read or write it).
    optimizer: object exposing `state_dict()`.
    lr_scheduler: object exposing `state_dict()`; the state must contain `_step_count`.
    num_training_steps: total number of steps the scheduler was configured for.
    model: model to embed in the checkpoint; required when `save_model` is True.
    save_model: whether to store `model` inside the checkpoint.

  Raises:
    ValueError: if `save_model` is True but `model` is None.
  """
  # Validate before anything is written, so a bad call cannot leave behind a checkpoint
  # whose 'model' entry is silently None.
  if save_model and model is None:
    raise ValueError("save_model=True requires `model` to be provided")

  optimizer_state = optimizer.state_dict()
  scheduler_state = lr_scheduler.state_dict()
  checkpoint = {
      'model_path': model_path,
      'optimizer': optimizer_state,
      'lr_scheduler': scheduler_state,
  }
  # NOTE(review): presumably the number of steps still to run; the `+ 1` looks like
  # compensation for the step the scheduler counts at construction — confirm.
  checkpoint['num_training_steps'] = num_training_steps - scheduler_state['_step_count'] + 1
  if save_model:
    # The whole model object is stored (not just its state dict).
    checkpoint['model'] = model
  torch.save(checkpoint, checkpoint_path)

In [None]:
# Save optimizer / scheduler state next to the final model. `model_path` must name the
# directory actually written by `model.save_pretrained(...)` above; the previous value
# ("...reduced..._test_final") pointed to a directory this notebook never creates.
save_checkpoint('lmqg_squad_reduced_full_training_mt5base_on_italian_checkpoint.pth', 'lmqg_squad_full_training_mt5base_on_italian_test_final', optimizer, lr_scheduler, num_training_steps, save_model=False)

### 5. Analysis of the Results

### Calculating BLEU, NIST, ROUGE and BERTScore

In [None]:
# Metric configurations handed to `evaluation_loop`, keyed by the name under which
# each result is reported.
metrics = dict(
    bleu=get_bleu_config(tokenizer),
    nist_m=get_nist_config(tokenizer),
    rouge=get_rouge_config(tokenizer),
)

In [None]:
# Switch for the three cells that follow: when True they reload the saved epoch-0 model,
# rebuild the training configuration and move the model to the device.
P = True

In [None]:
# Reload the model saved from local disk (no Hub download) for the analysis below.
# NOTE(review): the `_epoch0` suffix presumably comes from `main_training_loop` appending the
# epoch to `model_save_path` — confirm.
if P:
  model = MT5ForConditionalGeneration.from_pretrained('lmqg_squad_full_training_mt5base_on_italian_test_epoch0', local_files_only=True)

In [None]:
# Rebuild optimizer, dataloaders and scheduler; `evaluation_loop` below takes the optimizer
# and scheduler as arguments.
if P:
  (
      optimizer,
      train_dataloader,
      eval_dataloader,
      lr_scheduler,
      num_training_steps,
  ) = get_training_configuration(
      train_batch_size=4,
      eval_batch_size=4,
      tokenizer=tokenizer,
      learning_rate=1e-3,
      num_epochs=2,
  )

21664




In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if P:
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
  # Ending the cell with an empty print keeps the model repr out of the cell output.
  print()




In [None]:
# Drop the raw-text `question` column from the tokenized test split.
# The guard makes the cell idempotent: `remove_columns` raises if the column is already
# gone, which is the case whenever the identical cell earlier in the notebook has run.
if 'question' in tokenized_test_dataset.column_names:
  tokenized_test_dataset = tokenized_test_dataset.remove_columns(['question'])

In [None]:
# Evaluate the reloaded model on the test split (`tqdm` comes from the notebook's import cell).
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=True, batch_size=8, collate_fn=DataCollatorWithPadding(tokenizer=tokenizer))
test_loss_tracker, test_metrics_tracker, num_test_steps = [], [], len(test_dataloader)
# One progress-bar tick per batch.
test_progress_bar = tqdm(range(num_test_steps))
test_loss = evaluation_loop(
    model, device, optimizer, test_dataloader, lr_scheduler, test_loss_tracker, test_metrics_tracker, metrics, test_progress_bar,
    tokenizer=tokenizer, num_beams=4, num_candidates=4, tokenize_predictions_output=False,
)

  0%|          | 0/1279 [00:00<?, ?it/s]

  labels_batch = torch.tensor(batch['labels'])


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics = {'bleu': {'bleu': 0.1991173611814901, 'precisions': [0.4758254817078347, 0.251108487716004, 0.1577947158647327, 0.10391771984802048], 'brevity_penalty': 0.9464252965707193, 'length_ratio': 0.9478104861742439, 'translation_length': 117810, 'reference_length': 124297}, 'nist_m': {'nist_mt': 5.303942373765475}, 'rouge': {'rouge1': 0.4374924547088503, 'rouge2': 0.2518568221881994, 'rougeL': 0.4094566147592525, 'rougeLsum': 0.4093225512394313}}


In [None]:
# Reference questions of test rows 18..29, for comparison with the generated ones below.
test_dataset[18:30]['question']

['Dal 1947 al 1967, quanto è aumentato il prezzo del petrolio?',
 'Dal 1947 al 1967, quanto è aumentato il prezzo del petrolio?',
 "Quando ha iniziato l' OPEC a riadattare i prezzi del petrolio?",
 "Quando ha iniziato l' OPEC a riadattare i prezzi del petrolio?",
 "Quando l' olio è tornato ai suoi livelli di Bretton Woods?",
 "Quando la Siria e l' Egitto hanno lanciato un attacco a sorpresa contro Israele?",
 "Quante volte di più le altre nazioni hanno dovuto pagare per il petrolio dopo l' attacco a sorpresa?",
 "Quante volte di più le altre nazioni hanno dovuto pagare per il petrolio dopo l' attacco a sorpresa?",
 "Quante volte di più le altre nazioni hanno dovuto pagare per il petrolio dopo l' attacco a sorpresa?",
 "Perchè lo scià d' Iran ha rilasciato un' intervista?",
 "Perchè l' OPEC ha portato il prezzo del petrolio a 5,11 dollari?",
 'Quando hanno portato il prezzo del petrolio a 5,11 dollari?']

In [None]:
# Generate candidates for test rows 18..29.
# NOTE(review): presumably `select_best_output` keeps, per row, the candidate that
# `score_function` rates best against the reference question — confirm in utils.ipynb.
final_predictions = select_best_output(
    model,
    tokenizer,
    tokenized_test_dataset['input_ids'][18:30].to(device),
    test_dataset[18:30]['question'],
    score_function=bertscore_f1based_score,
    max_length=64,
    num_beams=4,
    top_k=None,
    top_p=None,
    num_candidates=4,
    verbose=False,
    tokenize_output=False,
)

In [None]:
# Display the questions generated for test rows 18..29.
final_predictions

['Quanto è aumentato il prezzo del petrolio in dollari dal 1947 al 1967?',
 'Quanto è aumentato il prezzo del petrolio in dollari dal 1947 al 1967?',
 "Quando l' OPEC ha tardato ad adeguare i prezzi per riflettere il deprezzamento?",
 "Quando l' OPEC ha tardato ad adeguare i prezzi?",
 "In quale anno i ministri dell' OPEC hanno aumentato i prezzi e i redditi ai livelli di Bretton Woods?",
 "Quando la Siria e l' Egitto hanno lanciato un attacco a sorpresa su Israele?",
 'Quale nazione era il secondo esportatore mondiale di petrolio?',
 "Quante volte l' Iran ha pagato più per il petrolio?",
 "Quanto più petrolio ha pagato l' Iran per il petrolio?",
 "In quale data l' Iran ha lanciato un attacco a sorpresa su Israele?",
 "Perché l' OPEC ha aumentato il prezzo del petrolio?",
 "Quando l' OPEC ha aumentato il prezzo del petrolio?"]

In [None]:
# Load BLEU through the `evaluate` module imported at the top, rather than through a bare
# `load` name that this notebook never imports itself.
bleu = evaluate.load('bleu')

In [None]:
# BLEU of the generated questions against the references of the same rows (18..29).
bleu.compute(
    predictions=final_predictions,
    references=test_dataset[18:30]['question'],
)

{'bleu': 0.26661041178266703,
 'precisions': [0.5816993464052288,
  0.3404255319148936,
  0.23255813953488372,
  0.1623931623931624],
 'brevity_penalty': 0.906613349592095,
 'length_ratio': 0.9107142857142857,
 'translation_length': 153,
 'reference_length': 168}

In [None]:
# Load BERTScore through the `evaluate` module imported at the top, rather than through a bare
# `load` name that this notebook never imports itself.
bert_score = evaluate.load('bertscore')

In [None]:
# Predictions and references are Italian, so request BERTScore's default model for Italian
# instead of the English-only model that `lang='en'` selects.
bert_score.compute(predictions=final_predictions, references=test_dataset['question'][18:30], lang='it')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.9262901544570923,
  0.9262901544570923,
  0.8964501023292542,
  0.9376155138015747,
  0.8751688599586487,
  0.9920855760574341,
  0.8578260540962219,
  0.938665509223938,
  0.9087055325508118,
  0.8432421684265137,
  0.9674186706542969,
  0.9292482137680054],
 'recall': [0.9564969539642334,
  0.9564969539642334,
  0.9082342386245728,
  0.9040244221687317,
  0.8837687373161316,
  0.9920855760574341,
  0.8107935786247253,
  0.8659458160400391,
  0.8393846750259399,
  0.8463293313980103,
  0.933851420879364,
  0.9076690673828125],
 'f1': [0.9411512613296509,
  0.9411512613296509,
  0.9023036956787109,
  0.9205136299133301,
  0.8794477581977844,
  0.9920855760574341,
  0.8336470127105713,
  0.9008405208587646,
  0.872670590877533,
  0.8447829484939575,
  0.9503386616706848,
  0.9183318614959717],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.33.0)'}

In [None]:
# Reference questions of test rows 108..119, for comparison with the generated ones below.
test_dataset[108:120]['question']

['Quando Honda, Toyota e Nissan hanno aperto impianti di assemblaggio negli Stati Uniti?',
 'Quando Honda, Toyota e Nissan hanno aperto impianti di assemblaggio negli Stati Uniti?',
 "Nome di un' auto più grande che Toyota ha creato come acquirenti lamentato le piccole dimensioni compatte?",
 "Nome di un' auto più grande che Toyota ha creato come acquirenti lamentato le piccole dimensioni compatte?",
 'Denominare un tipo di camion compatti Toyota?',
 'Denominare un tipo di camion compatti Toyota?',
 'Che cosa ha fatto Mitsubishi ribattezzato il suo Forte?',
 'Mazda, Mitsubishi e Isuzu hanno aderito alla partnership con quale casa automobilistica americana?',
 'Mazda, Mitsubishi e Isuzu hanno aderito alla partnership con quale casa automobilistica americana?',
 'Quando le case automobilistiche americane si sono schierate con le loro auto sostitutive nazionali, quale politica si è conclusa?',
 'Quanti passeggeri può ospitare la Ford Fiesta?',
 'Quanti passeggeri può ospitare la Ford Fies

In [None]:
# Generate candidates for test rows 108..119 and score them against the references of the
# SAME rows. (The references were previously taken from rows 18..29, so candidates were
# ranked against unrelated questions.)
final_predictions = select_best_output(
    model, tokenizer, tokenized_test_dataset['input_ids'][108:120].to(device), test_dataset[108:120]['question'], score_function=bertscore_f1based_score,
    max_length=64, num_beams=4, top_k=None, top_p=None, num_candidates=4, verbose=False, tokenize_output=False
)

In [None]:
# Display the questions generated for test rows 108..119.
final_predictions

["Quando è stato aperto l' impianto di assemblaggio statunitense?",
 "In quale anno è stata la restrizione volontaria all' esportazione di Nissan?",
 "Qual è stato il nome dell' auto più grande di Toyota?",
 "Qual è stato il nome dell' auto più grande di Toyota?",
 "Qual è stato il nome dell' autocarro compatti?",
 "Qual è stato il nome dell' autocarro compatti della Toyota?",
 'Qual è stato il nome del forte di Mitsubishi?',
 'Quali aziende hanno collaborato con Mazda e Isuzu?',
 'Quale azienda ha sostituito il camion Mazda?',
 'Che tipo di politica hanno adottato i produttori americani?',
 'Quanti passeggeri hanno avuto la Ford Fiesta e la Chevrolet Chevette?',
 'Quanti passeggeri hanno avuto la Ford Fiesta e la Chevrolet Chevette?']

In [None]:
# BERTScore over the whole test split, generating one candidate per example.
# NOTE(review): `lang='en'` with `model_type=None` made the run download `roberta-large`
# (see the output), an English model, while the questions are Italian — confirm whether
# `lang='it'` (or an explicit multilingual `model_type`) is what is wanted here.
bertscore = compute_bert_score(
    test_dataset, tokenized_test_dataset, model, device, tokenizer, batch_size=16,
    lang='en', model_type=None, max_length=200, num_beams=4, num_candidates=1
)

  0%|          | 0/640 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the aggregated BERTScore: one pair of numbers per precision / recall / f1.
# NOTE(review): the pairs look like (mean, standard deviation) — confirm in `compute_bert_score`.
bertscore

{'precision': (0.8594681024077415, 0.016372987556414777),
 'recall': (0.866242572930475, 0.01608641018379286),
 'f1': (0.8625804557980571, 0.014883528373100353)}

In [None]:
# Average the first entry of each aggregated metric (precision, recall, f1) into one number.
np.mean([metric_stats[0] for metric_stats in bertscore.values()])

0.8627637103787579