# Install

In [None]:
%%capture
!pip install transformers datasets sentencepiece sacrebleu rouge_score bert_score

# Configuration

All tunable parameters (language, question-generation type, decoding arguments) are set in this section.

In [None]:
# Language code of the evaluation run: selects the xquad.<language>.json file and the
# model list, and is passed on to METEOR, BERTScore and the ROUGE stemmer switch.
language = "en" #@param ["en", "de"]

In [None]:
# Question-generation mode: "answer-agnostic" produces one example per paragraph with all
# of its questions as target; "answer-aware" produces one example per question with the
# answer highlighted in the context.
qg_type = "answer-agnostic" #@param ["answer-agnostic", "answer-aware"]

Decoding parameters from 
https://github.com/patil-suraj/question_generation/blob/c30e2976d65c4ef6200c7504097e8e07545fb240/pipelines.py#L221

In [None]:
# Decoding arguments used when qg_type is "answer-agnostic".
answer_agnostic_generation_args = {
    "max_length": 128,
    "num_beams": 4,
    # NOTE(review): the original remark here read "old results bug commented out" —
    # presumably older result runs had this argument commented out by mistake; it is active now. Confirm.
    "length_penalty": 1.5,
    "no_repeat_ngram_size": 3, # NOTE(review): same remark as for length_penalty — confirm
    "early_stopping": True,
}

Decoding parameters for answer-aware generation from
https://github.com/patil-suraj/question_generation/blob/c30e2976d65c4ef6200c7504097e8e07545fb240/pipelines.py#L67

In [None]:
# Decoding arguments used when qg_type is not "answer-agnostic" (i.e. answer-aware generation).
answer_aware_generation_args = {
    "max_length": 32,
    "num_beams": 4,
}

In [None]:
# Pick the decoding arguments for the configured mode; any value other than
# "answer-agnostic" falls through to the answer-aware settings.
if qg_type == "answer-agnostic":
    generation_args = answer_agnostic_generation_args
else:
    generation_args = answer_aware_generation_args

# Downloads

In [None]:
import requests
from pathlib import Path
from tqdm.auto import tqdm

def download_from_url(url: str) -> str:
    """Stream ``url`` into a file in the current working directory.

    The file is named after the last ``/``-separated segment of the URL and a
    progress bar is shown while the body is written in 1 KiB chunks.

    Args:
        url: Address of the file to download (redirects are followed).

    Returns:
        The name of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    block_size = 2**10 # 1 Kibibyte
    filename = url.split("/")[-1]

    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, allow_redirects=True, stream=True) as response:
        # Fail loudly instead of saving an HTML error page under the archive's name.
        response.raise_for_status()
        # Servers may omit content-length; the bar then runs without a total.
        total_size_in_bytes = int(response.headers.get('content-length', 0))

        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
            with open(filename, 'wb') as f:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    f.write(data)

    return filename

In [None]:
import tarfile
import zipfile

def unpack_download(filename: str) -> None:
    """Extract a downloaded archive into the current working directory.

    The archive type is chosen by substring match on ``filename`` (".tar.gz" is
    checked before ".zip"). Members are extracted one at a time so that a
    progress bar can be displayed.

    Args:
        filename: Path of the archive to unpack.

    Raises:
        ValueError: If ``filename`` contains neither ".tar.gz" nor ".zip".
    """
    is_tarball = ".tar.gz" in filename
    is_zip = ".zip" in filename
    if not is_tarball and not is_zip:
        raise ValueError(f"Unknown file extension '{filename}'.")

    if is_tarball:
        with tarfile.open(filename, 'r:gz') as archive:
            members = archive.getmembers()
            for member in tqdm(iterable=members, total=len(members)):
                archive.extract(member=member)
        return

    with zipfile.ZipFile(filename, 'r') as archive:
        names = archive.namelist()
        for name in tqdm(iterable=names, total=len(names)):
            archive.extract(member=name)

METEOR Score

In [None]:
# Fetch the METEOR 1.5 distribution and unpack it into the working directory;
# calc_meteor later runs the jar found under meteor-1.5/.
meteor = 'http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz'
print("Downloading", meteor)
filename = download_from_url(meteor)
# Report the local archive being unpacked (not the URL), matching the XQuAD cell.
print("Unpacking", filename)
unpack_download(filename)

Downloading http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz


  0%|          | 0.00/224M [00:00<?, ?iB/s]

Unpacking http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz


  0%|          | 0/161 [00:00<?, ?it/s]

Evaluation dataset: XQuAD

In [None]:
# Download a zip snapshot of the XQuAD repository's master branch and unpack it into
# the working directory (the data-loading cell reads from the "xquad-master" folder).
xquad_download_link = "https://github.com/deepmind/xquad/archive/refs/heads/master.zip" # From https://github.com/deepmind/xquad
print("Downloading", xquad_download_link)
filename = download_from_url(xquad_download_link)
print("Unpacking", filename)
unpack_download(filename)

Downloading https://github.com/deepmind/xquad/archive/refs/heads/master.zip


0.00iB [00:00, ?iB/s]

Unpacking master.zip


  0%|          | 0/16 [00:00<?, ?it/s]

# Load Data

In [None]:
from pathlib import Path

# Path of the XQuAD file for the configured language inside the unpacked archive.
# Note: this rebinds `filename`, which the download cells used for the archive name.
filename = f"xquad.{language}.json"
qg_eval_test = Path("xquad-master") / filename

Select the list of transformer models to evaluate. For each of these models, the evaluation loop below generates a gold-reference and a model-hypothesis text file.

In [None]:
# Root folder of locally stored (Google Drive) model checkpoints.
qg_models_path = Path("drive") / "MyDrive" / "mt5" # ToDo Path to models for eval

# Root folder for everything the evaluation writes (per-model text files, result sheet).
generated_output_path = Path("evaluation")

# Candidate models per language / generation mode. Entries are either Hugging Face hub
# ids or local checkpoint paths; commented-out entries are simply not evaluated.
answer_agnostic_models_english = list([
    #str(qg_models_path / "M-2-mt5-base-e2e-qg-squad"), # M-2 "cross lingual zero shot transfer" in mt5 paper lr7e-4
    #str(qg_models_path / "mt5-base-e2e-qg-squad-20ep-adamw"),
    #str(qg_models_path / "mt5-base-e2e-qg-squad-lr5e-4"),
    "valhalla/t5-base-e2e-qg", # only squad finetuning
])

answer_agnostic_models_german = list([
    "tilomichel/mT5-base-GermanQuAD-e2e-qg", # M-1 Only finetuned with germanquad training
    #str(qg_models_path / "M-2-mt5-base-e2e-qg-squad"), # M-2 "cross lingual zero shot transfer" in mt5 paper
    #str(qg_models_path / "M-3-mt5-base-e2e-qg-translated-squad"), # M-3 finetuned only with translated SQuAD
    #str(qg_models_path / "M-4-mt5-base-e2e-qg-squad-germanquad"), # M-4 warm-started on squad finetuned germanquad training
    #str(qg_models_path / "M-5-mt5-base-e2e-qg-translated-squad-germanquad"), # M-5 warm-started on translated-squad mlqa further finetuned on germanquad training
    #str(qg_models_path / "M-6-mt5-base-e2e-qg-squad+germanquad"), # M-6 train squad, train germanquad
    #str(qg_models_path / "M-7-mt5-base-e2e-qg-translated-squad+germanquad"), # M-7 train translated-squad, train germanquad
    #str(qg_models_path / "mt5-base-e2e-qg-squad+translated-squad+germanquad"), # train squad, train translated-squad, train germanquad
])

# Currently empty: with language "de" and qg_type "answer-aware" no model is evaluated.
answer_aware_models_german = list([
    #str(qg_models_path / "M-9-mt5-base-hl-qg-germanquad"),
])

# Resolve the model list for the configured (language, qg_type) pair.
# Unsupported combinations raise immediately; previously the ValueError objects were
# created but never raised, leaving model_paths_to_evaluate as None until the
# evaluation loop failed on it.
model_paths_to_evaluate = None
if language == "en":
    if qg_type == "answer-agnostic":
        model_paths_to_evaluate = answer_agnostic_models_english
    else:
        # No answer-aware models are configured for English.
        raise ValueError(f"Unknown qg_type for {language}")
elif language == "de":
    if qg_type == "answer-agnostic":
        model_paths_to_evaluate = answer_agnostic_models_german
    elif qg_type == "answer-aware":
        model_paths_to_evaluate = answer_aware_models_german
    else:
        raise ValueError(f"Unknown qg_type for {language}")
else:
    raise ValueError(f"Unknown language '{language}'")


In [None]:
import json

def generate_examples_e2e(filepath):
    """Yield question-generation examples from a SQuAD-format JSON file.

    What is yielded depends on the module-level ``qg_type``:

    * ``"answer-agnostic"``: one example per paragraph; ``input_text`` is the
      cleaned context and ``target_text`` is the list of all its questions.
    * ``"answer-aware"``: one example per question; ``input_text`` is the cleaned
      context with the first occurrence of the question's first answer wrapped
      in ``<hl>`` tokens and ``target_text`` is the question string.

    Any other ``qg_type`` yields nothing.

    Args:
        filepath: Path of the JSON file (read as UTF-8).

    Yields:
        Dicts with the keys ``prefix``, ``input_text`` and ``target_text``.
    """
    def highlight_text(text: str,
                       context: str,
                       highlight_token: str = "<hl>") -> str:
        # Only the first occurrence of the answer text is marked.
        return context.replace(text, f"{highlight_token}{text}{highlight_token}", 1)

    def clean_context(raw_context: str) -> str:
        # Drop everything up to the first "==\n", flatten newlines and collapse
        # doubled single quotes.
        return raw_context.split("==\n", 1)[-1].replace("\n", " ").replace("''", "'")  # do not strip leading blank spaces GH-2585

    # Load everything up front so the file handle is closed before the first yield
    # instead of staying open for as long as the generator is alive.
    with open(filepath, encoding="utf-8") as f:
        squad = json.load(f)

    mode = qg_type  # read once so that a run is consistent from start to end
    for article in squad["data"]:
        for paragraph in article["paragraphs"]:
            context = clean_context(paragraph["context"])
            if mode == "answer-agnostic":
                questions = [qa["question"].strip() for qa in paragraph["qas"]]
                yield {
                    "prefix": "generate question: ",
                    "input_text": context,
                    "target_text": questions,
                }
            elif mode == "answer-aware":
                for qa in paragraph["qas"]:
                    question = qa["question"].strip()
                    answer = qa["answers"][0]["text"]  # only the first listed answer is used
                    yield {
                        "prefix": "generate question: ",
                        "input_text": highlight_text(answer, context),
                        "target_text": question,
                    }

In [None]:
import pandas as pd

# Materialise all examples of the evaluation file into a DataFrame with the columns
# prefix / input_text / target_text; the bare expression displays it.
df_test = pd.DataFrame(list(generate_examples_e2e(str(qg_eval_test.resolve()))))
df_test

Unnamed: 0,prefix,input_text,target_text
0,generate question:,"The Panthers defense gave up just 308 points, ...",[How many points did the Panthers defense surr...
1,generate question:,The Broncos defeated the Pittsburgh Steelers i...,[Who lost to the Broncos in the divisional rou...
2,generate question:,Peyton Manning became the first quarterback ev...,[How old was Peyton Manning when he played in ...
3,generate question:,Six-time Grammy winner and Academy Award nomin...,"[How many Grammys has Lady Gaga won?, What did..."
4,generate question:,"With 4:51 left in regulation, Carolina got the...",[On what yard line did Carolina begin with 4:5...
...,...,...,...
235,generate question:,Aristotle provided a philosophical discussion ...,[Who provided a philosophical discussion of fo...
236,generate question:,The development of fundamental theories for fo...,[Who formed the universal theory of gravitatio...
237,generate question:,"Since then, and so far, general relativity has...","[What theory best explains gravity?, What spac..."
238,generate question:,Through combining the definition of electric c...,[What is the time rate of change of electric ...


For translation pipeline gold reference and hypothesis generation see https://colab.research.google.com/drive/140OhSFN8Pz9xaLgS8qnrLuG35LWR__KV?usp=sharing

In [None]:
import torch

# device (int, optional, defaults to -1) - Device ordinal for CPU/GPU supports.
# Setting this to -1 will leverage CPU, a positive will run the model on the
# associated CUDA device id.
# is_available must be *called*: the bare function object is always truthy, which
# selected GPU 0 even on machines without CUDA.
device = 0 if torch.cuda.is_available() else -1
print(device)

0


A minimal `Dataset` wrapper around a list; it is needed to enable progress bars for model inference with pipelines.

In [None]:
from torch.utils.data import Dataset
from typing import List

class ListDataset(Dataset):
    """Expose a plain Python list through the torch ``Dataset`` interface.

    Per the notebook's note, passing a ``Dataset`` to the generation pipeline is
    what allows a progress bar to be shown during inference.
    """

    def __init__(self, original_list: List[str]):
        # The list is stored by reference, not copied.
        self.original_list = original_list

    def __len__(self) -> int:
        """Return the number of wrapped items."""
        return len(self.original_list)

    def __getitem__(self, i: int):
        """Return the item at position ``i`` of the wrapped list."""
        return self.original_list[i]

In [None]:
# Model inputs (contexts) as a plain list; the bare expression displays the column.
context_list = list(df_test["input_text"])
df_test["input_text"]

0      The Panthers defense gave up just 308 points, ...
1      The Broncos defeated the Pittsburgh Steelers i...
2      Peyton Manning became the first quarterback ev...
3      Six-time Grammy winner and Academy Award nomin...
4      With 4:51 left in regulation, Carolina got the...
                             ...                        
235    Aristotle provided a philosophical discussion ...
236    The development of fundamental theories for fo...
237    Since then, and so far, general relativity has...
238    Through combining the definition of electric c...
239    where  is the relevant cross-sectional area fo...
Name: input_text, Length: 240, dtype: object

In [None]:
# Gold references as a plain list (a list of questions per row in answer-agnostic mode,
# a single question string per row in answer-aware mode); the bare expression displays the column.
gold_reference_list = list(df_test["target_text"])
df_test["target_text"]

0      [How many points did the Panthers defense surr...
1      [Who lost to the Broncos in the divisional rou...
2      [How old was Peyton Manning when he played in ...
3      [How many Grammys has Lady Gaga won?, What did...
4      [On what yard line did Carolina begin with 4:5...
                             ...                        
235    [Who provided a philosophical discussion of fo...
236    [Who formed the universal theory of gravitatio...
237    [What theory best explains gravity?, What spac...
238    [What is  the time rate of change of electric ...
239    [What causes strain in structures?, What is us...
Name: target_text, Length: 240, dtype: object

# Auxiliary & metric functions

In [None]:
from typing import List
import pandas as pd

def write_to_file(file_name: str, items: List[str]) -> None:
    """Write ``items`` to ``file_name``, one item per line.

    Lines are separated by ``\\n`` and no newline is written after the last item.
    An empty ``items`` list produces an empty file (it used to raise IndexError).

    Args:
        file_name: Path of the file to create or overwrite.
        items: Values to write; each is formatted with ``f"{item}"``.
    """
    # Explicit UTF-8 keeps umlauts intact regardless of the platform default encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write("\n".join(f"{item}" for item in items))

In [None]:
from sacrebleu.metrics import BLEU
from typing import List, Dict, Any
import pandas as pd

def calc_bleu(hyp: List[str], refs: List[str]) -> Dict[str, Any]:
    """Compute lowercased corpus-level BLEU of ``hyp`` against ``refs``.

    Args:
        hyp: Hypothesis strings, one per example.
        refs: Reference strings, aligned with ``hyp`` (exactly one per hypothesis).

    Returns:
        A dict with the overall score, the 1- to 4-gram precisions, brevity
        penalty, hypothesis/reference lengths and their ratio, plus the raw
        ``counts``, ``totals`` and ``precisions`` of the sacreBLEU result.
    """
    assert len(hyp) == len(refs)

    # corpus_score takes a list of reference streams, hence the extra nesting.
    score = BLEU(lowercase=True).corpus_score(hyp, [refs])

    report = {"BLEU_score": score.score}
    for order in range(1, 5):
        report[f"BLEU-{order}"] = score.precisions[order - 1]
    report.update({
        "BLEU-brevity-penalty": score.bp,
        "BLEU-hyp-ref-ratio": score.sys_len / score.ref_len,
        "BLEU-hyp-len": score.sys_len,
        "BLEU-ref-len": score.ref_len,
        "counts": score.counts,
        "totals": score.totals,
        "precisions": score.precisions,
    })
    return report

In [None]:
import datasets
import pandas as pd

def calc_rouge(hyp: List[str], refs: List[str]) -> Dict[str, Any]:
    """Compute aggregated ROUGE-1/2/L precision, recall and F-measure.

    German umlauts and ß are transliterated to ASCII before scoring. The Porter
    stemmer is enabled only when the module-level ``language`` is "en".

    Args:
        hyp: Hypothesis strings, one per example.
        refs: Reference strings, aligned with ``hyp``.

    Returns:
        A dict with ``ROUGE-{1,2,L}-{precision,recall,fmeasure}`` taken from the
        ``mid`` value of each aggregated score.
    """
    assert len(hyp) == len(refs)

    # Umlaute have to be replaced otherwise words are split at them.
    # Uppercase forms are included as well: the replacement runs before any
    # lowercasing done by the metric, so "Über" would otherwise still be split.
    umlaut_table = str.maketrans({
        "ä": "ae", "ö": "oe", "ü": "ue", "ß": "ss",
        "Ä": "Ae", "Ö": "Oe", "Ü": "Ue",
    })

    def replace_umlaute(text: str) -> str:
        return text.translate(umlaut_table)

    hyp_no_umlaute = [replace_umlaute(h) for h in hyp]
    refs_no_umlaute = [replace_umlaute(r) for r in refs]

    # NOTE(review): datasets.load_metric is deprecated in newer `datasets` releases
    # in favour of the separate `evaluate` package — confirm against the pinned version.
    metric = datasets.load_metric('rouge')
    metric.add_batch(predictions=hyp_no_umlaute, references=refs_no_umlaute)

    result = metric.compute(
        rouge_types=['rouge1', 'rouge2', 'rougeL'],
        use_aggregator=True, # aggregate scores
        use_stemmer=(language == "en")) # Porter-Stemmer (only english)

    report = {}
    for label, rouge_type in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL")):
        mid = result[rouge_type].mid
        report[f"{label}-precision"] = mid.precision
        report[f"{label}-recall"] = mid.recall
        report[f"{label}-fmeasure"] = mid.fmeasure
    return report

In [None]:
import subprocess

def calc_meteor(hyp_file_path: str, ref_file_path: str, langcode: str = language):
    """Run the METEOR 1.5 jar on two text files and parse its summary statistics.

    Args:
        hyp_file_path: Path of the hypothesis file.
        ref_file_path: Path of the reference file.
        langcode: Language code passed to METEOR via ``-l``. The default is the
            value the module-level ``language`` had when this function was
            defined, so re-run this cell after changing ``language``.

    Returns:
        A dict with the METEOR precision, recall, f1, fMean, fragmentation
        penalty and final score, as the (stripped) strings printed by METEOR.

    Raises:
        RuntimeError: If the expected statistics are missing from METEOR's output
            (e.g. the jar failed); the message carries the exit code and stderr.
    """
    METEOR_JAR = 'meteor-1.5/meteor-1.5.jar'
    meteor_cmd = [
        "java",
        "-jar",
        "-Xmx2G",
        METEOR_JAR,
        hyp_file_path,
        ref_file_path,
        '-l',
        langcode
    ]
    process = subprocess.run(meteor_cmd, capture_output=True)
    stdout_text = process.stdout.decode("utf-8")

    # Keep only "name: value" lines, i.e. lines that contain exactly one colon.
    stats = {}
    for line in stdout_text.split("\n"):
        parts = line.split(":")
        if len(parts) == 2:
            stats[parts[0].strip()] = parts[1].strip()

    wanted = {
        'METEOR-precision': 'Precision',
        'METEOR-recall': 'Recall',
        'METEOR-f1': 'f1',
        'METEOR-fMean': 'fMean',
        'METEOR-fragmentation-penalty': 'Fragmentation penalty',
        'METEOR-score': 'Final score',
    }
    missing = [key for key in wanted.values() if key not in stats]
    if missing:
        # Surface what METEOR reported instead of failing with a bare KeyError.
        stderr_text = process.stderr.decode("utf-8", errors="replace")
        raise RuntimeError(
            f"METEOR output lacks {missing} (exit code {process.returncode}): {stderr_text}")

    return {column: stats[key] for column, key in wanted.items()}

In [None]:
from bert_score import BERTScorer
import numpy as np

# Scorer instance created once at cell level and reused by calc_bert_score.
scorer = BERTScorer(lang=language, rescale_with_baseline=True)

def calc_bert_score(hyp: List[str], refs: List[str]):
    """Return the mean BERTScore F1 of ``hyp`` against ``refs``.

    Precision and recall returned by the scorer are discarded; the per-pair F1
    values are averaged and reported under the key ``bertscore``.
    """
    _, _, f1_per_pair = scorer.score(hyp, refs)
    mean_f1 = np.mean(f1_per_pair.tolist())
    return {"bertscore": mean_f1}

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Evaluation loop

In [None]:
from transformers import pipeline, set_seed
from tqdm.auto import tqdm
import pathlib
import pandas as pd
import torch
import gc

# Column groups of the results table. The same lists drive the table layout and the
# assembly of each result row, so the two cannot drift apart.
bleu_columns = [
    'BLEU_score',
    'BLEU-1',
    'BLEU-2',
    'BLEU-3',
    'BLEU-4',
    'BLEU-brevity-penalty',
    'BLEU-hyp-ref-ratio',
    'BLEU-hyp-len',
    'BLEU-ref-len',
]
rouge_columns = [
    'ROUGE-1-precision',
    'ROUGE-1-recall',
    'ROUGE-1-fmeasure',
    'ROUGE-2-precision',
    'ROUGE-2-recall',
    'ROUGE-2-fmeasure',
    'ROUGE-L-precision',
    'ROUGE-L-recall',
    'ROUGE-L-fmeasure',
]
meteor_columns = [
    'METEOR-precision',
    'METEOR-recall',
    'METEOR-f1',
    'METEOR-fMean',
    'METEOR-fragmentation-penalty',
    'METEOR-score',
]
# 'BERT-score' is declared up front now; it used to appear only as a side effect of
# appending the first row.
result_columns = ['model_path', *bleu_columns, *rouge_columns, *meteor_columns, 'BERT-score']

# Rows are collected as dicts and the frame is rebuilt from them after every model.
# This replaces DataFrame.append (deprecated, removed in pandas 2.0) while still
# leaving partial results in results_df if a later model fails.
result_rows = []
results_df = pd.DataFrame(columns=result_columns)

# Model input format: task prefix, context, explicit end-of-sequence token.
texts_formatted = [f"generate question: {text} </s>"
                    for text in context_list]
context_dataset = ListDataset(texts_formatted)

# In answer-agnostic mode each reference is a list of questions, which is joined into
# one string; in answer-aware mode the references are already single strings.
questions_gold_ref = [" ".join(question) for question in gold_reference_list] if qg_type == "answer-agnostic" else gold_reference_list
set_seed(42) # for reproducibility

for model_path in model_paths_to_evaluate:
    generation_pipeline = pipeline(
        "text2text-generation",
        model=model_path,
        tokenizer=model_path,
        device=device)
    generated_questions = []
    print("Generating question with model:", generation_pipeline.model.name_or_path)
    print(generation_pipeline.model.config)
    with tqdm(generation_pipeline(context_dataset,
                                  batch_size=6,
                                  **generation_args),
                total=len(context_dataset)) as pbar:
        for out in pbar:
            pbar.set_description("Generating questions")
            generated_text = out[0]['generated_text']
            if qg_type == "answer-agnostic":
                # Drop the question separator so the output is one plain string.
                generated_text = generated_text.replace("<sep>", "")
            generated_questions.append(generated_text)
            # Per-item cleanup kept from the original (memory over speed).
            torch.cuda.empty_cache()
            gc.collect()

    questions_hypothesis = generated_questions

    # for each model tested generate one directory
    # with hypothesis and gold_reference.txt
    # so that the results can be reproduced
    eval_result_path = generated_output_path / pathlib.PurePath(model_path).name
    eval_result_path.mkdir(parents=True, exist_ok=True)
    hyp_path = eval_result_path / "xquad_hypothesis.txt"
    ref_path = eval_result_path / "xquad_gold_reference.txt"
    write_to_file(str(hyp_path), questions_hypothesis)
    write_to_file(str(ref_path), questions_gold_ref)

    bleu_result = calc_bleu(questions_hypothesis, questions_gold_ref)
    rouge_result = calc_rouge(questions_hypothesis, questions_gold_ref)
    meteor_result = calc_meteor(str(hyp_path), str(ref_path))
    bert_score_result = calc_bert_score(questions_hypothesis, questions_gold_ref)

    # Only the tabulated keys are copied; calc_bleu returns a few extra raw fields.
    results_dict = {'model_path': model_path}
    results_dict.update({column: bleu_result[column] for column in bleu_columns})
    results_dict.update({column: rouge_result[column] for column in rouge_columns})
    results_dict.update({column: meteor_result[column] for column in meteor_columns})
    results_dict['BERT-score'] = bert_score_result['bertscore']

    result_rows.append(results_dict)
    results_df = pd.DataFrame(result_rows, columns=result_columns)

    # Memory cleanup
    del generation_pipeline
    gc.collect()
    torch.cuda.empty_cache()

Generating question with model: drive/MyDrive/mt5/M-2-mt5-base-e2e-qg-squad
MT5Config {
  "_name_or_path": "drive/MyDrive/mt5/M-2-mt5-base-e2e-qg-squad",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 250112
}



  0%|          | 0/240 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Generating question with model: drive/MyDrive/mt5/mt5-base-e2e-qg-squad-20ep-adamw
MT5Config {
  "_name_or_path": "drive/MyDrive/mt5/mt5-base-e2e-qg-squad-20ep-adamw",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 250112
}



  0%|          | 0/240 [00:00<?, ?it/s]

Generating question with model: drive/MyDrive/mt5/mt5-base-e2e-qg-squad-lr5e-4
MT5Config {
  "_name_or_path": "drive/MyDrive/mt5/mt5-base-e2e-qg-squad-lr5e-4",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "use_cache": true,
  "vocab_size": 250112
}



  0%|          | 0/240 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/195 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Generating question with model: valhalla/t5-base-e2e-qg
T5Config {
  "_name_or_path": "valhalla/t5-base-e2e-qg",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 1.5,
      "max_length": 256,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "generate questions: "
    },
    "translation_en_to_de": {
      "early_stopping": true,


  0%|          | 0/240 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (657 > 512). Running this sequence through the model will result in indexing errors


In [None]:
import time
# The output directory is otherwise only created inside the evaluation loop, so make
# sure it exists even when the model list was empty and the loop never ran.
generated_output_path.mkdir(parents=True, exist_ok=True)
# Timestamp (day-month-year, then time) keeps result sheets of different runs apart.
timestr = time.strftime("%d%m%Y-%H%M%S")
results_df.to_excel(str(generated_output_path / f"{timestr}_results.xlsx"))

In [None]:
from google.colab import data_table
# Show the results as an interactive Colab table without the index column;
# max_columns is raised to 26 so that all 26 result columns can be shown.
data_table.DataTable(results_df, include_index=False, max_columns=26)

Unnamed: 0,model_path,BLEU_score,BLEU-1,BLEU-2,BLEU-3,BLEU-4,BLEU-brevity-penalty,BLEU-hyp-ref-ratio,BLEU-hyp-len,BLEU-ref-len,...,ROUGE-L-precision,ROUGE-L-recall,ROUGE-L-fmeasure,METEOR-precision,METEOR-recall,METEOR-f1,METEOR-fMean,METEOR-fragmentation-penalty,METEOR-score,BERT-score
0,drive/MyDrive/mt5/M-2-mt5-base-e2e-qg-squad,7.179412,56.190912,26.068376,14.906219,9.078452,0.340251,0.481215,6558,13628,...,0.44278,0.227468,0.286585,0.4407716941341872,0.2104830267683564,0.2849114403857447,0.228381279247766,0.5472476169591376,0.1034001684213467,0.295765
1,drive/MyDrive/mt5/mt5-base-e2e-qg-squad-20ep-a...,7.229403,57.479203,27.842114,16.228145,10.015926,0.320117,0.467493,6371,13628,...,0.459629,0.225307,0.288365,0.4630692751763951,0.2153731120083745,0.2940048344681063,0.2341610552109434,0.545865641409702,0.1063405806150491,0.306331
2,drive/MyDrive/mt5/mt5-base-e2e-qg-squad-lr5e-4,7.441797,55.666122,25.797012,14.648968,8.764344,0.359134,0.494056,6733,13628,...,0.440987,0.229071,0.288002,0.4399637380071013,0.2172274562584118,0.2908505302175813,0.2350791511310676,0.5484168079157548,0.1061577934602222,0.298727
3,valhalla/t5-base-e2e-qg,13.591,52.840451,24.958712,13.91829,8.253692,0.688883,0.7285,9928,13628,...,0.396734,0.297443,0.323297,0.4137357242519766,0.2991102138477643,0.3472070873753173,0.3120794591390021,0.5472233477376915,0.1413022927487892,0.358147


In [None]:
# Bundle the evaluation folder into one archive for download.
# NOTE(review): the absolute /content paths presumably assume Colab's default working directory — confirm.
!zip -r /content/evaluation.zip /content/evaluation


  adding: content/evaluation/ (stored 0%)
  adding: content/evaluation/mt5-base-e2e-qg-squad-lr5e-4/ (stored 0%)
  adding: content/evaluation/mt5-base-e2e-qg-squad-lr5e-4/xquad_gold_reference.txt (deflated 65%)
  adding: content/evaluation/mt5-base-e2e-qg-squad-lr5e-4/xquad_hypothesis.txt (deflated 60%)
  adding: content/evaluation/29062022-200926_results.xlsx (deflated 8%)
  adding: content/evaluation/M-2-mt5-base-e2e-qg-squad/ (stored 0%)
  adding: content/evaluation/M-2-mt5-base-e2e-qg-squad/xquad_gold_reference.txt (deflated 65%)
  adding: content/evaluation/M-2-mt5-base-e2e-qg-squad/xquad_hypothesis.txt (deflated 60%)
  adding: content/evaluation/t5-base-e2e-qg/ (stored 0%)
  adding: content/evaluation/t5-base-e2e-qg/xquad_gold_reference.txt (deflated 65%)
  adding: content/evaluation/t5-base-e2e-qg/xquad_hypothesis.txt (deflated 61%)
  adding: content/evaluation/mt5-base-e2e-qg-squad-20ep-adamw/ (stored 0%)
  adding: content/evaluation/mt5-base-e2e-qg-squad-20ep-adamw/xquad_gold_

In [None]:
# Colab-specific: `google.colab.files` is only importable inside a Colab runtime.
# Hand the zipped evaluation archive to the browser as a file download.
from google.colab import files
files.download("/content/evaluation.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

German answer-agnostic models

|model\_path|BLEU\_score|BLEU-1|BLEU-2|BLEU-3|BLEU-4|BLEU-brevity-penalty|BLEU-hyp-ref-ratio|BLEU-hyp-len|BLEU-ref-len|ROUGE-1-precision|ROUGE-1-recall|ROUGE-1-fmeasure|ROUGE-2-precision|ROUGE-2-recall|ROUGE-2-fmeasure|ROUGE-L-precision|ROUGE-L-recall|ROUGE-L-fmeasure|METEOR-precision|METEOR-recall|METEOR-f1|METEOR-fMean|METEOR-fragmentation-penalty|METEOR-score|BERT-score|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|drive/MyDrive/mt5/M-1-mt5-base-e2e-qg-germanquad|1\.7283780471679064|49\.210584834334|16\.960300681230915|7\.144635299975106|3\.230076780513635|0\.14671262762175402|0\.34255027422303475|4497|13128|0\.4490004408992204|0\.16331239643420603|0\.2311977049001755|0\.15195704760168421|0\.051718694414973024|0\.0746574903742914|0\.3339896480586514|0\.12086889131216433|0\.17113000559087282|0\.3956829821566381|0\.13454831457103592|0\.20081228204437784|0\.1391396395927747|0\.39984816276951113|0\.0835049103331918|0\.3319405864943595|
|drive/MyDrive/mt5/M-2-mt5-base-e2e-qg-squad|0\.8331015947116223|27\.671947062362143|5\.793132504739836|2\.1965830929664967|0\.9374267635340989|0\.19545123747982582|0\.3798750761730652|4987|13128|0\.20624197788166604|0\.08625756421192557|0\.11688739978641705|0\.05296744448179816|0\.020594845519977728|0\.028531628871381873|0\.18081483963142742|0\.07491662689214021|0\.10166673112873459|0\.16251533058311962|0\.058017207820581056|0\.08550834064414925|0\.05994443553535369|0\.4183427762039661|0\.034867113955514145|0\.22442210963539158|
|drive/MyDrive/mt5/M-3-mt5-base-e2e-qg-translated-squad|4\.179212969719485|46\.38675142188023|17\.67166260020913|8\.966897053473991|4\.96386458729555|0\.30238561919841705|0\.4553625837903717|5978|13128|0\.4315479484990101|0\.20365891909505857|0\.266041293879649|0\.1687540811634413|0\.07343116241166396|0\.0983097654530843|0\.32178690349780303|0\.15197113772627338|0\.19863826987368294|0\.38027355037565014|0\.1726962891168082|0\.23752409737003047|0\.177541968705351|0\.3620089390731593|0\.11327018897336683|0\.33474112004041673|
|drive/MyDrive/mt5/M-4-mt5-base-e2e-qg-squad-germanquad|1\.9789401607579706|50\.42581801882564|18\.664140217906205|8\.689100954294325|4\.436130411544629|0\.14339124648275428|0\.3398842169408897|4462|13128|0\.4664806512804891|0\.1720457663668034|0\.24160928410935173|0\.17129815699320786|0\.05854645475322953|0\.08409747909522067|0\.3500948799700978|0\.1285056216306233|0\.1807993672578358|0\.4172473622508793|0\.1413694755825316|0\.21118604665902335|0\.1462028349739423|0\.3761558592614132|0\.09120778195786444|0\.3388309199362993|
|drive/MyDrive/mt5/M-5-mt5-base-e2e-qg-translated-squad-germanquad|2\.2860128589886783|50\.64088637844884|18\.93192757277103|9\.168081494057725|5\.124903425186711|0\.15691468814917578|0\.35062461913467396|4603|13128|0\.46715032250473354|0\.17824209063101037|0\.2480626436526392|0\.17247644318859737|0\.06239705883276431|0\.08790312441924|0\.3590601385567731|0\.1365783171089211|0\.18993609163018305|0\.41805536553108036|0\.1457786816870973|0\.21617552313493454|0\.15068571186937257|0\.3745996066310762|0\.09423890347818192|0\.3408313267243405|
|drive/MyDrive/mt5/M-6-mt5-base-e2e-qg-squad+germanquad|2\.1029557452586967|51\.52877697841727|19\.866920152091254|9\.299395161290322|5\.042918454935623|0\.14206950789555958|0\.3388177940280317|4448|13128|0\.4719418171821641|0\.1721423559065412|0\.24336900780297993|0\.17538075508450796|0\.05973508402694738|0\.08585330002300687|0\.35707804160951306|0\.12931340546753886|0\.18286268068820188|0\.4210748606003084|0\.1412439522375728|0\.21153223453051806|0\.1460985387535659|0\.37771660388293254|0\.09091469486331|0\.34492006301879885|
|drive/MyDrive/mt5/M-7-mt5-base-e2e-qg-translated-squad+germanquad|3\.8147711962811686|48\.64768683274021|18\.773234200743495|9\.474708171206226|5\.122448979591836|0\.26291011836078065|0\.42809262644728824|5620|13128|0\.45239547337228797|0\.20796463071714766|0\.2733349802920817|0\.17488717848677665|0\.07567920848312078|0\.10094225603441476|0\.3374193247101078|0\.1526341674056758|0\.20148081887415376|0\.3989006074609623|0\.1703796683959229|0\.23877360978238774|0\.17540391073178221|0\.36148875059837243|0\.11199737019128182|0\.3379837230158349|
|drive/MyDrive/mt5/mt5-base-e2e-qg-squad+translated-squad+germanquad|3\.6486207040693523|49\.37865497076023|19\.189602446483182|9\.815705128205128|5\.134680134680135|0\.24681337196979636|0\.41681901279707495|5472|13128|0\.45884529782724304|0\.2030384035756327|0\.27002800969161656|0\.18181184064403705|0\.07656601463941973|0\.10307700156602087|0\.34323780972552265|0\.15192477298178708|0\.2023822368494665|0\.4145655909872064|0\.17276937865944247|0\.24389570950129064|0\.17795912962129976|0\.36054549729602636|0\.11379676673362021|0\.3482872312888503|

English answer-agnostic models

|model\_path|BLEU\_score|BLEU-1|BLEU-2|BLEU-3|BLEU-4|BLEU-brevity-penalty|BLEU-hyp-ref-ratio|BLEU-hyp-len|BLEU-ref-len|ROUGE-1-precision|ROUGE-1-recall|ROUGE-1-fmeasure|ROUGE-2-precision|ROUGE-2-recall|ROUGE-2-fmeasure|ROUGE-L-precision|ROUGE-L-recall|ROUGE-L-fmeasure|METEOR-precision|METEOR-recall|METEOR-f1|METEOR-fMean|METEOR-fragmentation-penalty|METEOR-score|BERT-score|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|drive/MyDrive/mt5/M-2-mt5-base-e2e-qg-squad|7\.179411736231856|56\.19091186337298|26\.068376068376068|14\.906219151036526|9\.07845152449469|0\.34025069899192967|0\.48121514528911064|6558|13628|0\.5747979310830673|0\.2974310382807994|0\.3738986772436867|0\.27301785201344947|0\.13194596878922554|0\.16988433132944183|0\.4427798568453755|0\.22746766635049492|0\.28658510722836616|0\.44077169413418726|0\.21048302676835648|0\.28491144038574473|0\.22838127924776605|0\.5472476169591376|0\.10340016842134674|0\.29576503762121625|
|drive/MyDrive/mt5/mt5-base-e2e-qg-squad-20ep-adamw|7\.229403055042123|57\.47920263694867|27\.842113847659437|16\.228144627397725|10\.01592638471067|0\.32011734403713343|0\.4674933959495157|6371|13628|0\.5874168235389795|0\.291274022060941|0\.3714555073127584|0\.28863765670524966|0\.13226504776061276|0\.17284837233703743|0\.45962903201101324|0\.2253067312711045|0\.2883653175881331|0\.4630692751763951|0\.2153731120083745|0\.2940048344681063|0\.2341610552109434|0\.545865641409702|0\.10634058061504915|0\.30633143344894054|
|drive/MyDrive/mt5/mt5-base-e2e-qg-squad-lr5e-4|7\.441797389080008|55\.66612208525174|25\.79701216694902|14\.648968495122341|8\.764343921503409|0\.35913367822523945|0\.49405635456413266|6733|13628|0\.571096904482407|0\.29923262654711436|0\.3752198978371245|0\.26822089004334193|0\.13046076702435921|0\.16791333862512436|0\.4409868921989193|0\.22907100869341146|0\.2880020800196138|0\.43996373800710137|0\.21722745625841183|0\.2908505302175813|0\.23507915113106762|0\.5484168079157548|0\.10615779346022222|0\.29872712910485766|
|valhalla/t5-base-e2e-qg|13\.590999583924525|52\.84045124899275|24\.95871180842279|13\.918289585097375|8\.253692441355343|0\.6888833539519681|0\.7285001467566774|9928|13628|0\.5364232603438603|0\.4076001555812505|0\.4404386861756581|0\.2531991222307374|0\.18162612680181217|0\.20109494918329512|0\.3967335283759914|0\.29744330473537645|0\.32329713548854766|0\.41373572425197663|0\.2991102138477643|0\.3472070873753173|0\.3120794591390021|0\.5472233477376915|0\.14130229274878925|0\.358147108516035|

German answer-aware model

|model\_path|BLEU\_score|BLEU-1|BLEU-2|BLEU-3|BLEU-4|BLEU-brevity-penalty|BLEU-hyp-ref-ratio|BLEU-hyp-len|BLEU-ref-len|ROUGE-1-precision|ROUGE-1-recall|ROUGE-1-fmeasure|ROUGE-2-precision|ROUGE-2-recall|ROUGE-2-fmeasure|ROUGE-L-precision|ROUGE-L-recall|ROUGE-L-fmeasure|METEOR-precision|METEOR-recall|METEOR-f1|METEOR-fMean|METEOR-fragmentation-penalty|METEOR-score|BERT-score|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|drive/MyDrive/mt5/M-10-mt5-base-hl-qg-germanquad|8\.93069461067753|37\.533812144389515|13\.209526807260518|7\.265315909363386|4\.33505803384142|0\.7989054794723728|0\.8166514320536259|10721|13128|0\.31489280094322125|0\.2722019264464105|0\.28170938012151525|0\.14115988353383313|0\.1244837559924781|0\.12776305566069682|0\.2959205943449643|0\.2575346418293563|0\.2657151498872775|0\.27002471460515476|0\.21903188134071644|0\.24186984385342242|0\.22111974995627676|0\.3258737419945106|0\.14906262960913433|0\.469804469276877|