# Install

In [None]:
%%capture
!pip install transformers datasets pandas sentencepiece numpy tqdm datasets sentencepiece sacrebleu rouge_score bert_score

In [None]:
!nvidia-smi

Wed Jun 29 16:45:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Utility

In [None]:
import requests
from pathlib import Path
from tqdm.auto import tqdm

def download_from_url(url: str) -> str:
    """Stream the resource at ``url`` into a file in the working directory.

    The local file name is the last ``/``-separated segment of ``url``. The
    body is written in 1 KiB chunks while a progress bar is updated.

    Args:
        url: Address of the file to download.

    Returns:
        The name of the file that was written.

    Raises:
        ValueError: If ``url`` ends with ``/`` so no file name can be derived.
        requests.HTTPError: If the server answers with an error status.
    """
    filename = url.split("/")[-1]
    if not filename:
        # Without this check open('') would fail later with a confusing error.
        raise ValueError(f"Cannot derive a file name from URL '{url}'.")

    block_size = 2**10 # 1 Kibibyte

    # The context manager releases the connection even if writing fails.
    with requests.get(
            url,
            allow_redirects=True,
            stream=True,
            timeout=60) as response:
        # Fail early instead of saving an HTML error page under the archive name.
        response.raise_for_status()
        # Servers using chunked transfer send no content-length; None makes
        # tqdm show a plain counter instead of a 0-byte total.
        total_size_in_bytes = int(response.headers.get('content-length', 0)) or None

        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
            with open(filename, 'wb') as f:
                for data in response.iter_content(block_size):
                    f.write(data)
                    progress_bar.update(len(data))

    return filename

In [None]:
import tarfile
import zipfile

def unpack_download(filename: str) -> None:
    """Extract a ``.tar.gz``/``.tgz`` or ``.zip`` archive into the working directory.

    The archive type is chosen from the file name's suffix (case-insensitive).
    Members are extracted one by one so that a progress bar can be shown.

    Args:
        filename: Path of the archive to unpack.

    Raises:
        ValueError: If ``filename`` does not end in a supported extension.
    """
    # NOTE(review): members are extracted as-is; only use with trusted archives.
    lowered = filename.lower()
    # Match on the suffix: a substring test would treat e.g. 'notes.zip.txt'
    # as a zip archive.
    if lowered.endswith((".tar.gz", ".tgz")):
        with tarfile.open(filename, 'r:gz') as tar_ref:
            members = tar_ref.getmembers()
            for member in tqdm(iterable=members, total=len(members)):
                tar_ref.extract(member=member)
    elif lowered.endswith(".zip"):
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            names = zip_ref.namelist()
            for name in tqdm(iterable=names, total=len(names)):
                zip_ref.extract(member=name)
    else:
        raise ValueError(f"Unknown file extension '{filename}'.")

# Download

In [None]:
# Download a zip snapshot of the XQuAD repository's master branch and unpack
# it into the current working directory.
xquad_download_link = "https://github.com/deepmind/xquad/archive/refs/heads/master.zip" # From https://github.com/deepmind/xquad
print("Downloading", xquad_download_link)
filename = download_from_url(xquad_download_link)
print("Unpacking", filename)
unpack_download(filename)

Downloading https://github.com/deepmind/xquad/archive/refs/heads/master.zip


0.00iB [00:00, ?iB/s]

Unpacking master.zip


  0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# Download the METEOR 1.5 distribution and unpack it into the current working
# directory; calc_meteor expects the jar under 'meteor-1.5/'.
meteor = 'http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz'
print("Downloading", meteor)
filename = download_from_url(meteor)
# Report the local archive name (not the URL), like the XQuAD cell does.
print("Unpacking", filename)
unpack_download(filename)

Downloading http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz


  0%|          | 0.00/224M [00:00<?, ?iB/s]

Unpacking http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz


  0%|          | 0/161 [00:00<?, ?it/s]

# Data transformation

In [None]:
from pathlib import Path

# Directory holding the unpacked XQuAD files (presumably created by unpacking
# master.zip above — confirm the archive's top-level folder name).
xquad_path = Path("xquad-master")
# German split of XQuAD; this is the evaluation data used in this notebook.
xquad = xquad_path / "xquad.de.json"

In [None]:
import json

def generate_examples(filepath):
    """Yield one record per paragraph of a SQuAD-format JSON file.

    Args:
        filepath: Path to a UTF-8 JSON file with the layout
            ``{"data": [{"paragraphs": [{"context": ..., "qas": [{"question": ...}]}]}]}``.

    Yields:
        dict: ``{"input_text": <context>, "target_text": <list of questions>}``.
        The context is the text after the first ``"==\\n"`` marker (the whole
        text if there is none), with newlines replaced by spaces and doubled
        single quotes collapsed to one. Questions are whitespace-stripped.
    """
    # Parse everything up front so the file handle is closed before the first
    # record is yielded; otherwise it stays open while the generator is paused.
    with open(filepath, encoding="utf-8") as f:
        squad = json.load(f)

    for article in squad["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"].split("==\n", 1)[-1].replace("\n", " ").replace("''", "'")  # do not strip leading blank spaces GH-2585
            questions = [qa["question"].strip() for qa in paragraph["qas"]]
            yield {
                "input_text": context,
                "target_text": questions,
            }

In [None]:
import pandas as pd

# Reset the dataset-related names so a re-run of this cell starts clean.
# NOTE(review): df_train and df_test are not used anywhere else in the visible notebook.
dataset = df_train = df_test =  None
# One row per paragraph: 'input_text' is the context, 'target_text' the list of gold questions.
df_eval = pd.DataFrame(list(generate_examples(str(xquad.resolve()))))

In [None]:
from datasets import DatasetDict, Dataset

# Wrap the evaluation frame in a DatasetDict with a single "eval" split.
dataset = DatasetDict({
  "eval": Dataset.from_pandas(df_eval),
})

# Colab-only: render the evaluation frame as an interactive, paginated table.
from google.colab import data_table
data_table.DataTable(df_eval, include_index=False, num_rows_per_page=5)

Unnamed: 0,input_text,target_text
0,Die Verteidigung der Panthers gab nur 308 Punk...,[Wie viele Punkte gab die Verteidigung der Pan...
1,Die Broncos besiegten die Pittsburgh Steelers ...,[Wer verlor in der Divisional Round gegen die ...
2,"Peyton Manning wurde zum ersten Quarterback, d...","[Wie alt war Peyton Manning, als er im Super B..."
3,Die sechs-malige Grammy-Gewinnerin und Oscar-N...,"[Wie viele Grammys gewann Lady Gaga?, Was hat ..."
4,Mit 4:51 verbleibenden Minuten in der reguläre...,[Auf welcher Yard Line begann Carolina mit 4:5...
...,...,...
235,Aristoteles lieferte eine philosophische Disku...,[Wer leitete eine philosophische Diskussion üb...
236,Die Entwicklung grundlegender Theorien für Krä...,[Wer hat die universelle Theorie der Gravitati...
237,Seitdem und bis heute gilt die allgemeine Rela...,[Welche Theorie erklärt die Schwerkraft am bes...
238,Durch die Kombination der Definition von elekt...,[Wie hoch ist die Zeitspanne der Änderung der ...


In [None]:
import os

# One reference line per paragraph: all gold questions joined by single spaces.
gold_reference = [" ".join(questions) for questions in df_eval["target_text"]]

print(gold_reference[0])
# Mode 'w' overwrites an existing file, so FileExistsError can never be raised
# here. The encoding is explicit because the questions contain German umlauts.
with open("gold_reference.txt", 'w', encoding="utf-8") as f:
    f.write("\n".join(gold_reference))

Wie viele Punkte gab die Verteidigung der Panthers ab? Wie viele Sacks erzielte Jared Allen in seiner Karriere? Wie viele Tackles wurden bei Luke Kuechly registriert? Wie viele Bälle fing Josh Norman ab? Wer hatte in dieser Saison die meisten Sacks im Team? Wie viele Interceptions wurden der Verteidigung der Panthers im Jahr 2015 angerechnet? Wer führte die Panther bei den Sacks an? Wie viele Verteidigungsspieler der Panthers wurden für den Pro Bowl ausgewählt? Wie viele erzwungene Fumbles hatte Thomas Davis? Welcher Spieler hatte die meisten Interceptions der Saison? Wie viele Interceptions erzielte die Verteidigung der Panthers in der Saison 2015? Wer erzielte als Carolina Panthers-Starter fünf Sacks in neun Spielen? Wer führte die Panthers 2015 bei den Tackles an? Mit wie vielen Interceptions erzielte Josh Norman im Jahr 2015 Touchdowns?


# Load models

In [None]:
import torch

# device (int, optional, defaults to -1) — Device ordinal for CPU/GPU supports. 
# Setting this to -1 will leverage CPU, a positive will run the model on the 
# associated CUDA device id.
# is_available must be *called*: the bare function object is always truthy,
# which would select GPU 0 even on a CPU-only runtime.
device = 0 if torch.cuda.is_available() else -1
print(device)

0


In [None]:
from transformers import pipeline, AutoTokenizer

de_en_model = "Helsinki-NLP/opus-mt-de-en"
# Pinned explicitly: this is the checkpoint the "translation_en_to_de" task
# fell back to when no model was given (see the logged warning), so naming it
# keeps results stable if the library default ever changes.
en_de_model = "t5-base"
qg_e2e_model = "valhalla/t5-base-e2e-qg"


# German -> English translation of the contexts.
de_en_pipeline = pipeline("text2text-generation", model=de_en_model, tokenizer=de_en_model, device=device)
# https://github.com/huggingface/transformers/blob/9e71d4645526911f2ea9743aa4cf8e9d479fc840/src/transformers/pipelines/__init__.py#L214
# English -> German translation of the generated questions.
en_de_pipeline = pipeline("translation_en_to_de", model=en_de_model, tokenizer=en_de_model, device=device)
# English end-to-end question generation.
en_qg_pipeline = pipeline("text2text-generation", model=qg_e2e_model, tokenizer=qg_e2e_model, device=device)

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/284M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/750k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

No model was supplied, defaulted to t5-base (https://huggingface.co/t5-base)


Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/195 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

# Question generation

In [None]:
# Plain Python list of the German contexts; this is the input to the translation pipeline.
context_list = list(df_eval["input_text"])
# Display the column as a quick sanity check.
df_eval["input_text"]

0      Die Verteidigung der Panthers gab nur 308 Punk...
1      Die Broncos besiegten die Pittsburgh Steelers ...
2      Peyton Manning wurde zum ersten Quarterback, d...
3      Die sechs-malige Grammy-Gewinnerin und Oscar-N...
4      Mit 4:51 verbleibenden Minuten in der reguläre...
                             ...                        
235    Aristoteles lieferte eine philosophische Disku...
236    Die Entwicklung grundlegender Theorien für Krä...
237    Seitdem und bis heute gilt die allgemeine Rela...
238    Durch die Kombination der Definition von elekt...
239    Wo ist die relevante Querschnittsfläche für da...
Name: input_text, Length: 240, dtype: object

de -> en

In [None]:
# Translate every German context to English. truncation=True and max_length=512
# are forwarded to the pipeline call — presumably to cap over-long inputs (TODO confirm).
texts_en = de_en_pipeline(context_list, truncation=True, max_length=512)
# Keep only the generated string of each pipeline result.
texts_en = [text['generated_text'] for text in texts_en]

Question generation

In [None]:
# Prompt format used for the end-to-end question generation model.
texts_en_formatted = [f"generate question: {text} </s>" for text in texts_en]
generated_questions_en = en_qg_pipeline(texts_en_formatted,
                                        batch_size=6,
                                        max_length=128,
                                        num_beams=4,
                                        length_penalty=1.5,
                                        no_repeat_ngram_size=3,
                                        early_stopping=True)
generated_questions_en = [text['generated_text'] for text in generated_questions_en] # extract questions
generated_questions_en = [text.split("<sep>") for text in generated_questions_en] # split questions into lists
# Drop fragments that are empty *after* stripping: testing the raw fragment
# would let whitespace-only pieces through as empty questions.
context_questions = [
    [question.strip() for question in question_list if question.strip()]
    for question_list in generated_questions_en
]

Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors


en -> de

In [None]:
# Translate the generated English questions back to German, one context at a
# time, so the questions stay grouped per context.
# NOTE(review): a context with no generated questions passes an empty list to
# the pipeline — confirm that this is handled.
questions_de_list = []
for question_list in context_questions:
    questions_de_list.append([text['translation_text'] for text in en_de_pipeline(question_list)])



In [None]:
# Shallow copy of the per-context German question lists.
predictions = [question_list for question_list in questions_de_list]

In [None]:
# One hypothesis line per context: its questions joined by single spaces
# (same layout as the gold reference lines).
p = [" ".join(pred) for pred in predictions]

In [None]:
with open("hypothesis.txt", 'w') as f:
    f.write("\n".join(p))

# Auxiliary functions

In [None]:
from sacrebleu.metrics import BLEU
from typing import List, Dict, Any
import pandas as pd

def calc_bleu(hyp: List[str], refs: List[str]) -> Dict[str, Any]:
    """Score hypotheses against a single reference set with corpus-level BLEU.

    Scoring is done with sacrebleu's ``BLEU(lowercase=True)``.

    Args:
        hyp: Hypothesis strings, one per segment.
        refs: Reference strings, aligned with ``hyp``.

    Returns:
        Dict with the overall score, the 1- to 4-gram precisions, brevity
        penalty, hypothesis/reference lengths and their ratio, plus the raw
        ``counts``, ``totals`` and ``precisions`` reported by sacrebleu.

    Raises:
        AssertionError: If ``hyp`` and ``refs`` differ in length.
    """
    assert len(hyp) == len(refs)

    # corpus_score expects a list of reference streams; there is only one here.
    score = BLEU(lowercase=True).corpus_score(hyp, [refs])

    summary = {"BLEU_score": score.score}
    for order in range(1, 5):
        summary[f"BLEU-{order}"] = score.precisions[order - 1]
    summary["BLEU-brevity-penalty"] = score.bp
    summary["BLEU-hyp-ref-ratio"] = score.sys_len / score.ref_len
    summary["BLEU-hyp-len"] = score.sys_len
    summary["BLEU-ref-len"] = score.ref_len
    summary["counts"] = score.counts
    summary["totals"] = score.totals
    summary["precisions"] = score.precisions
    return summary

In [None]:
import datasets
from rouge_score import tokenize
import pandas as pd

def calc_rouge(hyp: List[str], refs: List[str]) -> Dict[str, Any]:
    """Compute aggregated ROUGE-1, ROUGE-2 and ROUGE-L for German text.

    Umlauts and sharp s are transliterated to ASCII before scoring, and no
    stemming is applied. The returned values are the ``mid`` estimates of the
    aggregated scores.

    Args:
        hyp: Hypothesis strings, one per segment.
        refs: Reference strings, aligned with ``hyp``.

    Returns:
        Dict with precision, recall and F-measure for each ROUGE variant,
        keyed ``'ROUGE-<1|2|L>-<precision|recall|fmeasure>'``.

    Raises:
        AssertionError: If ``hyp`` and ``refs`` differ in length.
    """
    assert len(hyp) == len(refs)

    # Upper-case umlauts are included as well: without them a word such as
    # 'Änderung' keeps a non-ASCII letter. NOTE(review): this matters because
    # the ROUGE tokenizer is understood to drop non-ASCII characters — confirm
    # against the rouge_score tokenizer.
    umlaut_table = str.maketrans({
        "ä": "ae", "ö": "oe", "ü": "ue",
        "Ä": "Ae", "Ö": "Oe", "Ü": "Ue",
        "ß": "ss", "ẞ": "SS",
    })

    hyp_no_umlaute = [h.translate(umlaut_table) for h in hyp]
    refs_no_umlaute = [r.translate(umlaut_table) for r in refs]

    metric = datasets.load_metric('rouge')
    metric.add_batch(predictions=hyp_no_umlaute, references=refs_no_umlaute)

    result = metric.compute(
        rouge_types=['rouge1', 'rouge2', 'rougeL'],
        use_aggregator=True,
        use_stemmer=False) # Porter-Stemmer (only english)

    return {
        f'ROUGE-{label}-{stat}': getattr(result[rouge_type].mid, stat)
        for rouge_type, label in (('rouge1', '1'), ('rouge2', '2'), ('rougeL', 'L'))
        for stat in ('precision', 'recall', 'fmeasure')
    }

In [None]:
import subprocess

def calc_meteor(hyp_file_path: str, ref_file_path: str, langcode: str = 'de'):
    """Run the METEOR 1.5 jar on two text files and parse its summary.

    Every output line of the form ``<label>: <value>`` (exactly one colon) is
    collected; the six summary statistics are then returned.

    Args:
        hyp_file_path: Path of the hypothesis file.
        ref_file_path: Path of the reference file.
        langcode: Language code passed to METEOR via ``-l``.

    Returns:
        Dict with precision, recall, f1, fMean, fragmentation penalty and
        final score. Values are the strings printed by METEOR.

    Raises:
        KeyError: If an expected statistic is missing from the output, e.g.
            because the Java process failed. The message carries the exit
            status and stderr so the cause is visible.
    """
    METEOR_JAR = 'meteor-1.5/meteor-1.5.jar'
    meteor_cmd = [
        "java",
        "-jar",
        "-Xmx2G",
        METEOR_JAR,
        hyp_file_path,
        ref_file_path,
        '-l',
        langcode
    ]
    process = subprocess.run(meteor_cmd, capture_output=True)
    output = process.stdout.decode("utf-8")

    output_dict = {}
    for line in output.split("\n"):
        parts = line.split(":")
        # Lines with no colon or more than one are not simple label/value pairs.
        if len(parts) == 2:
            output_dict[parts[0].strip()] = parts[1].strip()

    labels = {
        'METEOR-precision': 'Precision',
        'METEOR-recall': 'Recall',
        'METEOR-f1': 'f1',
        'METEOR-fMean': 'fMean',
        'METEOR-fragmentation-penalty': 'Fragmentation penalty',
        'METEOR-score': 'Final score',
    }
    missing = [label for label in labels.values() if label not in output_dict]
    if missing:
        # Still a KeyError (as before), but now with enough context to debug.
        stderr_text = process.stderr.decode("utf-8", errors="replace").strip()
        raise KeyError(
            f"METEOR output is missing {missing} "
            f"(exit status {process.returncode}); stderr: {stderr_text}")

    return {key: output_dict[label] for key, label in labels.items()}

In [None]:
from bert_score import BERTScorer
import numpy as np

# Build the scorer once at module level so calc_bert_score can reuse it.
scorer = BERTScorer(lang="de", rescale_with_baseline=True)

def calc_bert_score(hyp: List[str], refs: List[str]):
    """Return the mean BERTScore F1 of ``hyp`` against ``refs``.

    Uses the module-level ``scorer``; precision and recall are discarded.

    Args:
        hyp: Hypothesis strings.
        refs: Reference strings, aligned with ``hyp``.

    Returns:
        Dict with the single key ``"bertscore"`` holding the mean F1.
    """
    _, _, f1_per_pair = scorer.score(hyp, refs)
    mean_f1 = np.mean(f1_per_pair.tolist())
    return {"bertscore": mean_f1}

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Automatic evaluation

In [None]:
def read_from_file(filename: str) -> List[str]:
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        One string per line, in file order. Blank lines become empty strings.
    """
    # Explicit encoding: the files hold German text and must not depend on
    # the platform's default encoding.
    with open(filename, encoding="utf-8") as file:
        return [line.strip() for line in file]

In [None]:
# Load hypotheses and gold references back from disk, one entry per line.
# NOTE(review): the evaluation cell further down reads both files again.
questions_hypothesis = read_from_file("hypothesis.txt")
questions_gold_ref = read_from_file("gold_reference.txt")

In [None]:
# Empty results table: one row per evaluated system, one column per metric.
results_df = pd.DataFrame(
    columns=['model_path', 
             'BLEU_score',
             'BLEU-1',
             'BLEU-2',
             'BLEU-3',
             'BLEU-4',
             'BLEU-brevity-penalty',
             'BLEU-hyp-ref-ratio',
             'BLEU-hyp-len',
             'BLEU-ref-len',
             'ROUGE-1-precision',
             'ROUGE-1-recall',
             'ROUGE-1-fmeasure',
             'ROUGE-2-precision',
             'ROUGE-2-recall',
             'ROUGE-2-fmeasure',
             'ROUGE-L-precision',
             'ROUGE-L-recall',
             'ROUGE-L-fmeasure',
             'METEOR-precision',
             'METEOR-recall',
             'METEOR-f1',
             'METEOR-fMean',
             'METEOR-fragmentation-penalty',
             'METEOR-score',
             # Declared here so the schema matches the rows that get added.
             'BERT-score'])

In [None]:
# Load hypotheses and references (one entry per line) and score them with
# every metric.
questions_hypothesis = read_from_file("hypothesis.txt")
questions_gold_ref = read_from_file("gold_reference.txt")
bleu_result = calc_bleu(questions_hypothesis, questions_gold_ref)
rouge_result = calc_rouge(questions_hypothesis, questions_gold_ref)
# METEOR is an external Java tool, so it gets the file paths, not the strings.
meteor_result = calc_meteor("hypothesis.txt", "gold_reference.txt")
bert_score_result = calc_bert_score(questions_hypothesis, questions_gold_ref)

# calc_bleu also returns raw counts/totals/precisions; only these go into the table.
bleu_columns = [
    'BLEU_score',
    'BLEU-1',
    'BLEU-2',
    'BLEU-3',
    'BLEU-4',
    'BLEU-brevity-penalty',
    'BLEU-hyp-ref-ratio',
    'BLEU-hyp-len',
    'BLEU-ref-len',
]
results_dict = {
    'model_path': "translation pipeline", 
    **{column: bleu_result[column] for column in bleu_columns},
    **rouge_result,
    **meteor_result,
    'BERT-score': bert_score_result['bertscore']
}
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
results_df = pd.concat([results_df, pd.DataFrame([results_dict])], ignore_index=True)

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# Colab-only: show the results as an interactive table; max_columns=26 is
# passed so that all metric columns can be displayed.
from google.colab import data_table
data_table.DataTable(results_df, include_index=False, max_columns=26)

Unnamed: 0,model_path,BLEU_score,BLEU-1,BLEU-2,BLEU-3,BLEU-4,BLEU-brevity-penalty,BLEU-hyp-ref-ratio,BLEU-hyp-len,BLEU-ref-len,...,ROUGE-L-precision,ROUGE-L-recall,ROUGE-L-fmeasure,METEOR-precision,METEOR-recall,METEOR-f1,METEOR-fMean,METEOR-fragmentation-penalty,METEOR-score,BERT-score
0,translation pipeline,7.50733,41.167633,14.83807,7.060421,3.779949,0.66439,0.709781,9318,13128,...,0.270554,0.2,0.218605,0.3233193633293043,0.2275531686700086,0.2671120498636325,0.2309738557270714,0.3733891064435742,0.1447307341253132,0.348075


In [None]:
import time
# NOTE(review): no cell in this notebook mounts Google Drive; without a mount
# this is just a local folder named 'drive/MyDrive/evaluation'.
generated_output_path = Path("drive") / "MyDrive" / "evaluation"
# Create the target folder first; the writers below do not create directories.
generated_output_path.mkdir(parents=True, exist_ok=True)
# Timestamped file stem so repeated runs do not overwrite earlier exports.
timestr = time.strftime("%d%m%Y-%H%M%S")
results_stem = generated_output_path / f"{timestr}_results"
results_df.to_excel(f"{results_stem}.xlsx")
results_df.to_csv(f"{results_stem}.csv")