In [1]:
import evaluate
from datasets import *
import numpy as np
import pathlib as pl
import pandas as pd
import torch
import ast
import os
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, TransformersReader
from haystack.pipelines import Pipeline

# Report year: interpolated into the generated-data CSV path and the DPR model
# directory, and used to filter which fine-tuned reader models are evaluated.
YEAR = 2022
# When True the retriever is loaded from the saved DPR directory for YEAR;
# when False the pretrained facebook/dpr-* encoders are used instead.
DPR_FINE_TUNE = False

# Directory whose entries are listed and loaded as reader models.
MODEL_PATH = "../../data/models/BERT"

In [2]:
# Load the generated SQuAD-format question/answer pairs for YEAR and hold out
# 30% as the test split (fixed seed, so the split is the same on every run).
generated_data = load_dataset('csv', data_files=f"../../data/clean/sustainability-report-{YEAR}-squad-format.csv",
                    delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=42)

# "answers" is stored in the CSV as a stringified dict; parsing it exposes its
# "text" and "answer_start" entries as temporary top-level columns.
generated_data["test"] = generated_data["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
# Rebuild "answers" as a real nested dict from those temporary columns.
generated_data["test"] = generated_data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                  "text": example["text"], "answer_start": example["answer_start"]}})
# Flatten newlines in the question, the context and the first answer text
# (only the first answer text is kept).
generated_data["test"] = generated_data["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
# remove_columns returns a new dataset instead of modifying in place, so the
# result must be assigned back; otherwise the temporary columns stay around.
generated_data["test"] = generated_data["test"].remove_columns(["text", "answer_start"])
generated_data["test"]

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-a8af1b4c8d81fb1c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-7d8841dd72615495.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-8b1142ae44904f4e.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-36096c12f970e2cd.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-a8af1b4c8d81fb1c\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-b621ad6a176502d0.arrow
Loading cached process

Dataset({
    features: ['index', 'question', 'context', 'answers', 'id'],
    num_rows: 107
})

In [3]:
# Load the hand-written SQuAD-format question/answer pairs and hold out 30% as
# the test split (fixed seed, so the split is the same on every run).
# NOTE(review): the year in this file name is hard-coded, unlike the
# generated-data path which interpolates YEAR — confirm this is intended.
written_data = load_dataset('csv', data_files="../../data/clean/sr-2022-questions-answers-ALL-squad-format.csv",
                        delimiter=";", split='train').train_test_split(test_size=0.3, shuffle=True, seed=42)
# "answers" is stored in the CSV as a stringified dict; parsing it exposes its
# "text" and "answer_start" entries as temporary top-level columns.
written_data["test"] = written_data["test"].map(
    lambda example: ast.literal_eval(example["answers"]))

# Rebuild "answers" as a real nested dict from those temporary columns.
written_data["test"] = written_data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                  "text": example["text"], "answer_start": example["answer_start"]}})
# Flatten newlines in the question, the context and the first answer text
# (only the first answer text is kept).
written_data["test"] = written_data["test"].map(lambda example: {"question": example["question"].replace("\n", " "), "context": example["context"].replace("\n", " "), "answers": {
                                "text": [example["answers"]["text"][0].replace("\n", " ")], "answer_start": example["answers"]["answer_start"]}})
# remove_columns returns a new dataset instead of modifying in place, so the
# result must be assigned back; otherwise the temporary columns stay around.
written_data["test"] = written_data["test"].remove_columns(["text", "answer_start"])
written_data["test"]

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-54c2d14257a19ede/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-54c2d14257a19ede\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-3ff29462153ffd09.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-54c2d14257a19ede\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-8202b31bf7f0cc8f.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-54c2d14257a19ede\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-68763aea809abd97.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-54c2d14257a19ede\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-50d87829045139cf.arrow
Loading cached process

Dataset({
    features: ['question', 'context', 'answers'],
    num_rows: 21
})

In [4]:
# Load the three evaluation metrics, in the same order as before:
# BERTScore, BLEU and the SQuAD v2 exact-match / F1 metric.
bertscore, bleu, squad_v2 = (
    evaluate.load(metric_name)
    for metric_name in ("bertscore", "bleu", "squad_v2")
)

In [5]:
# Restore the FAISS document store from its saved index and config files
# (both paths are relative to the notebook's working directory).
document_store = FAISSDocumentStore.load(
    index_path="document_store.faiss",
    config_path="document_store.json",
)

In [6]:
# Build the retriever over the document store: either the pretrained
# facebook/dpr-* encoders or the DPR model saved for YEAR.
if not DPR_FINE_TUNE:
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
    )
else:
    retriever = DensePassageRetriever.load(
        load_dir=f"../../data/models/DPR/{YEAR}",
        document_store=document_store,
        use_gpu=True,
    )

  return self.fget.__get__(instance, owner)()
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the prediction function
def inference_answer(model, question, top_k=3):
    """Return the top-ranked answer text a pipeline produces for a question.

    Args:
        model: Object exposing ``run(query=..., params=...)`` that returns a
            mapping with an ``"answers"`` list whose items carry an
            ``answer`` attribute.
        question: Question text passed to the pipeline as the query.
        top_k: Number of candidates requested from the node named "Model".
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        The ``answer`` of the first returned candidate, or ``""`` when the
        pipeline returns no candidates or the candidate has no answer text.
        The empty-string fallback keeps downstream lists of predictions made
        of strings only.
    """
    with torch.no_grad():
        outputs = model.run(query=question, params={"Model": {"top_k": top_k}})

    candidates = outputs.get("answers") or []
    if not candidates:
        # Nothing came back: report an empty prediction instead of raising
        # IndexError and aborting the whole evaluation run.
        return ""

    best_answer = candidates[0].answer
    return best_answer if best_answer is not None else ""

In [8]:
# os.listdir returns entries in arbitrary order; sort them so models are
# evaluated (and rows appear in the results table) in a reproducible order.
models = sorted(os.listdir(MODEL_PATH))

In [9]:
# Column-oriented accumulator: one empty list per column of the results
# table; the evaluation loop appends one value per column for each model.
results = {
    column: []
    for column in (
        "Model",
        "Data",
        "Bert.Precision",
        "Bert.Recall",
        "Bert.F1",
        "BLEU",
        "Squad.Exact",
        "Squad.F1",
    )
}

In [10]:
# Read each dataset column once up front. Indexing the column inside the
# comprehension re-reads it for every example, which is needlessly quadratic.
generated_answers = generated_data["test"]["answers"]
generated_ids = generated_data["test"]["id"]

# Ground-truth answer strings (first answer only) for BERTScore / BLEU.
generated_answer_true = [answer["text"][0] for answer in generated_answers]
# References in the shape the squad_v2 metric expects, keyed by the dataset's
# own "id" column.
generated_squad_references = [
    {
        "answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]},
        "id": str(example_id),
    }
    for example_id, answer in zip(generated_ids, generated_answers)
]

written_answers = written_data["test"]["answers"]

written_answer_true = [answer["text"][0] for answer in written_answers]
# The hand-written references use the row position as their id.
written_squad_references = [
    {
        "answers": {"answer_start": [answer["answer_start"][0]], "text": [answer["text"][0]]},
        "id": str(example_id),
    }
    for example_id, answer in enumerate(written_answers)
]

In [11]:
generator = None
pipe = None

# Read the columns once rather than once per example inside the loop.
generated_questions = generated_data["test"]["question"]
generated_ids = generated_data["test"]["id"]
written_questions = written_data["test"]["question"]

for model in models:
    # Model directory names are expected to end in "...-<year>-<data>".
    model_data = model.split("-")
    try:
        year = int(model_data[-2])
    except (IndexError, ValueError):
        # Stray entries in MODEL_PATH (e.g. hidden files) would otherwise
        # abort the whole evaluation run.
        print(f"Skipping {model}: name does not end in '-<year>-<data>'")
        continue

    if year != YEAR:
        continue

    print(f"Evaluating model {model}...")
    model_name = "-".join(model_data[:2])
    data_name = model_data[-1]

    # Retriever -> reader pipeline; the reader node is named "Model", which
    # is the node name inference_answer passes its parameters to.
    reader = TransformersReader(model_name_or_path=f"{MODEL_PATH}/{model}", use_gpu=True)
    pipe = Pipeline()
    pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
    pipe.add_node(component=reader, name="Model", inputs=["Retriever"])

    # Models trained on hand-written data are scored on the hand-written test
    # split; every other model is scored on the generated test split.
    if data_name != "handwritten":
        questions = generated_questions
        example_ids = generated_ids
        answer_true = generated_answer_true
        squad_references = generated_squad_references
    else:
        questions = written_questions
        # Hand-written references are keyed by row position.
        example_ids = range(len(written_questions))
        answer_true = written_answer_true
        squad_references = written_squad_references

    # Run inference exactly once per model; the previous version ran an extra
    # pass over the generated split whose results were always overwritten.
    answer_pred = [inference_answer(pipe, question) for question in questions]
    squad_predictions = [{"id": str(example_id), "prediction_text": answer, "no_answer_probability": 0.}
                         for example_id, answer in zip(example_ids, answer_pred)]

    results_1 = bertscore.compute(predictions=answer_pred, references=answer_true, lang="en")
    results_2 = bleu.compute(predictions=answer_pred, references=answer_true)
    results_3 = squad_v2.compute(predictions=squad_predictions, references=squad_references)

    # Append every column only after all metrics succeeded, so a failure
    # mid-evaluation cannot leave the columns with different lengths.
    results["Model"].append(model_name)
    results["Data"].append(data_name)
    results["Bert.Precision"].append(np.array(results_1["precision"]).mean())
    results["Bert.Recall"].append(np.array(results_1["recall"]).mean())
    results["Bert.F1"].append(np.array(results_1["f1"]).mean())
    results["BLEU"].append(results_2["bleu"])
    results["Squad.Exact"].append(results_3["exact"])
    results["Squad.F1"].append(results_3["f1"])

    # Drop the references to this model before the next one is loaded and
    # release cached GPU memory.
    del reader
    del pipe
    torch.cuda.empty_cache()

Evaluating model distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-full...




Evaluating model distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-handwritten...
Evaluating model distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-smaller...
Evaluating model roberta-base-squad2-finetuned-NLB-QA-2022-full...
Evaluating model roberta-base-squad2-finetuned-NLB-QA-2022-handwritten...
Evaluating model roberta-base-squad2-finetuned-NLB-QA-2022-smaller...


In [12]:
# Turn the column-oriented accumulator into a table (one row per model).
results_df = pd.DataFrame(data=results)

## Base DPR

In [13]:
# Show the metrics table as this cell's output.
results_df

Unnamed: 0,Model,Data,Bert.Precision,Bert.Recall,Bert.F1,BLEU,Squad.Exact,Squad.F1
0,distilbert-base,full,0.916814,0.91938,0.917961,0.152191,33.64486,37.349696
1,distilbert-base,handwritten,0.869796,0.869352,0.869316,0.0,4.761905,11.579288
2,distilbert-base,smaller,0.91901,0.91795,0.918348,0.160782,32.71028,36.56562
3,roberta-base,full,0.9258,0.923658,0.92451,0.219565,37.383178,41.509487
4,roberta-base,handwritten,0.881454,0.873261,0.876277,0.046133,4.761905,21.530064
5,roberta-base,smaller,0.925033,0.9229,0.923744,0.207498,37.383178,41.447856


## DPR Fine-tuned

![image.png](attachment:image.png)