In [1]:
import evaluate
from datasets import *
import numpy as np
import pathlib as pl
import pandas as pd
import torch
import ast
import os
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, TransformersReader
from haystack.pipelines import Pipeline

# --- Notebook configuration -------------------------------------------------
# Directory that is listed to discover the reader checkpoints to evaluate.
MODEL_PATH = "../../data/models/BERT"

# True  -> load the DPR retriever saved under ../../data/models/DPR/<YEAR>
# False -> use the stock facebook/dpr-* encoders instead.
DPR_FINE_TUNE = True

# Report year: selects the CSV file, the DPR directory and which checkpoints
# in MODEL_PATH are evaluated.
YEAR = 2042

In [2]:
# Load the generated QA pairs for YEAR and split them 70/30 with a fixed seed
# so the held-out test split is the same on every run.
generated_data = load_dataset(
    "csv",
    data_files=f"../../data/clean/sustainability-report-{YEAR}-squad-format.csv",
    delimiter=";",
    split="train",
).train_test_split(test_size=0.3, shuffle=True, seed=42)


def _to_squad_example(example):
    """Convert one CSV row into a SQuAD-style example.

    The "answers" cell is stored as the string form of a dict with "text" and
    "answer_start" entries; it is parsed with ``ast.literal_eval``. Newlines in
    the question, the context and the first answer text are replaced by spaces.
    Only the first answer text is kept, while "answer_start" is passed through
    unchanged.

    Doing everything in a single pass avoids creating temporary "text" and
    "answer_start" columns. Previously they were only dropped on a discarded
    copy (the result of ``remove_columns`` was never assigned), so they stayed
    in the stored test split.
    """
    parsed_answers = ast.literal_eval(example["answers"])
    return {
        "question": example["question"].replace("\n", " "),
        "context": example["context"].replace("\n", " "),
        "answers": {
            "text": [parsed_answers["text"][0].replace("\n", " ")],
            "answer_start": parsed_answers["answer_start"],
        },
    }


generated_data["test"] = generated_data["test"].map(_to_squad_example)

# Display the prepared test split.
generated_data["test"]

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-dc46deea403e6d7a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-8c4226c3883b9f9f.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-24d9770f6403af3d.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-47e77242f5a0d387.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-dc46deea403e6d7a\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-cb788cd2e517f2b5.arrow
Loading cached process

Dataset({
    features: ['question', 'context', 'answers', 'id'],
    num_rows: 162
})

In [3]:
# Load the three evaluation metrics (in this order) used to score predictions.
bertscore, bleu, squad_v2 = [
    evaluate.load(metric_name) for metric_name in ("bertscore", "bleu", "squad_v2")
]

In [4]:
# Restore the FAISS document store from the index/config files that sit next
# to this notebook.
document_store = FAISSDocumentStore.load(
    index_path="document_store.faiss",
    config_path="document_store.json",
)

In [5]:
# Choose the retriever: stock DPR encoders, or the checkpoint saved for YEAR.
if not DPR_FINE_TUNE:
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=True,
    )
else:
    retriever = DensePassageRetriever.load(
        load_dir=f"../../data/models/DPR/{YEAR}",
        document_store=document_store,
        use_gpu=True,
    )

  return self.fget.__get__(instance, owner)()


In [6]:
# Pick the torch device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the prediction function
def inference_answer(model, question, top_k=3, reader_node="Model"):
    """Ask a QA pipeline one question and return the text of its top answer.

    Args:
        model: Pipeline-like object exposing ``run(query=..., params=...)``
            whose result is a mapping with an ``"answers"`` sequence; each
            item is expected to carry an ``answer`` attribute.
        question: The question string sent as the pipeline query.
        top_k: Number of answers requested from the reader node (default 3,
            the value that used to be hard-coded).
        reader_node: Name of the pipeline node that receives ``top_k``
            (default ``"Model"``, the value that used to be hard-coded).

    Returns:
        The ``answer`` attribute of the first returned answer, or an empty
        string when the pipeline returns no answers (or a ``None`` answer), so
        that downstream text metrics always receive a string.

    Raises:
        KeyError: If the pipeline output has no ``"answers"`` entry.
    """
    with torch.no_grad():
        outputs = model.run(query=question, params={reader_node: {"top_k": top_k}})

    answers = outputs["answers"]
    # Previously an empty answer list raised IndexError and aborted the whole
    # evaluation run; treat it as an empty prediction instead.
    if not answers:
        return ""

    best_answer = answers[0].answer
    return best_answer if best_answer is not None else ""

In [7]:
# os.listdir returns entries in arbitrary order; sort them so the evaluation
# order (and therefore the row order of the results table) is reproducible.
models = sorted(os.listdir(MODEL_PATH))

In [8]:
# Column-oriented accumulator: one independent empty list per metric column,
# filled with one value per evaluated model.
results = {
    column: []
    for column in (
        "Model",
        "Data",
        "Bert.Precision",
        "Bert.Recall",
        "Bert.F1",
        "BLEU",
        "Squad.Exact",
        "Squad.F1",
    )
}

In [9]:
# Read the answers column once. Indexing generated_data["test"]["answers"]
# inside the comprehension re-read the whole column on every iteration.
test_answers = generated_data["test"]["answers"]

# Gold answer strings (first answer of every test example) for BERTScore/BLEU.
generated_answer_true = [answer["text"][0] for answer in test_answers]

# References in the layout passed to the squad_v2 metric: one dict per example
# holding its id (as a string) and the first gold answer with its start offset.
squad_references = [
    {
        "answers": {
            "answer_start": [answer["answer_start"][0]],
            "text": [answer["text"][0]],
        },
        "id": str(example_id),  # `example_id` avoids shadowing the builtin `id`
    }
    for example_id, answer in zip(generated_data["test"]["id"], test_answers)
]

In [10]:
generator = None  # not used in this cell; kept in case other cells rely on it
pipe = None

# Read the columns once up front; indexing generated_data["test"][...] inside
# the per-question comprehension re-read the whole column on every iteration.
test_questions = generated_data["test"]["question"]
test_ids = generated_data["test"]["id"]

for model in models:
    # Directory names are split on "-": the second-to-last part is the year
    # and the last part is the training-data variant.
    model_data = model.split("-")
    try:
        year = int(model_data[-2])
    except (IndexError, ValueError):
        # Entry in MODEL_PATH that does not follow the naming scheme
        # (e.g. a stray file): skip it instead of aborting the whole run.
        continue

    if year != YEAR:
        continue

    print(f"Evaluating model {model}...")
    model_name = "-".join(model_data[:2])
    data_name = model_data[-1]

    # Retriever -> reader pipeline; the reader node is named "Model" because
    # inference_answer addresses its top_k parameter by that node name.
    reader = TransformersReader(model_name_or_path=f"{MODEL_PATH}/{model}", use_gpu=True)
    pipe = Pipeline()
    pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
    pipe.add_node(component=reader, name="Model", inputs=["Retriever"])

    answer_pred = [inference_answer(pipe, question) for question in test_questions]
    squad_predictions = [
        {"id": str(example_id), "prediction_text": answer, "no_answer_probability": 0.}
        for example_id, answer in zip(test_ids, answer_pred)
    ]

    bert_scores = bertscore.compute(predictions=answer_pred, references=generated_answer_true, lang="en")
    bleu_scores = bleu.compute(predictions=answer_pred, references=generated_answer_true)
    squad_scores = squad_v2.compute(predictions=squad_predictions, references=squad_references)

    # Append the full row only after every metric succeeded, so all columns in
    # `results` keep the same length even if a model fails part-way through.
    row = {
        "Model": model_name,
        "Data": data_name,
        "Bert.Precision": np.mean(bert_scores["precision"]),
        "Bert.Recall": np.mean(bert_scores["recall"]),
        "Bert.F1": np.mean(bert_scores["f1"]),
        "BLEU": bleu_scores["bleu"],
        "Squad.Exact": squad_scores["exact"],
        "Squad.F1": squad_scores["f1"],
    }
    for column, value in row.items():
        results[column].append(value)

    # Drop the references to this reader/pipeline and release cached GPU
    # memory before the next checkpoint is loaded.
    del reader
    del pipe
    torch.cuda.empty_cache()

Evaluating model distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2042-full_combined...




Evaluating model distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2042-smaller_combined...
Evaluating model roberta-base-squad2-finetuned-NLB-QA-2042-full_combined...
Evaluating model roberta-base-squad2-finetuned-NLB-QA-2042-smaller_combined...


In [11]:
# Turn the per-column metric lists into a table with one row per evaluated model.
results_df = pd.DataFrame(data=results)

## DPR Fine-tuned

In [12]:
# Bare last expression: renders the metrics table as this cell's output.
results_df

Unnamed: 0,Model,Data,Bert.Precision,Bert.Recall,Bert.F1,BLEU,Squad.Exact,Squad.F1
0,distilbert-base,full_combined,0.918825,0.916122,0.917312,0.15178,27.464789,31.702372
1,distilbert-base,smaller_combined,0.917515,0.915308,0.916265,0.111976,24.647887,29.102328
2,roberta-base,full_combined,0.931785,0.930326,0.930945,0.332324,36.619718,39.93349
3,roberta-base,smaller_combined,0.91991,0.921609,0.920613,0.247263,28.873239,31.289683


## Base DPR
![image-2.png](attachment:image-2.png)