In [None]:
# Import libraries
import evaluate
from datasets import *
import numpy as np
import pathlib as pl
import pandas as pd
import torch
import ast
import os
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, Seq2SeqGenerator
from haystack.nodes.answer_generator.transformers import _BartEli5Converter
from haystack.pipelines import Pipeline

# Directory that holds the generator checkpoints
MODEL_PATH = "../../models/T5"

# Report year used to pick the dataset file and the matching models
YEAR = 2042

In [None]:
# Load the generated question/answer pairs and hold out 30% as a seeded, shuffled test set
generated_data = load_dataset(
    "csv",
    data_files=f"../../data/clean/squad/sustainability-report-{YEAR}-squad-format.csv",
    delimiter=";",
    split="train",
).train_test_split(test_size=0.3, shuffle=True, seed=42)


def _to_squad_example(example):
    """Rebuild one CSV row of the test set as a SQuAD-style example.

    Reading from CSV yields the ``answers`` column as a string rather than an
    object, so it is parsed back into a dict with ``ast.literal_eval``. Every
    newline in the question, the context and the first answer text is replaced
    with a space; only the first answer text is kept.

    Args:
        example: One dataset row with ``question``, ``context`` and a
            string-encoded ``answers`` dict holding ``text`` and ``answer_start``.

    Returns:
        A dict with the cleaned ``question``, ``context`` and ``answers`` columns.
    """
    answers = ast.literal_eval(example["answers"])
    return {
        "question": example["question"].replace("\n", " "),
        "context": example["context"].replace("\n", " "),
        "answers": {
            "text": [answers["text"][0].replace("\n", " ")],
            "answer_start": answers["answer_start"],
        },
    }


# Reformat the test set in a single pass. Building the final columns directly avoids
# the temporary "text"/"answer_start" columns, which previously stayed in the dataset
# because the result of remove_columns() was never assigned back.
generated_data["test"] = generated_data["test"].map(_to_squad_example)

In [None]:
# Load the two evaluation metrics (BERTScore first, then BLEU)
bertscore, bleu = (evaluate.load(metric_name) for metric_name in ("bertscore", "bleu"))

In [None]:
# Restore the FAISS document store from its saved index and configuration files
document_store = FAISSDocumentStore.load(
    config_path="document_store.json",
    index_path="document_store.faiss",
)

In [None]:
# Build the baseline (non fine-tuned) retriever on top of the document store
retriever = DensePassageRetriever(
    use_gpu=True,
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    document_store=document_store,
)

In [None]:
# Pick CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the prediction function
def inference_answer(model, question, top_k=3):
    """Run a question through a query pipeline and return its first answer.

    Args:
        model: Pipeline-like object exposing ``run(query=..., params=...)`` whose
            result is a mapping with an ``"answers"`` list of objects that carry
            an ``answer`` attribute.
        question: Question text forwarded as the pipeline query.
        top_k: Value passed as ``top_k`` to the pipeline node named ``"Model"``.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        The ``answer`` attribute of the first returned answer, or an empty
        string when the pipeline returns no answers.
    """
    # Inference only: gradients are not needed
    with torch.no_grad():
        outputs = model.run(query=question, params={"Model": {"top_k": top_k}})

    answers = outputs["answers"]
    # An empty prediction keeps the predictions aligned one-to-one with the
    # references instead of aborting the whole evaluation with an IndexError.
    return answers[0].answer if answers else ""

In [None]:
# Get the model names. os.listdir returns entries in arbitrary order, so sort them
# to make the row order of the results table reproducible between runs.
models = sorted(os.listdir(MODEL_PATH))

In [None]:
# Empty results table: one independent list per output column
results = {
    column: []
    for column in (
        "Model",
        "DPR-ft",
        "Data",
        "Bert.Precision",
        "Bert.Recall",
        "Bert.F1",
        "BLEU",
    )
}

In [None]:
# Get ground truths: the first answer text of every test example.
# The "answers" column is read once and iterated; indexing
# generated_data["test"]["answers"] inside the comprehension would fetch the
# whole column again for every row.
generated_answer_true = [answers["text"][0] for answers in generated_data["test"]["answers"]]

In [None]:
# Evaluate every generator of the configured year with the non fine-tuned retriever
generator = None
pipe = None

# Read the question column once; indexing generated_data["test"]["question"] inside
# the comprehension below would fetch the whole column again for every question.
test_questions = generated_data["test"]["question"]

for model in models:
    model_data = model.split("-")

    # The year is the second-to-last "-"-separated field of the entry name.
    # Entries that do not follow that pattern (no "-", non-numeric field) are
    # skipped instead of aborting the whole run.
    try:
        year = int(model_data[-2])
    except (IndexError, ValueError):
        continue

    if year != YEAR:
        continue

    print(f"Evaluating model {model}...")
    model_name = "-".join(model_data[:2])
    data_name = model_data[-1]

    # Retriever -> generator query pipeline for this checkpoint
    generator = Seq2SeqGenerator(model_name_or_path=f"{MODEL_PATH}/{model}", input_converter=_BartEli5Converter(), use_gpu=True)
    pipe = Pipeline()
    pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
    pipe.add_node(component=generator, name="Model", inputs=["Retriever"])

    answer_pred = [inference_answer(pipe, question) for question in test_questions]

    results_1 = bertscore.compute(predictions=answer_pred, references=generated_answer_true, lang="en")
    results_2 = bleu.compute(predictions=answer_pred, references=generated_answer_true)

    # Record the row only once every metric is available, so a failure while
    # evaluating one model cannot leave the result columns with different lengths.
    results["Model"].append(model_name)
    results["DPR-ft"].append(False)
    results["Data"].append(data_name)
    results["Bert.Precision"].append(np.mean(results_1["precision"]))
    results["Bert.Recall"].append(np.mean(results_1["recall"]))
    results["Bert.F1"].append(np.mean(results_1["f1"]))
    results["BLEU"].append(results_2["bleu"])

    # Drop the references to this checkpoint and release cached GPU memory
    del generator
    del pipe
    torch.cuda.empty_cache()

In [None]:
# Drop the current retriever and release cached GPU memory
del retriever
torch.cuda.empty_cache()

# Load the fine-tuned retriever of the configured year
retriever = DensePassageRetriever.load(
    document_store=document_store,
    load_dir=f"../../models/DPR/{YEAR}",
    use_gpu=True,
)

In [None]:
# Evaluate every generator of the configured year with the fine-tuned retriever
generator = None
pipe = None

# Read the question column once; indexing generated_data["test"]["question"] inside
# the comprehension below would fetch the whole column again for every question.
test_questions = generated_data["test"]["question"]

for model in models:
    model_data = model.split("-")

    # The year is the second-to-last "-"-separated field of the entry name.
    # Entries that do not follow that pattern (no "-", non-numeric field) are
    # skipped instead of aborting the whole run.
    try:
        year = int(model_data[-2])
    except (IndexError, ValueError):
        continue

    if year != YEAR:
        continue

    print(f"Evaluating model {model}...")
    model_name = "-".join(model_data[:2])
    data_name = model_data[-1]

    # Retriever -> generator query pipeline for this checkpoint
    generator = Seq2SeqGenerator(model_name_or_path=f"{MODEL_PATH}/{model}", input_converter=_BartEli5Converter(), use_gpu=True)
    pipe = Pipeline()
    pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
    pipe.add_node(component=generator, name="Model", inputs=["Retriever"])

    answer_pred = [inference_answer(pipe, question) for question in test_questions]

    results_1 = bertscore.compute(predictions=answer_pred, references=generated_answer_true, lang="en")
    results_2 = bleu.compute(predictions=answer_pred, references=generated_answer_true)

    # Record the row only once every metric is available, so a failure while
    # evaluating one model cannot leave the result columns with different lengths.
    results["Model"].append(model_name)
    results["DPR-ft"].append(True)
    results["Data"].append(data_name)
    results["Bert.Precision"].append(np.mean(results_1["precision"]))
    results["Bert.Recall"].append(np.mean(results_1["recall"]))
    results["Bert.F1"].append(np.mean(results_1["f1"]))
    results["BLEU"].append(results_2["bleu"])

    # Drop the references to this checkpoint and release cached GPU memory
    del generator
    del pipe
    torch.cuda.empty_cache()

In [None]:
# Turn the collected result columns into a table (one row per evaluated model)
results_df = pd.DataFrame.from_dict(results)

In [None]:
# Show the full results table: lift the pandas row/column display limits for this block only
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
):
    display(results_df)