In [None]:
# Import libraries
import evaluate
from datasets import *
import numpy as np
import pathlib as pl
import pandas as pd
import torch
import ast
import os
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, TransformersReader, Seq2SeqGenerator
from haystack.nodes.answer_generator.transformers import _BartEli5Converter
from haystack.pipelines import Pipeline

# Year identifier: selects the year-specific DPR retriever checkpoint directory
YEAR = 2022

# Base directories of the fine-tuned checkpoints: MODEL_PATH_B is used when
# loading the TransformersReader models, MODEL_PATH_T when loading the
# Seq2SeqGenerator models
MODEL_PATH_B = "../../models/BERT"
MODEL_PATH_T = "../../models/T5"

In [None]:
# Load the expert-written QA pairs and hold out 30% of them as the test split
# (fixed seed so the split is reproducible).
written_data = load_dataset(
    "csv",
    data_files="../../data/clean/squad/QA_SR_2022_Expert-squad-format.csv",
    delimiter=";",
    split="train",
).train_test_split(test_size=0.3, shuffle=True, seed=42)

# Reading from CSV loads the "answers" column as a string, not an object, so
# parse it back. The parsed value is presumably a dict with "text" and
# "answer_start" keys (the next step reads them), which map() adds as
# temporary top-level columns.
written_data["test"] = written_data["test"].map(
    lambda example: ast.literal_eval(example["answers"]))

# Rebuild the nested SQuAD-style "answers" field from the temporary columns.
written_data["test"] = written_data["test"].map(lambda example: {
    "question": example["question"],
    "context": example["context"],
    "answers": {
        "text": example["text"],
        "answer_start": example["answer_start"],
    },
})

# Replace all "\n" with " " in the question, the context and the first answer
# text (only the first answer text is kept).
written_data["test"] = written_data["test"].map(lambda example: {
    "question": example["question"].replace("\n", " "),
    "context": example["context"].replace("\n", " "),
    "answers": {
        "text": [example["answers"]["text"][0].replace("\n", " ")],
        "answer_start": example["answers"]["answer_start"],
    },
})

# remove_columns returns a new dataset instead of modifying in place, so the
# result has to be assigned back for the temporary columns to actually go away.
written_data["test"] = written_data["test"].remove_columns(["text", "answer_start"])

In [None]:
# Restore the FAISS document store from its saved index and configuration files
document_store = FAISSDocumentStore.load(
    index_path="document_store.faiss",
    config_path="document_store.json",
)

In [None]:
# Restore the fine-tuned dense passage retriever for the configured year and
# attach it to the loaded document store
retriever = DensePassageRetriever.load(
    load_dir=f"../../models/DPR/{YEAR}",
    document_store=document_store,
    use_gpu=True,
)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the prediction function
def inference_answer(model, question):
    """Run one question through a pipeline and return its top answer and context.

    Args:
        model: Pipeline-like object exposing ``run(query=..., params=...)``.
            It must contain nodes named "Model" and "Retriever", because
            ``params`` addresses them by those names.
        question: The query string passed to the pipeline.

    Returns:
        An ``(answer, context)`` tuple: the ``answer`` attribute of the first
        returned answer and the ``content`` attribute of the first returned
        document. An element is ``None`` when the pipeline returned no answer
        or no document, so one empty result does not abort a whole
        evaluation loop with an IndexError.
    """
    with torch.no_grad():
        outputs = model.run(
            query=question,
            params={"Model": {"top_k": 1}, "Retriever": {"top_k": 1}},
        )

    answers = outputs.get("answers") or []
    documents = outputs.get("documents") or []
    answer = answers[0].answer if answers else None
    context = documents[0].content if documents else None
    return answer, context

In [None]:
# Accumulator for all model outputs: one list per result column, filled row by
# row by the evaluation cells
results = {
    column: []
    for column in (
        "Model",
        "Question",
        "Ground Truth Context",
        "Ground Truth Answer",
        "Retrieved Context",
        "Extracted/Generated Answer",
    )
}

In [None]:
# Load the DistilBERT reader checkpoint (2022-handwritten variant)
reader = TransformersReader(
    model_name_or_path=f"{MODEL_PATH_B}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2022-handwritten",
    use_gpu=True,
)

# Retriever -> reader pipeline; the node names must match the keys that
# inference_answer passes in `params`
pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=reader, name="Model", inputs=["Retriever"])

for row in written_data["test"]:
    question = row["question"].strip()
    # Run inference before touching `results`, so a failed or interrupted call
    # cannot leave the result columns with different lengths
    gotten_answer, gotten_context = inference_answer(pipe, question)
    results["Model"].append("distilbert-base-handwritten")
    results["Question"].append(question)
    results["Ground Truth Context"].append(row["context"])
    results["Ground Truth Answer"].append(row["answers"]["text"][0])
    results["Retrieved Context"].append(gotten_context)
    results["Extracted/Generated Answer"].append(gotten_answer)

# Drop the model references and release cached GPU memory before the next
# model is loaded
del reader
del pipe
torch.cuda.empty_cache()

In [None]:
# Load the DistilBERT reader checkpoint (full_combined variant)
# NOTE(review): this checkpoint name contains 2042 while the handwritten
# checkpoints contain 2022 - confirm this is the intended directory
reader = TransformersReader(
    model_name_or_path=f"{MODEL_PATH_B}/distilbert-base-cased-distilled-squad-finetuned-NLB-QA-2042-full_combined",
    use_gpu=True,
)

# Retriever -> reader pipeline; the node names must match the keys that
# inference_answer passes in `params`
pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=reader, name="Model", inputs=["Retriever"])

for row in written_data["test"]:
    question = row["question"].strip()
    # Run inference before touching `results`, so a failed or interrupted call
    # cannot leave the result columns with different lengths
    gotten_answer, gotten_context = inference_answer(pipe, question)
    results["Model"].append("distilbert-base-full_combined")
    results["Question"].append(question)
    results["Ground Truth Context"].append(row["context"])
    results["Ground Truth Answer"].append(row["answers"]["text"][0])
    results["Retrieved Context"].append(gotten_context)
    results["Extracted/Generated Answer"].append(gotten_answer)

# Drop the model references and release cached GPU memory before the next
# model is loaded
del reader
del pipe
torch.cuda.empty_cache()

In [None]:
# Load the RoBERTa reader checkpoint (2022-handwritten variant)
reader = TransformersReader(
    model_name_or_path=f"{MODEL_PATH_B}/roberta-base-squad2-finetuned-NLB-QA-2022-handwritten",
    use_gpu=True,
)

# Retriever -> reader pipeline; the node names must match the keys that
# inference_answer passes in `params`
pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=reader, name="Model", inputs=["Retriever"])

for row in written_data["test"]:
    question = row["question"].strip()
    # Run inference before touching `results`, so a failed or interrupted call
    # cannot leave the result columns with different lengths
    gotten_answer, gotten_context = inference_answer(pipe, question)
    results["Model"].append("roberta-base-handwritten")
    results["Question"].append(question)
    results["Ground Truth Context"].append(row["context"])
    results["Ground Truth Answer"].append(row["answers"]["text"][0])
    results["Retrieved Context"].append(gotten_context)
    results["Extracted/Generated Answer"].append(gotten_answer)

# Drop the model references and release cached GPU memory before the next
# model is loaded
del reader
del pipe
torch.cuda.empty_cache()

In [None]:
# Load the RoBERTa reader checkpoint (full_combined variant)
# NOTE(review): this checkpoint name contains 2042 while the handwritten
# checkpoints contain 2022 - confirm this is the intended directory
reader = TransformersReader(
    model_name_or_path=f"{MODEL_PATH_B}/roberta-base-squad2-finetuned-NLB-QA-2042-full_combined",
    use_gpu=True,
)

# Retriever -> reader pipeline; the node names must match the keys that
# inference_answer passes in `params`
pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=reader, name="Model", inputs=["Retriever"])

for row in written_data["test"]:
    question = row["question"].strip()
    # Run inference before touching `results`, so a failed or interrupted call
    # cannot leave the result columns with different lengths
    gotten_answer, gotten_context = inference_answer(pipe, question)
    results["Model"].append("roberta-base-full_combined")
    results["Question"].append(question)
    results["Ground Truth Context"].append(row["context"])
    results["Ground Truth Answer"].append(row["answers"]["text"][0])
    results["Retrieved Context"].append(gotten_context)
    results["Extracted/Generated Answer"].append(gotten_answer)

# Drop the model references and release cached GPU memory before the next
# model is loaded
del reader
del pipe
torch.cuda.empty_cache()

In [None]:
# Load the T5-base generator checkpoint (2022-handwritten variant)
generator = Seq2SeqGenerator(
    model_name_or_path=f"{MODEL_PATH_T}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2022-handwritten",
    input_converter=_BartEli5Converter(),
    use_gpu=True,
)

# Retriever -> generator pipeline; the node names must match the keys that
# inference_answer passes in `params`
pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=generator, name="Model", inputs=["Retriever"])

for row in written_data["test"]:
    question = row["question"].strip()
    # Run inference before touching `results`, so a failed or interrupted call
    # cannot leave the result columns with different lengths
    gotten_answer, gotten_context = inference_answer(pipe, question)
    results["Model"].append("t5-base-handwritten")
    results["Question"].append(question)
    results["Ground Truth Context"].append(row["context"])
    results["Ground Truth Answer"].append(row["answers"]["text"][0])
    results["Retrieved Context"].append(gotten_context)
    results["Extracted/Generated Answer"].append(gotten_answer)

# Drop the model references and release cached GPU memory before the next
# model is loaded
del generator
del pipe
torch.cuda.empty_cache()

In [None]:
# Load the T5-small generator checkpoint (2022-handwritten variant)
generator = Seq2SeqGenerator(
    model_name_or_path=f"{MODEL_PATH_T}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2022-handwritten",
    input_converter=_BartEli5Converter(),
    use_gpu=True,
)

# Retriever -> generator pipeline; the node names must match the keys that
# inference_answer passes in `params`
pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=generator, name="Model", inputs=["Retriever"])

for row in written_data["test"]:
    question = row["question"].strip()
    # Run inference before touching `results`, so a failed or interrupted call
    # cannot leave the result columns with different lengths
    gotten_answer, gotten_context = inference_answer(pipe, question)
    results["Model"].append("t5-small-handwritten")
    results["Question"].append(question)
    results["Ground Truth Context"].append(row["context"])
    results["Ground Truth Answer"].append(row["answers"]["text"][0])
    results["Retrieved Context"].append(gotten_context)
    results["Extracted/Generated Answer"].append(gotten_answer)

# Drop the model references and release cached GPU memory before the next
# model is loaded
del generator
del pipe
torch.cuda.empty_cache()

In [None]:
# Load the T5-base generator checkpoint (full_combined variant)
# NOTE(review): this checkpoint name contains 2042 while the handwritten
# checkpoints contain 2022 - confirm this is the intended directory
generator = Seq2SeqGenerator(
    model_name_or_path=f"{MODEL_PATH_T}/t5-base-finetuned-squadv2-finetuned-NLB-QA-2042-full_combined",
    input_converter=_BartEli5Converter(),
    use_gpu=True,
)

# Retriever -> generator pipeline; the node names must match the keys that
# inference_answer passes in `params`
pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=generator, name="Model", inputs=["Retriever"])

for row in written_data["test"]:
    question = row["question"].strip()
    # Run inference before touching `results`, so a failed or interrupted call
    # cannot leave the result columns with different lengths
    gotten_answer, gotten_context = inference_answer(pipe, question)
    results["Model"].append("t5-base-full_combined")
    results["Question"].append(question)
    results["Ground Truth Context"].append(row["context"])
    results["Ground Truth Answer"].append(row["answers"]["text"][0])
    results["Retrieved Context"].append(gotten_context)
    results["Extracted/Generated Answer"].append(gotten_answer)

# Drop the model references and release cached GPU memory before the next
# model is loaded
del generator
del pipe
torch.cuda.empty_cache()

In [None]:
# Load the T5-small generator checkpoint (full_combined variant)
# NOTE(review): this checkpoint name contains 2042 while the handwritten
# checkpoints contain 2022 - confirm this is the intended directory
generator = Seq2SeqGenerator(
    model_name_or_path=f"{MODEL_PATH_T}/t5-small-finetuned-squadv2-finetuned-NLB-QA-2042-full_combined",
    input_converter=_BartEli5Converter(),
    use_gpu=True,
)

# Retriever -> generator pipeline; the node names must match the keys that
# inference_answer passes in `params`
pipe = Pipeline()
pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipe.add_node(component=generator, name="Model", inputs=["Retriever"])

for row in written_data["test"]:
    question = row["question"].strip()
    # Run inference before touching `results`, so a failed or interrupted call
    # cannot leave the result columns with different lengths
    gotten_answer, gotten_context = inference_answer(pipe, question)
    results["Model"].append("t5-small-full_combined")
    results["Question"].append(question)
    results["Ground Truth Context"].append(row["context"])
    results["Ground Truth Answer"].append(row["answers"]["text"][0])
    results["Retrieved Context"].append(gotten_context)
    results["Extracted/Generated Answer"].append(gotten_answer)

# Drop the model references and release cached GPU memory before the next
# model is loaded
del generator
del pipe
torch.cuda.empty_cache()

In [None]:
# Turn the accumulated column lists into one table (one row per model/question pair)
results_df = pd.DataFrame(data=results)

In [None]:
# Temporarily lift the row and column display limits so the whole table is shown
unlimited_display = ("display.max_rows", None, "display.max_columns", None)
with pd.option_context(*unlimited_display):
    display(results_df)

In [None]:
# Split the results by model and write one semicolon-separated CSV per model
output_dir = pl.Path("../../data/results/2022-handwritten")
# to_csv does not create missing directories, so make sure the target exists
output_dir.mkdir(parents=True, exist_ok=True)

# sort=False keeps the models in order of first appearance
for model, model_df in results_df.groupby("Model", sort=False):
    model_df.to_csv(output_dir / f"{model}-outputs.csv", index=False, sep=";")