In [1]:
import evaluate
from datasets import *
import numpy as np
import pathlib as pl
import torch
import ast
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, TransformersReader
from haystack.pipelines import Pipeline

# Report year under evaluation; interpolated into the reader checkpoint path
# and into the dataset CSV filename in later cells.
year: int = 2022

In [2]:
# Restore the FAISS document store from the index and config files kept under ../app.
document_store = FAISSDocumentStore.load(
    index_path="../app/document_store.faiss",
    config_path="../app/document_store.json",
)

In [3]:
# Dense passage retriever over the loaded document store. Queries and passages
# are embedded with the two separate encoder checkpoints named below.
retriever = DensePassageRetriever(
    use_gpu=True,
    document_store=document_store,
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
)

  return self.fget.__get__(instance, owner)()
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [11]:
# Build the checkpoint path with pathlib rather than a backslash-separated literal:
# "\i" and "\d" are invalid escape sequences in a non-raw string, and hard-coded
# backslashes only resolve on Windows. str(Path) renders with the separator of the
# current OS, so the Windows result is unchanged.
reader_model_dir = pl.Path("..", "initial_moddeling", "distilbert-qa", f"distilbert-nlb-qa-{year}")
reader = TransformersReader(model_name_or_path=str(reader_model_dir), use_gpu=True)

  return self.fget.__get__(instance, owner)()


In [12]:
# Two-stage QA pipeline: the retriever is fed by the "Query" input and the
# reader (registered under the node name "Model") is fed by the retriever.
pipe = Pipeline()
for node, node_name, node_inputs in (
    (retriever, "Retriever", ["Query"]),
    (reader, "Model", ["Retriever"]),
):
    pipe.add_node(component=node, name=node_name, inputs=node_inputs)

In [13]:
# Load the SQuAD-format CSV for the selected year and split it 70/30 with a
# fixed seed so the held-out test split is the same on every run.
data = load_dataset(
    'csv',
    data_files=f"../data/clean/sustainability-report-{year}-squad-format.csv",
    delimiter=";",
    split='train',
).train_test_split(test_size=0.3, shuffle=True, seed=42)

# The "answers" column is parsed with ast.literal_eval and the parsed value is
# returned from `map`, which adds its keys as columns (presumably "text" and
# "answer_start", since the next step reads exactly those).
data["test"] = data["test"].map(
    lambda example: ast.literal_eval(example["answers"]))
# Rebuild "answers" as a nested {"text", "answer_start"} record per example.
data["test"] = data["test"].map(lambda example: {"question": example["question"], "context": example["context"], "answers": {
                                  "text": example["text"], "answer_start": example["answer_start"]}})
# `remove_columns` returns a new dataset instead of modifying in place, so the
# result has to be assigned back; otherwise the helper columns stay in the split.
data["test"] = data["test"].remove_columns(["text", "answer_start"])
data["test"]

Found cached dataset csv (C:/Users/rjutr/.cache/huggingface/datasets/csv/default-e3048f1bd60b5c4e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Loading cached split indices for dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e96c5bd318352c3f.arrow and C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-3f782815aab69336.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-0e58ef57ff25bb58.arrow
Loading cached processed dataset at C:\Users\rjutr\.cache\huggingface\datasets\csv\default-e3048f1bd60b5c4e\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-944b27bfd97247d4.arrow


Dataset({
    features: ['index', 'question', 'context', 'answers', 'id'],
    num_rows: 107
})

In [14]:
# Set the device
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# Define the prediction function
def inference_answer(model, question, top_k=3):
    """Run the QA pipeline on one question and return the first answer's text.

    Args:
        model: Object exposing ``run(query=..., params=...)`` that returns a
            mapping with an ``"answers"`` sequence whose items carry an
            ``answer`` attribute (the notebook passes its pipeline here).
        question: Question text forwarded to the pipeline as the query.
        top_k: Number of candidates requested from the node named "Model".
            Defaults to 3, the value that used to be hard-coded. Only the
            first candidate is returned.

    Returns:
        The ``answer`` attribute of the first returned candidate (presumably
        the best-ranked one), or an empty string when no candidate is
        returned, so a single miss does not abort a whole evaluation run.
    """
    with torch.no_grad():
        outputs = model.run(query=question, params={"Model": {"top_k": top_k}})
    answers = outputs["answers"]
    if not answers:
        # No candidate came back; report an empty prediction instead of
        # raising IndexError on answers[0].
        return ""
    return answers[0].answer

In [16]:
# Read the question column once and iterate over it. Indexing
# data["test"]["question"][idx] inside the loop accesses the whole column again
# for every example (quadratic when column access returns a full list).
answer_pred = [inference_answer(pipe, question) for question in data["test"]["question"]]



In [17]:
# Gold answers: the first reference text of every test example. The answers
# column is read once and iterated directly rather than re-accessed per index,
# matching how the SQuAD references are built further down.
answer_true = [answer["text"][0] for answer in data["test"]["answers"]]

In [12]:
# Load the three metrics used below (embedding-based, SQuAD v2, n-gram based).
bertscore, squad_v2_metric, bleu = (
    evaluate.load(metric_name) for metric_name in ("bertscore", "squad_v2", "bleu")
)

In [13]:
# Embedding-based evaluation: score every prediction against its gold answer,
# then report the mean of the returned f1 / precision / recall values.
results = bertscore.compute(
    predictions=answer_pred,
    references=answer_true,
    lang="en",
)
f1_mean, precision_mean, recall_mean = (
    np.array(results[score_name]).mean()
    for score_name in ("f1", "precision", "recall")
)
print(f"F1: {f1_mean}, Precision: {precision_mean}, Recall: {recall_mean}")

  return self.fget.__get__(instance, owner)()


F1: 0.9175579792985292, Precision: 0.9185575748158392, Recall: 0.9168313507721803


In [14]:
# SQuAD v2 evaluation. Each reference keeps only the first gold answer and its
# start offset; each prediction is paired with its reference through the
# stringified example id and carries a no-answer probability of zero.
references = [
    {
        "answers": {
            "answer_start": [gold["answer_start"][0]],
            "text": [gold["text"][0]],
        },
        "id": str(example_id),
    }
    for example_id, gold in zip(data["test"]["id"], data["test"]["answers"])
]
predictions = [
    {"id": str(example_id), "prediction_text": predicted, "no_answer_probability": 0.0}
    for example_id, predicted in zip(data["test"]["id"], answer_pred)
]
results = squad_v2_metric.compute(predictions=predictions, references=references)
results

{'exact': 31.77570093457944,
 'f1': 36.37870436935857,
 'total': 107,
 'HasAns_exact': 31.77570093457944,
 'HasAns_f1': 36.37870436935857,
 'HasAns_total': 107,
 'best_exact': 31.77570093457944,
 'best_exact_thresh': 0.0,
 'best_f1': 36.37870436935857,
 'best_f1_thresh': 0.0}

In [15]:
# N-gram based evaluation: each prediction gets a one-element list holding its
# single gold answer as the reference set passed to the BLEU metric.
references = [[gold_answer] for gold_answer in answer_true]
results = bleu.compute(
    predictions=answer_pred,
    references=references,
)
results

{'bleu': 0.16736303030434813,
 'precisions': [0.31225296442687744,
  0.1917808219178082,
  0.14606741573033707,
  0.12903225806451613],
 'brevity_penalty': 0.9131007162822622,
 'length_ratio': 0.9166666666666666,
 'translation_length': 253,
 'reference_length': 276}