# Retriever and Reader
In this notebook we implement the retriever (a dense passage retriever, among others), which returns the most probable document and passage for answering the question. The reader, on the other hand, processes that passage in order to extract the specific answer to the question.

In [1]:

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import DensePassageRetriever, BM25Retriever, EmbeddingRetriever
import os
from haystack.nodes import FARMReader,TransformersReader
from haystack.pipelines import ExtractiveQAPipeline

---

# Retriever

<b>ds_astronomy:</b> document store with processed data 

In [2]:
# Elasticsearch host name comes from the ELASTICSEARCH_HOST environment
# variable and defaults to "localhost" when it is not set.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Document store bound to the "ds_astronomy" index. Empty strings are passed
# for username and password.
ds_astronomy = ElasticsearchDocumentStore(
    host=host,
    username="",
    password="",
    index="ds_astronomy",
    similarity="dot_product",
    embedding_dim=768  # presumably the vector size of the encoders used below -- TODO confirm
)

# Second name for the same store object; not referenced again in the cells shown here.
curr_store = ds_astronomy
# NOTE(review): this URL is hard-coded to localhost:9200 and does not follow
# `host`, so it points elsewhere than the store when ELASTICSEARCH_HOST is set.
localhost = 'http://localhost:9200/ds_astronomy/_count'

In [3]:
# Dense Passage Retriever (DPR), configured with one model for encoding
# queries and another for encoding passages.
# NOTE(review): `retriever` is rebound by the EmbeddingRetriever and BM25 cells
# further down; the pipeline uses whichever of these cells ran last.
retriever = DensePassageRetriever(
    document_store=ds_astronomy,
    query_embedding_model='facebook/dpr-question_encoder-single-nq-base',
    passage_embedding_model='facebook/dpr-ctx_encoder-single-nq-base',
    use_gpu=True,
    embed_title=True
)
# Recompute the stored document embeddings with this retriever.
ds_astronomy.update_embeddings(retriever=retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [4]:
# EmbeddingRetriever backed by a sentence-transformers model.
# Worst performing of the retrievers tried in this notebook.
retriever = EmbeddingRetriever(
    document_store=ds_astronomy,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
)
# Recompute the stored document embeddings with this retriever.
ds_astronomy.update_embeddings(retriever=retriever)



In [18]:
# BM25 retriever over the same document store.
retriever = BM25Retriever(document_store=ds_astronomy)

---

<h1>Reader</h1>

In [19]:
# Checkpoint used by the extractive reader.
roberta = "deepset/roberta-base-squad2"

# FARM-based reader: up to two answers per query, "no answer" allowed,
# with no extra boost applied to the no-answer score.
reader = FARMReader(
    model_name_or_path=roberta,
    use_gpu=True,
    return_no_answer=True,
    no_ans_boost=0,
    top_k=2,
)

---

In [20]:
# Chain the currently bound retriever and the reader into one extractive QA pipeline.
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

---

<h3>Question & Answer</h3>

In [7]:
import time

In [8]:
from transformers import PegasusForConditionalGeneration as PCG
from transformers import PegasusTokenizerFast as PTF

# Load the paraphrasing model and its tokenizer from the same checkpoint.
# Both objects are handed to get_paraphrased_sentences in the evaluation loop.
model = PCG.from_pretrained("tuner007/pegasus_paraphrase")
tokenizer = PTF.from_pretrained("tuner007/pegasus_paraphrase")

In [9]:
def get_paraphrased_sentences(model, tokenizer, sentence, num_return_sequences=5, num_beams=5):
    """Generate paraphrases of ``sentence`` and return them with the original first.

    Args:
        model: Sequence-to-sequence model exposing
            ``generate(**inputs, num_beams=..., num_return_sequences=...)``.
        tokenizer: Tokenizer matching ``model``; it is called on a list of
            sentences and must also provide ``batch_decode``.
        sentence: The text to paraphrase.
        num_return_sequences: How many paraphrases to request from the model.
        num_beams: Beam-search width. When it is smaller than
            ``num_return_sequences`` it is raised to that value, because beam
            search cannot return more sequences than it has beams.

    Returns:
        list[str]: The decoded paraphrases. ``sentence`` itself is inserted at
        index 0 unless the model already produced it verbatim.
    """
    # Guard against num_return_sequences > num_beams, which generate() rejects.
    beam_width = max(num_beams, num_return_sequences)

    # Tokenize the text into a batch (of one) of token IDs.
    encoded = tokenizer([sentence], truncation=True, padding="longest", return_tensors="pt")

    # Generate the paraphrased sentences.
    generated = model.generate(
        **encoded,
        num_beams=beam_width,
        num_return_sequences=num_return_sequences,
    )

    # Decode the generated token IDs back to text.
    paraphrases = tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Always include the original wording so it is queried as well.
    if sentence not in paraphrases:
        paraphrases.insert(0, sentence)
    return paraphrases

In [22]:
# Evaluation questions, three per source topic. The individual q1..q24 names
# are kept so single questions can still be run on their own.

# White dwarfs
q1 = 'Compared to the Sun, what is the mass of white dwarfs stars'
q2 = 'Compared to the Sun, what is the size of white dwarfs stars'
q3 = 'What are the consequences of the strong gravitational field of the white dwarf'

# Dark matter
q4 = 'What is ordinary baryonic matter'
q5 = 'How abundant is dark matter'
q6 = 'What is dark matter'

# Geophysical classification of planets
q7 = 'What is the element composition of rock planets?'
q8 = 'Who are the ice giants'
q9 = 'What is the composition of ice giants'

# Stars
q10 = 'What is the Orion Nebula?'
q11 = 'How many years until the collapse of our sun'
q12 = 'What is the most abundant star in our Universe'

# Supernova stages
q13 = 'How many stages does the death of a star have'
q14 = 'What is a supernova'
q15 = 'What happens to the remains of the former star'

# Light
# "wavelength" was misspelled in the query, which keeps a keyword retriever
# from matching the term in the indexed text.
q16 = 'What is the wavelength range that the human eye can see'
q17 = 'What happens during a total eclipse'
q18 = "What would happen if the Sun's surface were 3.000 ºC cooler?"

# Black holes
q19 = 'What is a black hole?'
q20 = 'What resembles the mass of a large mountain'
q21 = "What happens when a very big star falls in upon itself?"

# Sunlight
q22 = "What is the source of Earth's light"
q23 = "What did Einstein show to clue the source of stellar energy"
q24 = 'What did scientists learn about matter?'

# All questions, in definition order, for the evaluation loop.
question_list = [
    q1, q2, q3,
    q4, q5, q6,
    q7, q8, q9,
    q10, q11, q12,
    q13, q14, q15,
    q16, q17, q18,
    q19, q20, q21,
    q22, q23, q24,
]

In [None]:
# For every base question: generate paraphrases (the original wording is
# included by get_paraphrased_sentences), run each variant through the QA
# pipeline, and print the returned answers with the time the query took.
for qn in question_list:
    print(f'#### {qn}####')
    query_list = get_paraphrased_sentences(model, tokenizer, qn, num_beams=5, num_return_sequences=5)

    for question in query_list:
        # perf_counter is a monotonic clock meant for measuring intervals.
        # Only the pipeline call is timed, so console output does not skew it.
        start = time.perf_counter()
        result = pipeline.run(query=question)
        end = time.perf_counter()

        print('Question: ', question)
        for prediction in result['answers']:
            # Read the text from the Answer object instead of slicing its str()
            # form, which breaks whenever that representation changes.
            # The surrounding quotes keep the printed layout of earlier runs.
            print(f"'{prediction.answer}'")
        print('Elapsed time: ', end - start)
        print("----------------------------------------------")