In [2]:
# import dependencies

from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers
from transformers import AutoTokenizer, AutoModel


In [3]:
# connect to ES, create indexes
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

# to delete created indexes later on use:
# curl -XDELETE localhost:9200/label
# curl -XDELETE localhost:9200/document


03/13/2021 13:48:47 - INFO - elasticsearch -   HEAD http://localhost:9200/document [status:200 request:0.008s]
03/13/2021 13:48:47 - INFO - elasticsearch -   GET http://localhost:9200/document [status:200 request:0.002s]
03/13/2021 13:48:47 - INFO - elasticsearch -   PUT http://localhost:9200/document/_mapping [status:200 request:0.012s]
03/13/2021 13:48:47 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.004s]


In [4]:
# add data to ES

from lxml import etree
from lxml.etree import tostring

input_file_path = "data/Wikipedia-Strength-Training.xml"


tree = etree.parse(input_file_path)
root = tree.getroot()
# iterate through all the titles

document_dictionaries = []
for text_node in root.findall(".//text", namespaces=root.nsmap)[:20]:
    document_dictionaries.append({'text': text_node.text, 'meta': None})

document_store.write_documents(document_dictionaries)    


03/13/2021 13:48:49 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.448s]


In [5]:
# initialize retriever
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

luke_tokenizer = AutoTokenizer.from_pretrained("nielsr/luke-large")

# luke_model = AutoModel.from_pretrained("nielsr/luke-large")

KeyError: 'luke'

In [6]:
# initialize reader
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)


03/13/2021 13:37:46 - INFO - farm.utils -   Using device: CPU 
03/13/2021 13:37:46 - INFO - farm.utils -   Number of GPUs: 0
03/13/2021 13:37:46 - INFO - farm.utils -   Distributed Training: False
03/13/2021 13:37:46 - INFO - farm.utils -   Automatic Mixed Precision: None
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
03/13/2021 13:37:56 - INFO - farm.utils -   Using device: CPU 
03/13/2021 13:37:56 - INFO - farm.utils -   Number of GPUs: 0
03/13/2021 13:37:56 - INFO - farm.utils -   Distributed Training: False
03/13/2021 13:37:56 - INFO - farm.utils -   Automatic Mixed Precision: None
03/13/2021 13:37:56 - INFO - farm.infer -   Got ya 11 parallel workers to do inference ...
03/13/2021 13:37:56 - INFO - farm.infer -    0    0    0 

In [7]:
# create pipeline

from haystack.pipeline import ExtractiveQAPipeline
pipe = ExtractiveQAPipeline(reader, retriever)

In [8]:
# make predictions
prediction1 = pipe.run(query="What is stamina?", top_k_retriever=10, top_k_reader=5)

prediction2 = pipe.run(query="How to avoid injury?", top_k_retriever=10, top_k_reader=5)


03/13/2021 13:38:03 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.127s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.84s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.63s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.67s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.67s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 2/2 [00:15<00:00,  7.76s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 2/2 [00:15<00:00,  7.81s/ Batches]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.89s/ Batches]
Inferencing Samples: 100%|███████████████

In [9]:
# get answers
print("Question 'What is stamina?'")
print_answers(prediction1, details="minimal")
print("Question 'How to avoid injury?'")
print_answers(prediction2, details="minimal")



Question 'What is stamina?'
[   {   'answer': 'personality trait',
        'context': '2-51, Washington DC.</ref>\n'
                   '\n'
                   'Endurance may also refer to an [[Grit (personality '
                   'trait)|ability to persevere through a difficult '
                   'situation]].\n'
                   '\n'
                   '== Traini'},
    {   'answer': 'personality trait',
        'context': '2-51, Washington DC.</ref>\n'
                   '\n'
                   'Endurance may also refer to an [[Grit (personality '
                   'trait)|ability to persevere through a difficult '
                   'situation]].\n'
                   '\n'
                   '== Traini'},
    {   'answer': 'the ability of an [[organism]] to exert itself and remain '
                  'active for a long period of time',
        'context': 'sychological)|hardiness]]) is the ability of an '
                   '[[organism]] to exert itself and remain active for 