In [1]:
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

In [30]:
# Connect to the Elasticsearch instance running on localhost and use its
# "ahrq" index as the Haystack document store.

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

# Variant that additionally passes search_fields:
# document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="ahrq", search_fields='body')
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="ahrq",
)

11/10/2020 17:02:14 - INFO - elasticsearch -   HEAD http://localhost:9200/ahrq [status:200 request:0.053s]
11/10/2020 17:02:14 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.016s]


## Preprocessing of documents

Haystack provides a customizable pipeline for:
 - converting files into texts
 - cleaning texts
 - splitting texts
 - writing them to a Document Store

In this notebook, we pull the AHRQ documents from an existing Elasticsearch index (`ahrq_annotated`), save them as text files, apply a basic cleaning function, and index the resulting paragraphs in Elasticsearch.

In [4]:
def clean_url_text(text: str) -> str:
    """Drop short lines from ``text`` and regroup the rest into paragraphs.

    Lines of 100 characters or fewer are discarded. Each kept line is
    appended to the current paragraph followed by ``'\\n\\t'``. A line longer
    than 500 characters closes the current paragraph. Paragraphs are joined
    with a blank line (``"\\n\\n"``).

    :param text: raw page text.
    :return: the cleaned text; an empty string if no line is long enough.
    """
    # Collapse runs of blank lines into single line breaks.
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")

    paragraphs = []
    current = ''
    for line in text.split("\n"):
        if len(line) <= 100:
            # Too short to keep.
            continue

        current = current + line + '\n\t'

        # A very long line ends the paragraph being accumulated.
        if len(line) > 500:
            paragraphs.append(current)
            current = ''

    # Flush whatever is left after the last line.
    if current:
        paragraphs.append(current)

    return "\n\n".join(paragraphs)

from elasticsearch import Elasticsearch
import json
# Page through the source Elasticsearch index with the scroll API and collect
# the `_source` of every document, so the docs can be saved as txt files and
# passed to convert_files_to_dicts.

host = 'localhost'
port = 9200
client = Elasticsearch(f'{host}:{port}')
index = 'ahrq_annotated'

# Match every document, 100 hits per page.
match_all = {
    "size": 100,
    "query": {
        "match_all": {}
    }
}

# The initial search already returns the first page of hits plus a scroll id.
resp = client.search(
    index = index,
    body = match_all,
    scroll = '2s' # length of time to keep search context
)

old_scroll_id = resp['_scroll_id']

dicts = []
while len(resp['hits']['hits']):
    # Consume the page we already hold BEFORE requesting the next one;
    # otherwise the hits returned by the initial search are silently dropped.
    for doc in resp['hits']['hits']:
        # Remove the 'tags' field; pop() tolerates documents that lack it.
        doc['_source'].pop('tags', None)
        dicts.append(doc['_source'])

    resp = client.scroll(
        scroll_id = old_scroll_id,
        scroll = '2s' # length of time to keep search context
    )

    # check if there's a new scroll ID
    if old_scroll_id != resp['_scroll_id']:
        print ("NEW SCROLL ID:", resp['_scroll_id'])

    # keep track of the most recent scroll _id
    old_scroll_id = resp['_scroll_id']


    
    
    
# Save docs as txt files
doc_dir = "data/ahrq_txt_files/"
import os
# NOTE(review): files are only written when the directory does not exist yet,
# so an already existing directory is reused as-is (even if stale or empty).
# Confirm this caching behaviour is intended.
if not os.path.exists(doc_dir):
    os.makedirs(doc_dir)
    
    for d in dicts:
        # File name = url without its first 8 characters (presumably the
        # "https://" prefix - confirm), with '/' replaced by '_'.
        with open(doc_dir + d['url'][8:].replace('/', '_') + '.txt', 'w', encoding="utf-8") as f:
            f.write(d['body'])
            
# Read the txt files back, clean them and split them into paragraphs.
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_url_text, split_paragraphs=True)
print(dicts[:3])

s = sum(len(d['text']) for d in dicts)
# Guard against an empty result so the average does not divide by zero.
if dicts:
    print('average lenght of paragraph: ', s/len(dicts))

# Now, let's write the dicts containing documents to our DB.
document_store.write_documents(dicts)

11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/ahrq_annotated/_search?scroll=2s [status:200 request:0.023s]
11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.010s]
11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.011s]
11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.009s]
11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.009s]
11/05/2020 10:20:57 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.006s]
11/05/2020 10:20:57 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.011s]
11/05/2020 10:20:57 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [sta

11/05/2020 10:21:00 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.006s]
11/05/2020 10:21:00 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.005s]
11/05/2020 10:21:00 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.001s]


## Initialize Retriever, Reader, & Finder

### Retriever

Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered.
They use simple but fast algorithms.

**Here:** We use Elasticsearch's default BM25 algorithm

**Alternatives:**

- Customize the `ElasticsearchRetriever` with custom queries (e.g. boosting) and filters
- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging
- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)
- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6)

### Reader

A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
on powerful, but slower deep learning models.

Haystack currently supports Readers based on the frameworks FARM and Transformers.
With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).

**Here:** a medium sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2)

**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)

**Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)

**Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible"

#### FARMReader

### Finder

The Finder sticks together reader and retriever in a pipeline to answer our actual questions. 

In [32]:
from haystack.retriever.sparse import ElasticsearchRetriever

# Retriever that queries the Elasticsearch-backed document store.
retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store.

# from haystack.retriever.sparse import TfidfRetriever
# retriever = TfidfRetriever(document_store=document_store)


# Reader: load a local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)

reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)

# Combine reader and retriever into a single question-answering entry point.
finder = Finder(reader, retriever)

## Voilà! Ask a question!

In [52]:
# top_k_retriever / top_k_reader set how many candidates the retriever and the
# reader shall return.
# The higher top_k_retriever, the better (but also the slower) your answers.
question = "What department is AHRQ a part of?"
prediction = finder.get_answers(
    question,
    top_k_retriever=10,
    top_k_reader=5,
)

print_answers(prediction, details="medium")

11/10/2020 17:37:00 - INFO - elasticsearch -   POST http://localhost:9200/ahrq/_search [status:200 request:0.014s]
11/10/2020 17:37:00 - INFO - haystack.retriever.sparse -   Got 10 candidates from retriever
11/10/2020 17:37:00 - INFO - haystack.finder -   Reader is looking for detailed answer in 11648 chars ...
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.00 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 25.17 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.49 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.67 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.49 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/

[   {   'answer': 'Health and Human Services',
        'context': 'nd Quality (AHRQ). As part of the budget for the '
                   'Department of Health and Human Services (HHS), the '
                   'performance budget submission both provides backgr',
        'score': 14.955009460449219},
    {   'answer': 'Health and Human Services',
        'context': 'nd Quality (AHRQ). As part of the budget for the '
                   'Department of Health and Human Services (HHS), the '
                   'performance budget submission both provides backgr',
        'score': 14.698270797729492},
    {   'answer': 'Health and Human Services',
        'context': 'nd Quality (AHRQ). As part of the budget for the '
                   'Department of Health and Human Services (HHS), the '
                   'performance budget submission both provides backgr',
        'score': 14.564885139465332},
    {   'answer': 'Health and Human Services',
        'context': 'nd Quality (AHRQ). As part 




In [62]:
# Call the retriever directly (without the reader) for the same question and
# keep the returned objects.
objs = finder.retriever.retrieve(question, filters=None, top_k=10, index="ahrq")
# Convert each returned object to a plain dict.
# NOTE(review): this rebinds `dicts`, which an earlier cell used for the
# paragraphs written to the document store.
dicts = [o.to_dict() for o in objs]
# Show the keys of the first converted result.
# NOTE(review): the saved output of this cell lists Document objects, not
# dict keys - it looks stale; re-run to confirm.
dicts[0].keys()

11/10/2020 18:15:35 - INFO - elasticsearch -   POST http://localhost:9200/ahrq/_search [status:200 request:0.014s]
11/10/2020 18:15:35 - INFO - haystack.retriever.sparse -   Got 10 candidates from retriever


[<haystack.schema.Document at 0x2b3a096d3d0>,
 <haystack.schema.Document at 0x2b3a096d370>,
 <haystack.schema.Document at 0x2b3922b3fd0>,
 <haystack.schema.Document at 0x2b3f8c52f70>,
 <haystack.schema.Document at 0x2b3f8c526d0>,
 <haystack.schema.Document at 0x2b3a0843190>,
 <haystack.schema.Document at 0x2b3a0843dc0>,
 <haystack.schema.Document at 0x2b3a08434f0>,
 <haystack.schema.Document at 0x2b3a08432e0>,
 <haystack.schema.Document at 0x2b3a08432b0>]

In [61]:
# NOTE(review): `d` is assigned here but not used by this cell.
d = dicts[0:1]
# Run the reader directly on the retrieved objects. The third positional
# argument is presumably the number of answers to return (top_k) - confirm
# against FARMReader.predict.
reader.predict(question, objs, 1)

Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.44 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 39.99 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 37.04 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 41.65 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.48 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 43.49 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.85 Batches/s]
Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 30.95 Batches/s]
Inferencing Samples: 100%|██████████████

{'question': 'What department is AHRQ a part of?',
 'no_ans_gap': 12.131163120269775,
 'answers': [{'answer': 'Health and Human Services',
   'score': 14.955009460449219,
   'probability': 0.866386080089388,
   'context': 'nd Quality (AHRQ). As part of the budget for the Department of Health and Human Services (HHS), the performance budget submission both provides backgr',
   'offset_start': 63,
   'offset_end': 88,
   'offset_start_in_doc': 821,
   'offset_end_in_doc': 846,
   'document_id': '929e7758-0c10-43ac-bf4f-3efd4645d700'}]}

In [28]:
# Show the class of the processor attached to the reader's inferencer.
type(reader.inferencer.processor)

farm.data_handler.processor.SquadProcessor

# Show the class of the model object inside the reader's language model.
type(reader.inferencer.model.language_model.model)

In [53]:
# Inspect the prediction heads attached to the reader's model.
a = reader.inferencer.model.prediction_heads
print(type(a))
if len(a) > 0:
    # Only index into the container when it actually holds a head; both
    # look-ups of a[0] stay inside the guard so an empty container cannot
    # raise IndexError.
    print(a[0].aggregate_preds)
    print(type(a[0]))
# Drop the temporary name so it does not linger in the kernel namespace.
del a

<class 'torch.nn.modules.container.ModuleList'>
<bound method QuestionAnsweringHead.aggregate_preds of QuestionAnsweringHead(
  (feed_forward): FeedForwardBlock(
    (feed_forward): Sequential(
      (0): Linear(in_features=768, out_features=2, bias=True)
    )
  )
)>
<class 'farm.modeling.prediction_head.QuestionAnsweringHead'>


In [20]:
# Show the class of the model object inside the reader's language model.
type(reader.inferencer.model.language_model.model)

transformers.modeling_roberta.RobertaModel

In [19]:
# List every attribute name of the underlying model object.
# NOTE(review): this produces a very long output; consider clearing it before
# sharing the notebook.
dir(reader.inferencer.model.language_model.model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_convert_head_mask_to_5d',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_forward_unimplemented',
 '_generate_beam_search',
 '_generate_no_beam_search',
 '_get_name',
 '_get_resized_embeddings',
 '_hook_rss_memory_post_forward',
 '_hook_rss_memory_pre_forward',
 '_init_weights',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_prune_heads',
 '_register_load_state_dict_pre_hook',
 '_register_st