In [None]:
from haystack import Finder
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

In [2]:
# Connect to a locally running instance of Elasticsearch

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
# document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="ahrq", search_fields='body')
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="ahrq")

11/09/2020 14:27:04 - INFO - elasticsearch -   HEAD http://localhost:9200/ahrq [status:200 request:0.008s]
11/09/2020 14:27:04 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.001s]


## Preprocessing of documents

Haystack provides a customizable pipeline for:
 - converting files into texts
 - cleaning texts
 - splitting texts
 - writing them to a Document Store

In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and index them in Elasticsearch.

In [3]:
def clean_url_text(text: str) -> str:
    # get rid of multiple new lines
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")

    # remove extremely short lines, combine small paragraphs into larger ones
    lines = text.split("\n")
    cleaned = []
    multi_lines = ''
    for l in lines:
        if len(l) > 100:
            multi_lines += l + '\n\t'
        if len(l) > 500:
            cleaned.append(multi_lines)
            multi_lines = ''
    
    if multi_lines: cleaned.append(multi_lines) 
    text = "\n\n".join(cleaned)

    # add paragraphs (recognized by double new line)
    # text = text.replace("\n", "\n\n")

    # remove empty paragrahps
    # text = re.sub(r"(==.*==\n\n\n)", "", text)

    return text

In [4]:
from elasticsearch import Elasticsearch
import json
# Loop through ES and save all docs as txt files so we can use convert_files_to_dict funciton

host = 'localhost'
port = 9200
client = Elasticsearch(f'{host}:{port}')
index = 'ahrq_annotated'

match_all = {
    "size": 100,
    "query": {
        "match_all": {}
    }
}

resp = client.search(
    index = index,
    body = match_all,
    scroll = '2s' # length of time to keep search context
)

old_scroll_id = resp['_scroll_id']

dicts = []
while len(resp['hits']['hits']):
    resp = client.scroll(
        scroll_id = old_scroll_id,
        scroll = '2s' # length of time to keep search context
    )

    # check if there's a new scroll ID
    if old_scroll_id != resp['_scroll_id']:
        print ("NEW SCROLL ID:", resp['_scroll_id'])

    # keep track of pass scroll _id
    old_scroll_id = resp['_scroll_id']
    
    for doc in resp['hits']['hits']:
        del doc['_source']['tags']
        dicts.append(doc['_source'])


    
    
    
# Save docs as txt files
doc_dir = "data/ahrq_txt_files/"
import os
if not os.path.exists(doc_dir):
    os.makedirs(doc_dir)
    
    for d in dicts:
        with open(doc_dir + d['url'][8:].replace('/', '_') + '.txt', 'w', encoding="utf-8") as f:
            f.write(d['body'])

11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/ahrq_annotated/_search?scroll=2s [status:200 request:0.023s]
11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.010s]
11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.011s]
11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.009s]
11/05/2020 10:20:56 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.009s]
11/05/2020 10:20:57 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.006s]
11/05/2020 10:20:57 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.011s]
11/05/2020 10:20:57 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [sta

11/05/2020 10:21:00 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.006s]
11/05/2020 10:21:00 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.005s]
11/05/2020 10:21:00 - INFO - elasticsearch -   POST http://localhost:9200/_search/scroll?scroll=2s [status:200 request:0.001s]


In [5]:
dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_url_text, split_paragraphs=True)
print(dicts[:3])

[{'text': 'In 2016, AHRQ launched a program of grants and contracts aimed at helping health care providers move patient-centered outcomes research (PCOR) evidence into practice through clinical decision support (CDS). AHRQ advances the science of CDS by supporting implementers, clinicians, and technology vendors in developing CDS tools that are shareable, standards-based, publicly-available, and patient-centered. The four components are detailed below. \n\tAHRQ awarded RTI International a cooperative agreement to become the Patient-Centered Clinical Decision Support Learning Network (PCCDS-LN). The PCCDS-LN built a community of researchers, clinicians, professional societies, and others that explored and advanced patient-centered CDS. While the grant ended in 2020, many of the Learning Network’s resources continue to be available. \n\tAHRQ awarded the MITRE Corporation a contract to develop "CDS Connect," an online platform that includes a repository of CDS artifacts, an authoring tool

In [6]:
s = sum([len(d['text']) for d in dicts])
s/len(dicts)

1559.146375123992

In [7]:
# Now, let's write the dicts containing documents to our DB.
document_store.write_documents(dicts)

11/05/2020 10:22:03 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:0.889s]
11/05/2020 10:22:04 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.144s]
11/05/2020 10:22:05 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.162s]
11/05/2020 10:22:06 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.079s]
11/05/2020 10:22:07 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.118s]
11/05/2020 10:22:09 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.128s]
11/05/2020 10:22:10 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.082s]
11/05/2020 10:22:11 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.154s]


## Initalize Retriever, Reader,  & Finder

### Retriever

Retrievers help narrowing down the scope for the Reader to smaller units of text where a given question could be answered.
They use some simple but fast algorithm.

**Here:** We use Elasticsearch's default BM25 algorithm

**Alternatives:**

- Customize the `ElasticsearchRetriever`with custom queries (e.g. boosting) and filters
- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging
- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT)
- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6)

In [3]:
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

In [4]:
# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store.

# from haystack.retriever.sparse import TfidfRetriever
# retriever = TfidfRetriever(document_store=document_store)

### Reader

A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based
on powerful, but slower deep learning models.

Haystack currently supports Readers based on the frameworks FARM and Transformers.
With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models).

**Here:** a medium sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2)

**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package)

**Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy)

**Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible"

#### FARMReader

In [6]:
# Load a  local model or any of the QA models on
# Hugging Face's model hub (https://huggingface.co/models)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

11/09/2020 14:36:26 - INFO - farm.utils -   device: cuda n_gpu: 1, distributed training: False, automatic mixed precision training: None
11/09/2020 14:36:26 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
11/09/2020 14:36:35 - INFO - farm.utils -   device: cuda n_gpu: 1, distributed training: False, automatic mixed precision training: None
11/09/2020 14:36:35 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
11/09/2020 14:36:35 - INFO - farm.infer -    0    0    0    0    0    0    0 
11/09/2020 14:36:35 - INFO - farm.infer -   /w\  /w\  /w\  /w\  /w\  /w\  /w\
11/09/2020 14:36:35 - INFO - farm.infer -   /'\  / \  /'\  /'\  / \  / \  /'\
11/09/2020 14:36:35 - INFO - farm.infer -               


### Finder

The Finder sticks together reader and retriever in a pipeline to answer our actual questions. 

In [7]:
finder = Finder(reader, retriever)

## Voilà! Ask a question!

In [None]:
# You can configure how many candidates the reader and retriever shall return
# The higher top_k_retriever, the better (but also the slower) your answers. 
prediction = finder.get_answers(question="Does ahrq offer minority supplements for grants?", top_k_retriever=10, top_k_reader=5)

11/09/2020 14:37:46 - INFO - elasticsearch -   POST http://localhost:9200/ahrq/_search [status:200 request:0.781s]
11/09/2020 14:37:46 - INFO - haystack.retriever.sparse -   Got 10 candidates from retriever
11/09/2020 14:37:46 - INFO - haystack.finder -   Reader is looking for detailed answer in 12711 chars ...


In [None]:
print_answers(prediction, details="medium")

In [None]:
import subprocess as sp
import os

def get_gpu_memory():
  _output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]

  ACCEPTABLE_AVAILABLE_MEMORY = 1024
  COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv"
  memory_free_info = _output_to_list(sp.check_output(COMMAND.split()))[1:]
  memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
  print(memory_free_values)
  return memory_free_values

get_gpu_memory()