In [None]:
!pip install farm-haystack --quiet

In [2]:
from haystack import Finder
from haystack.reader.farm import FARMReader

In [3]:
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.6.2

import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)
                  )
# wait until ElasticSearch has started
! sleep 30

In [4]:
from haystack.database.elasticsearch import ElasticsearchDocumentStore
# Handle to the local Elasticsearch node; empty credentials are passed and all
# documents of this notebook go into the "document" index.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

07/17/2020 11:59:15 - INFO - elasticsearch -   PUT http://localhost:9200/document [status:200 request:0.549s]


In [5]:
# Directory that contains arxivData.json. The default is the original Google
# Drive location; set the ARXIV_DATA_DIR environment variable to run the
# notebook against a copy stored elsewhere without editing this cell.
# NOTE(review): the default presumes Google Drive is already mounted under
# /content/drive - no cell in this notebook mounts it.
data_dir = os.environ.get("ARXIV_DATA_DIR", "/content/drive/My Drive/arXiv")

In [6]:
import json
import ast

def read_json_data(path):
  """Load and return the parsed contents of the JSON file at `path`.

  Args:
    path: Filesystem path of the JSON file to read.

  Returns:
    The decoded JSON value, exactly as produced by `json.load`.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file content is not valid JSON.
  """
  # Decode explicitly as UTF-8 instead of relying on the locale default, so
  # non-ASCII titles and abstracts load the same way on every machine.
  # The `with` block closes the file on exit; no manual close() is needed.
  with open(path, encoding="utf-8") as f:
    return json.load(f)

def create_data_dicts(json_data):
  """Turn raw arXiv records into the dicts that are written to the document store.

  Every record must provide the keys 'title', 'summary' and 'link'. The 'link'
  value is a string containing a Python literal; it is parsed with
  `ast.literal_eval`, and the 'href' of its second element becomes the 'url'.

  Args:
    json_data: Iterable of record dicts.

  Returns:
    A list with one {'name', 'text', 'url'} dict per input record, in input order.
  """
  return [
      {
          'name': record['title'],
          'text': record['summary'],
          # Index 1 of the parsed link sequence is the entry whose href is kept.
          'url': ast.literal_eval(record['link'])[1]['href'],
      }
      for record in json_data
  ]

In [7]:
# Read the raw arXiv dump and reshape it into the dicts that will be indexed.
arxiv_json_path = os.path.join(data_dir, "arxivData.json")
jsonData = read_json_data(arxiv_json_path)
esData = create_data_dicts(jsonData)

# Preview the first three converted entries.
esData[:3]

[{'name': 'Dual Recurrent Attention Units for Visual Question Answering',
  'text': 'We propose an architecture for VQA which utilizes recurrent layers to\ngenerate visual and textual attention. The memory characteristic of the\nproposed recurrent attention units offers a rich joint embedding of visual and\ntextual features and enables the model to reason relations between several\nparts of the image and question. Our single model outperforms the first place\nwinner on the VQA 1.0 dataset, performs within margin to the current\nstate-of-the-art ensemble model. We also experiment with replacing attention\nmechanisms in other state-of-the-art models with our implementation and show\nincreased accuracy. In both cases, our recurrent attention mechanism improves\nperformance in tasks requiring sequential or relational reasoning on the VQA\ndataset.',
  'url': 'http://arxiv.org/pdf/1802.00209v1'},
 {'name': 'Sequential Short-Text Classification with Recurrent and Convolutional\n  Neural Netw

In [None]:
# Index the prepared paper dicts into the Elasticsearch-backed document store.
# NOTE(review): re-running this cell may index the same papers a second time -
# confirm whether write_documents de-duplicates before executing it again.
document_store.write_documents(esData)

In [9]:
from haystack.retriever.sparse import ElasticsearchRetriever
# Retriever bound to the Elasticsearch document store; it supplies the
# candidate documents for a question before the reader looks for answers.
retriever = ElasticsearchRetriever(document_store=document_store)

In [None]:
# Reader that searches the retrieved text for answers, built from the
# "deepset/roberta-base-squad2" model.
# NOTE(review): use_gpu=True presumes a GPU runtime - confirm the behavior on
# CPU-only machines.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

In [11]:
# Combine reader and retriever into the single object that is queried for answers.
finder = Finder(reader, retriever)

In [17]:
# Ask one question: 15 candidates are requested from the retriever and the
# 3 best answers from the reader.
prediction = finder.get_answers(
    question="What are the ways to perform Named Entity Recognition?",
    top_k_retriever=15,
    top_k_reader=3,
)

07/17/2020 12:02:08 - INFO - elasticsearch -   POST http://localhost:9200/document/_search [status:200 request:0.029s]
07/17/2020 12:02:08 - INFO - haystack.retriever.sparse -   Got 15 candidates from retriever
07/17/2020 12:02:08 - INFO - haystack.finder -   Reader is looking for detailed answer in 13905 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.35 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 30.46 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 25.29 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 34.71 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 27.57 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 26.38 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  8.40 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 23.04 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 26.52 Batches/s]
Inferencing Samples: 100%|

In [13]:
def extract_info_from_predictions(answers):
  """Reduce a finder result to the fields worth displaying.

  Args:
    answers: Mapping with an 'answers' entry, an iterable of prediction dicts
      that each provide 'answer', 'score', 'context' and 'meta'.

  Returns:
    A list with one dict per prediction, in the same order, holding 'answer',
    'score', 'context' and 'paper' (the prediction's 'meta' value).
  """
  return [
      {
          'answer': hit['answer'],
          'score': hit['score'],
          'context': hit['context'],
          # The document metadata is exposed under the friendlier key 'paper'.
          'paper': hit['meta'],
      }
      for hit in answers['answers']
  ]

In [18]:
import pprint

# Printer with a 2-space indent for readable nested output.
pp = pprint.PrettyPrinter(indent=2)

# Keep only the display fields of each predicted answer, then show them.
results = extract_info_from_predictions(prediction)
pp.pprint(results)

[ { 'answer': 'rule-based and machine learning approaches',
    'context': 'h particular reference to\n'
               'Assamese. There are various rule-based and machine learning '
               'approaches\n'
               'available for Named Entity Recognition. At the very f',
    'paper': { 'name': 'A Survey of Named Entity Recognition in Assamese and '
                       'other Indian\n'
                       '  Languages',
               'url': 'http://arxiv.org/pdf/1407.2918v1'},
    'score': 14.592899322509766},
  { 'answer': 'Alchemy, Zemanta\nand Rembrandt',
    'context': 'ognition (NER) for content written in Portuguese. These are '
               'Alchemy, Zemanta\n'
               'and Rembrandt. Evaluation of the efficacy of the entity '
               'extraction method',
    'paper': { 'name': 'PAMPO: using pattern matching and pos-tagging for '
                       'effective Named\n'
                       '  Entities recognition in Portuguese',
          