In [None]:
# Install Haystack directly from its GitHub repository.
# NOTE(review): no tag or commit is pinned, so this installs whatever is on the
# default branch at run time — confirm it still matches the import paths used
# in the import cell of this notebook.
! pip install git+https://github.com/deepset-ai/haystack.git
    
# Install Elasticsearch 7.6.2: download the Linux x86_64 tarball quietly (-q),
# unpack it into the working directory, then recursively hand the unpacked tree
# to user and group `daemon`.
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.6.2

In [None]:
# General libraries
import re, os, string, random, requests
import pandas as pd
from subprocess import Popen, PIPE, STDOUT
from tqdm import tqdm

# Haystack importings
from haystack import Finder
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

In [None]:
# Start the Elasticsearch server from the unpacked tarball as a background child process.
# stdout and stderr are merged into one pipe; none of the cells visible here read from it.
# NOTE(review): an unread PIPE can fill up and stall a child that logs heavily — confirm
# this is not a problem for long sessions.
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # switch the child to uid 1 before exec (presumably the `daemon` user — confirm)
                  )

# Wait a fixed 30 seconds for Elasticsearch to come up.
# Nothing here verifies that the server is actually ready afterwards.
! sleep 30

In [None]:
def get_index(n):
    """Build a random identifier consisting of n lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    # One independent draw per position, in order.
    for _ in range(n):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [None]:
def trim_doc(doc, threshold=None):
    """Trim doc to about `threshold` characters without cutting a sentence in half.

    The text is split on '.' and whole sentences are kept, in order, for as
    long as the running character count (measured on the whitespace-stripped
    sentences) is still below the threshold. The sentence that crosses the
    threshold is therefore kept in full, so the result may exceed the limit
    by up to one sentence.

    Args:
        doc: The text to trim.
        threshold: Character budget. Defaults to the module-level
            DOC_THRESHOLD when omitted, which keeps existing one-argument
            calls working unchanged.

    Returns:
        The kept sentences re-joined with '.'.
    """
    if threshold is None:
        threshold = DOC_THRESHOLD

    kept_sentences = []
    char_count = 0
    for sentence in doc.split('.'):
        # The count never decreases, so once the budget is used up no later
        # sentence can be kept: stop instead of scanning the rest of the doc.
        if char_count >= threshold:
            break
        char_count += len(sentence.strip())
        kept_sentences.append(sentence)

    return ".".join(kept_sentences)

In [None]:
def clean_text(text):
    """Normalise a raw text string before it is indexed.

    Steps, in order:
      1. lowercase the text;
      2. drop every character listed in PUNCTUATION;
      3. collapse each run of whitespace (spaces, tabs, newlines) into a
         single space;
      4. trim the result with trim_doc.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned and trimmed string.
    """
    # Lowering text
    text = text.lower()

    # Removing punctuation
    text = "".join(c for c in text if c not in PUNCTUATION)

    # Collapsing whitespace runs. The pattern is a raw string so that `\s` is
    # not parsed as an (invalid) string escape sequence.
    text = re.sub(r'\s+', ' ', text)

    # Trimming doc
    return trim_doc(text)

In [None]:
# Constants
ES_INDEX = get_index(10)  # Elastic Search DB index name (10 random lowercase letters, new on every run)
# Every ASCII punctuation character except the full stop. Taken from
# string.punctuation rather than a hand-typed literal, which needed the
# invalid escape sequence `\]`.
PUNCTUATION = string.punctuation.replace(".", "")
DOC_THRESHOLD = 10000  # character limit for a doc
TOP_K_RETRIEVER = 10  # top k documents to analyze further for a given query
TOP_K_READER = 5  # top k number of answers to return
# Document endpoint of the index on the local Elasticsearch server
# (not referenced by the cells visible here).
BASE_URL = "http://localhost:9200/" + ES_INDEX + "/_doc/"

In [None]:
# Load the papers CSV from the Kaggle input directory and preview the first rows.
# Later cells read the `full_text` and `title` columns of this frame.
data = pd.read_csv("/kaggle/input/nips-papers-1987-2019-updated/papers.csv")
data.head()

In [None]:
# (rows, columns) of the frame as loaded, before any rows are dropped
data.shape

In [None]:
# Keep only the rows that have a value in `full_text`, then show the new (rows, columns)
data = data.dropna(subset=['full_text'])
data.shape

In [None]:
# Structuring data to haystack required format
# Format: [{'text': 'paper_content', 'meta':{'name':'title'}}]
docs = []     # one {'text': ..., 'meta': {'name': ...}} dict per row
corpora = []  # cleaned body text of every row
doc_len = []  # character length of each cleaned body text

# iterrows() is a generator with no length, so tqdm is given the total
# explicitly; otherwise the bar cannot show a percentage or an ETA.
for index, row in tqdm(data.iterrows(), total=len(data)):
    cleaned_text = clean_text(row['full_text'])
    doc_len.append(len(cleaned_text))
    corpora.append(cleaned_text)
    docs.append({
        'text': cleaned_text,
        # The title goes through the same cleaning as the body text.
        'meta': {'name': clean_text(row['title'])},
    })

In [None]:
# Average characters in a document after trimming:
# doc_len holds one cleaned-text length per entry in docs, so this is the mean length.
sum(doc_len)/len(docs)

In [None]:
# Create the Haystack document store for the Elasticsearch server on localhost,
# using the index named ES_INDEX and empty username/password strings.
# Be careful while overwriting data on the same ES index
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index=ES_INDEX)

In [None]:
# Now, let's write the prepared document dicts (text + meta) to our document store.
document_store.write_documents(docs)

In [None]:
# Instantiating the Elasticsearch retriever on top of the document store
retriever = ElasticsearchRetriever(document_store=document_store)

In [None]:
# Initializing the reader with the pre-trained model "ahotrod/albert_xxlargev1_squad2_512"
# (presumably downloaded on the first run — confirm).
# Here we also set the size of the context window for our answers (500) and request GPU use.

reader = FARMReader(model_name_or_path="ahotrod/albert_xxlargev1_squad2_512",use_gpu=True, context_window_size=500)

In [None]:
# Combine the reader and the retriever in a Finder, the object that is queried for answers below
finder = Finder(reader, retriever)

In [None]:
# Ask one question. TOP_K_RETRIEVER is the number of documents to analyze further
# for the query, and TOP_K_READER is the number of answers to return.
question = "What is the use of CNN?"
prediction = finder.get_answers(question=question, top_k_retriever=TOP_K_RETRIEVER, top_k_reader=TOP_K_READER)

In [None]:
# Print the answers with minimal detail.
# `details` selects the verbosity: minimal | medium | all

print_answers(prediction, details="minimal")

In [None]:
# Show the entries under prediction['answers'] as a DataFrame for easier inspection
pd.DataFrame(prediction['answers'])

## Amazing 🔥
The question was: **What is the use of CNN?**

As we all know, a CNN (ConvNet / Convolutional Neural Network) is a model that deals mainly with image data. It looks like our QA system has answered our query very well. Yayyy!! 🥳 🎉🎉🎉
![](https://media.tenor.com/images/5a2d3ba3504d3f48da005d9fe6b52110/tenor.gif)

Don't forget to upvote the notebook if you like my work. Let me know your feedback in the comment section below. 😊

### #StaySafe