In [None]:
!pip3 install en_core_web_sm

In [None]:
!python3 -m spacy download en_core_web_sm


In [None]:
pip install wikipedia


In [None]:
pip install gensim==3.8.3

In [None]:
pip install torch==1.4.0

In [None]:
pip install transformers==2.8.0

In [None]:
pip install spacy

In [None]:
import spacy
import wikipedia
import os
from gensim.summarization.bm25 import BM25
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering


**Download relevant data for the model**


In [None]:


# (article title, id) pairs; the title names the cached file and both values
# are handed to wikipedia.page() below.
wikidata_list=[("visakhapatnam", "Q200016"), ("Berlin", "Q64"), ("Bucharest", "Q19660"), ("Vijayawada", "Q200017"), ("Srinagar", "Q170115")]

# open(..., "w") does not create missing parent directories, so make sure the
# cache directory exists before the first write.
os.makedirs("./text", exist_ok=True)

for i, j in wikidata_list:
  fileName = "./text/" + i + ".txt"
  # Only fetch articles that are not already cached on disk.
  if not os.path.isfile(fileName):
      page = wikipedia.page(title=i, pageid=j)
      # The context manager closes the handle even if the write raises.
      with open(fileName, "w") as f:
          f.write(page.content)



In [None]:
# Concatenate every cached article into one string for sentence splitting.
all_text=''
for i, j in wikidata_list:
  # Context manager so each file handle is closed as soon as it has been read.
  with open("./text/" + i + ".txt", "r") as f:
    all_text+=f.read()

In [None]:
#question processor
def qp(text, nlp):
  """Reduce a question to its content words.

  Runs `text` through the `nlp` callable and keeps only the tokens whose
  coarse part-of-speech tag (`pos_`) is NOUN, PROPN or ADJ.

  Args:
    text: The question string.
    nlp: Callable returning an iterable of tokens that expose `.text`
      and `.pos_`.

  Returns:
    The kept token texts, in their original order, joined by single spaces.
  """
  keep_tags = ("NOUN", "PROPN", "ADJ")
  kept_words = []
  for tok in nlp(text):
    if tok.pos_ in keep_tags:
      kept_words.append(tok.text)
  return ' '.join(kept_words)

In [None]:
#context retriever
def cr(num, question, sentences, nlp):
  """Build a context string from the sentences that best match a question.

  Every sentence and the question are lemmatised with `nlp`, the sentences
  are scored against the question with BM25, and the highest-scoring ones
  are joined into a single string.

  Args:
    num: Maximum number of sentences to include in the context.
    question: Query text.
    sentences: Iterable of candidate sentence strings.
    nlp: Callable returning an iterable of tokens that expose `.lemma_`.

  Returns:
    The lemmas of the selected sentences, best match first, separated by
    single spaces. Note this is lemmatised text, not the original wording.
    Returns "" when `sentences` is empty.
  """
  docs = [[token.lemma_ for token in nlp(sent)] for sent in sentences]
  # BM25 divides by the corpus size while computing the average document
  # length, so an empty corpus has to be handled before constructing it.
  if not docs:
    return ""

  bm25 = BM25(docs)
  query = [token.lemma_ for token in nlp(question)]
  scores = bm25.get_scores(query)

  # Indices of the documents ordered by descending score; the sort is stable,
  # so ties keep their original document order.
  ranked = sorted(range(len(scores)), key=lambda idx: scores[idx], reverse=True)

  # Join sentences with a space so the last token of one sentence is not
  # fused with the first token of the next.
  return " ".join(" ".join(docs[idx]) for idx in ranked[:num])

In [None]:
#answer retriever

def QA(question, qc, tk='distilbert-base-uncased', qaModel='distilbert-base-uncased-distilled-squad', maxLength=512):
  """Extract the answer to `question` from the context string `qc`.

  Loads the tokenizer and question-answering model by name, encodes the
  (question, context) pair, and returns the text between the most likely
  start token and the most likely end token.

  Args:
    question: The question text.
    qc: The context text to search for the answer.
    tk: Name or path of the pretrained tokenizer.
    qaModel: Name or path of the pretrained question-answering model.
    maxLength: Upper bound on the encoded sequence length, special tokens
      included. Longer inputs are truncated. Defaults to 512, the
      position-embedding limit of the default DistilBERT checkpoints.

  Returns:
    The decoded answer string. It is "" when the predicted end position
    comes before the predicted start position.
  """

  BertTokenizer = DistilBertTokenizer.from_pretrained(tk, return_token_type_ids=True)
  BertForQA = DistilBertForQuestionAnswering.from_pretrained(qaModel)

  # Cap the input length: a context made of many sentences can otherwise
  # exceed the number of positions the model supports and fail in the forward pass.
  encodings = BertTokenizer.encode_plus(question, qc, max_length=maxLength)

  inputIds, attentionMask = encodings["input_ids"], encodings["attention_mask"]

  # Inference only, so skip building the autograd graph.
  with torch.no_grad():
    scoresStart, scoresEnd = BertForQA(torch.tensor([inputIds]), attention_mask=torch.tensor([attentionMask]))

  startIndex = int(torch.argmax(scoresStart))
  endIndex = int(torch.argmax(scoresEnd))

  # The end index is inclusive, hence the +1 on the slice bound.
  tokens = inputIds[startIndex: endIndex + 1]
  answerTokens = BertTokenizer.convert_ids_to_tokens(tokens, skip_special_tokens=True)
  return BertTokenizer.convert_tokens_to_string(answerTokens)



In [None]:

nlp = spacy.load('en_core_web_sm')

# spaCy 3 expects the registered component name in add_pipe and rejects a
# component instance; spaCy 2 needs the instance from create_pipe. The spaCy
# install is not version-pinned, so support both.
if int(spacy.__version__.split('.')[0]) >= 3:
  nlp.add_pipe('sentencizer')
else:
  nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(all_text)
# Span.text exists in spaCy 2 and 3 (Span.string was removed in 3); after
# strip() both give the same sentence text.
sentences = [sent.text.strip() for sent in doc.sents]


In [None]:
# End-to-end run for one query: reduce the question to its content words,
# retrieve up to 10 best-matching sentences as context, then extract the answer.
originalQuestion = "most spoken language"
question = qp(originalQuestion, nlp)
questionContext = cr(10, question=question, sentences=sentences, nlp=nlp)
print(f"ASKED QUESTION : {originalQuestion}")
print(f"PROCESSING QUESTION : {question}")
answer = QA(originalQuestion, qc=questionContext)
print(f"ANSWER : {answer}")

In [None]:
# Same pipeline as a single-sentence lookup: only the top-ranked sentence is
# passed to the answer extractor.
originalQuestion = "what is the Capital of Romania?"
question = qp(originalQuestion, nlp)
questionContext = cr(1, question=question, sentences=sentences, nlp=nlp)
print(f"ASKED QUESTION : {originalQuestion}")
print(f"PROCESSING QUESTION : {question}")
answer = QA(originalQuestion, qc=questionContext)
print(f"ANSWER : {answer}")