In [None]:
# https://programmerbackpack.com/bert-nlp-using-distilbert-to-build-a-question-answering-system/

In [1]:
from question_processor import QuestionProcessor
from text_extractor import TextExtractor
from text_extractor_pipe import TextExtractorPipe
from context_retriever import ContextRetriever
from answer_retriever import AnswerRetriever
import spacy

In [2]:
# One extractor per (title, identifier) pair; each is extracted right after creation.
# NOTE(review): the second argument looks like a Wikidata entity ID — confirm against TextExtractor.
textExtractor1 = TextExtractor("London", "Q84")
textExtractor1.extract()
textExtractor2 = TextExtractor("Berlin", "Q64")
textExtractor2.extract()
textExtractor3 = TextExtractor("Bucharest", "Q19660")
textExtractor3.extract()

# Register the extractors, in order, on a single pipe whose combined output is parsed below.
textExtractorPipe = TextExtractorPipe()
for textExtractor in (textExtractor1, textExtractor2, textExtractor3):
    textExtractorPipe.addTextExtractor(textExtractor)

nlp = spacy.load('en_core_web_sm')

nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(textExtractorPipe.extract())
# Span.text is used instead of the deprecated Span.string; after strip() the
# resulting sentence text is the same.
sentences = [sent.text.strip() for sent in doc.sents]
questionProcessor = QuestionProcessor(nlp)
contextRetriever = ContextRetriever(nlp, 10)
answerRetriever = AnswerRetriever()

originalQuestion = "What is the capital city of Romania?"
# Process the question once and reuse the result for both retrieval and the printout.
processedQuestion = questionProcessor.process(originalQuestion)
questionContext = contextRetriever.getContext(sentences, processedQuestion)
print("originalQuestion-----------",originalQuestion)
print("questionContext*****************",questionContext)
print(processedQuestion)
answer = answerRetriever.getAnswer(originalQuestion, questionContext)
print(answer)

originalQuestion----------- What is the capital city of Romania?
questionContext***************** None
capital city Romania
encodings_____________________ {'input_ids': [101, 2054, 2003, 1996, 3007, 2103, 1997, 6339, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
input_ids______________________ [101, 2054, 2003, 1996, 3007, 2103, 1997, 6339, 1029, 102]
scoresStart--------- start_logits
scoresEnd------- end_logits
torch.tensor([inputIds]---------------------------- tensor([[ 101, 2054, 2003, 1996, 3007, 2103, 1997, 6339, 1029,  102]])
torch.tensor([attentionMask])----------------------- tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


TypeError: argmax(): argument 'input' (position 1) must be Tensor, not str

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering


In [None]:
class AnswerRetriever:
    """Extract an answer span for a question from a context passage using DistilBERT.

    The tokenizer (``distilbert-base-uncased``) and the question-answering model
    (``distilbert-base-uncased-distilled-squad``) are loaded lazily on the first
    call that needs them and cached on the class, so repeated ``getAnswer`` calls
    do not reload them.
    """

    TOKENIZER_NAME = 'distilbert-base-uncased'
    MODEL_NAME = 'distilbert-base-uncased-distilled-squad'

    # Class-level cache shared by all instances; filled by _loadModels().
    _tokenizer = None
    _model = None

    @classmethod
    def _loadModels(cls):
        """Return the cached ``(tokenizer, model)`` pair, loading it on first use."""
        if cls._tokenizer is None or cls._model is None:
            cls._tokenizer = DistilBertTokenizer.from_pretrained(
                cls.TOKENIZER_NAME, return_token_type_ids=True)
            cls._model = DistilBertForQuestionAnswering.from_pretrained(cls.MODEL_NAME)
        return cls._tokenizer, cls._model

    def getAnswer(self, question, questionContext):
        """Return the span of ``questionContext`` the model selects as the answer.

        Args:
            question: The question text.
            questionContext: The passage to search for the answer. May be ``None``
                or blank when no context was retrieved.

        Returns:
            The decoded answer string. An empty string is returned when there is no
            usable context, or when the predicted end position precedes the
            predicted start position (the token slice is then empty).
        """
        # Without a context there is nothing to extract an answer from; bail out
        # before loading any model.
        if not questionContext or not questionContext.strip():
            return ""

        distilBertTokenizer, distilBertForQuestionAnswering = self._loadModels()

        # Truncate only the context (second sequence) so an over-long passage does
        # not exceed the model's maximum input length.
        encodings = distilBertTokenizer.encode_plus(
            question, questionContext, truncation="only_second")
        inputIds, attentionMask = encodings["input_ids"], encodings["attention_mask"]

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = distilBertForQuestionAnswering(
                torch.tensor([inputIds]), attention_mask=torch.tensor([attentionMask]))

        # Index positionally instead of tuple-unpacking: unpacking a dict-like model
        # output yields its key strings ('start_logits', 'end_logits'), which is what
        # made torch.argmax fail with "must be Tensor, not str". Integer indexing
        # works for both plain tuples and ModelOutput objects.
        scoresStart, scoresEnd = outputs[0], outputs[1]

        startIndex = int(torch.argmax(scoresStart))
        endIndex = int(torch.argmax(scoresEnd))

        tokens = inputIds[startIndex: endIndex + 1]
        answerTokens = distilBertTokenizer.convert_ids_to_tokens(tokens, skip_special_tokens=True)
        return distilBertTokenizer.convert_tokens_to_string(answerTokens)